xref: /sqlite-3.40.0/src/btree.c (revision a9cfaba9)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements an external (disk-based) database using BTrees.
13 ** See the header comment on "btreeInt.h" for additional information.
14 ** Including a description of file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
18 /*
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
21 */
22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
/*
** Tracing support.  Change the "#if 0" below to "#if 1" to compile in
** the sqlite3BtreeTrace global and a TRACE macro that prints through
** printf() whenever that global is non-zero.  In the default build
** TRACE expands to nothing.
**
** The argument X must carry its own parentheses, for example:
**
**    TRACE(("value=%d\n", v));
**
** The active definition is wrapped in do{...}while(0) so that it behaves
** as a single statement and cannot capture the "else" of a surrounding
** if/else.
*/
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
# define TRACE(X)  do{ if(sqlite3BtreeTrace){printf X;fflush(stdout);} }while(0)
#else
# define TRACE(X)
#endif
34 
/*
** Read the 2-byte big-endian integer at X, except that a stored value
** of zero is reported as 65536.
**
** This is used for the "offset to cell content area" field in a btree
** page header.  With a 65536-byte page that is empty, the true offset
** is 65536, which does not fit in two bytes and is stored as zero.
**
** Adding 0xffff and masking to 16 bits maps 0 to 0xffff and any other
** value v to v-1; the final +1 then yields 65536 and v respectively.
*/
#define get2byteNotZero(X)  ((((int)get2byte(X)+0xffff)&0xffff)+1)
45 
46 #ifndef SQLITE_OMIT_SHARED_CACHE
47 /*
48 ** A list of BtShared objects that are eligible for participation
49 ** in shared cache.  This variable has file scope during normal builds,
50 ** but the test harness needs to access it so we make it global for
51 ** test builds.
52 **
53 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
54 */
55 #ifdef SQLITE_TEST
56 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
57 #else
58 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
59 #endif
60 #endif /* SQLITE_OMIT_SHARED_CACHE */
61 
62 #ifndef SQLITE_OMIT_SHARED_CACHE
63 /*
64 ** Enable or disable the shared pager and schema features.
65 **
66 ** This routine has no effect on existing database connections.
67 ** The shared cache setting effects only future calls to
68 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
69 */
70 int sqlite3_enable_shared_cache(int enable){
71   sqlite3GlobalConfig.sharedCacheEnabled = enable;
72   return SQLITE_OK;
73 }
74 #endif
75 
76 
77 
78 #ifdef SQLITE_OMIT_SHARED_CACHE
/*
** Shared-cache support is compiled out.
**
** querySharedCacheTableLock(), setSharedCacheTableLock(),
** clearAllSharedCacheTableLocks() and
** downgradeAllSharedCacheTableLocks() normally maintain the
** BtShared.pLock linked list of table-level locks.  Without shared
** cache there is only ever one user of each BtShared structure, so no
** locking is needed and all of them become no-ops.  The two assert()
** helpers are replaced by constants: a required lock is always "held"
** and there are never read conflicts.
*/
#define querySharedCacheTableLock(p,iTab,eLock) SQLITE_OK
#define setSharedCacheTableLock(p,iTab,eLock) SQLITE_OK
#define clearAllSharedCacheTableLocks(p)
#define downgradeAllSharedCacheTableLocks(p)
#define hasSharedCacheTableLock(p,iRoot,isIndex,eLock) 1
#define hasReadConflicts(p, iRoot) 0
94 #endif
95 
96 #ifndef SQLITE_OMIT_SHARED_CACHE
97 
98 #ifdef SQLITE_DEBUG
99 /*
100 **** This function is only used as part of an assert() statement. ***
101 **
102 ** Check to see if pBtree holds the required locks to read or write to the
103 ** table with root page iRoot.   Return 1 if it does and 0 if not.
104 **
105 ** For example, when writing to a table with root-page iRoot via
106 ** Btree connection pBtree:
107 **
108 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
109 **
110 ** When writing to an index that resides in a sharable database, the
111 ** caller should have first obtained a lock specifying the root page of
112 ** the corresponding table. This makes things a bit more complicated,
113 ** as this module treats each table as a separate structure. To determine
114 ** the table corresponding to the index being written, this
115 ** function has to search through the database schema.
116 **
117 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
118 ** hold a write-lock on the schema table (root page 1). This is also
119 ** acceptable.
120 */
121 static int hasSharedCacheTableLock(
122   Btree *pBtree,         /* Handle that must hold lock */
123   Pgno iRoot,            /* Root page of b-tree */
124   int isIndex,           /* True if iRoot is the root of an index b-tree */
125   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
126 ){
127   Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
128   Pgno iTab = 0;
129   BtLock *pLock;
130 
131   /* If this database is not shareable, or if the client is reading
132   ** and has the read-uncommitted flag set, then no lock is required.
133   ** Return true immediately.
134   */
135   if( (pBtree->sharable==0)
136    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
137   ){
138     return 1;
139   }
140 
141   /* If the client is reading  or writing an index and the schema is
142   ** not loaded, then it is too difficult to actually check to see if
143   ** the correct locks are held.  So do not bother - just return true.
144   ** This case does not come up very often anyhow.
145   */
146   if( isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0) ){
147     return 1;
148   }
149 
150   /* Figure out the root-page that the lock should be held on. For table
151   ** b-trees, this is just the root page of the b-tree being read or
152   ** written. For index b-trees, it is the root page of the associated
153   ** table.  */
154   if( isIndex ){
155     HashElem *p;
156     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
157       Index *pIdx = (Index *)sqliteHashData(p);
158       if( pIdx->tnum==(int)iRoot ){
159         iTab = pIdx->pTable->tnum;
160       }
161     }
162   }else{
163     iTab = iRoot;
164   }
165 
166   /* Search for the required lock. Either a write-lock on root-page iTab, a
167   ** write-lock on the schema table, or (if the client is reading) a
168   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
169   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
170     if( pLock->pBtree==pBtree
171      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
172      && pLock->eLock>=eLockType
173     ){
174       return 1;
175     }
176   }
177 
178   /* Failed to find the required lock. */
179   return 0;
180 }
181 #endif /* SQLITE_DEBUG */
182 
183 #ifdef SQLITE_DEBUG
184 /*
185 **** This function may be used as part of assert() statements only. ****
186 **
187 ** Return true if it would be illegal for pBtree to write into the
188 ** table or index rooted at iRoot because other shared connections are
189 ** simultaneously reading that same table or index.
190 **
191 ** It is illegal for pBtree to write if some other Btree object that
192 ** shares the same BtShared object is currently reading or writing
193 ** the iRoot table.  Except, if the other Btree object has the
194 ** read-uncommitted flag set, then it is OK for the other object to
195 ** have a read cursor.
196 **
197 ** For example, before writing to any part of the table or index
198 ** rooted at page iRoot, one should call:
199 **
200 **    assert( !hasReadConflicts(pBtree, iRoot) );
201 */
202 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
203   BtCursor *p;
204   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
205     if( p->pgnoRoot==iRoot
206      && p->pBtree!=pBtree
207      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
208     ){
209       return 1;
210     }
211   }
212   return 0;
213 }
214 #endif    /* #ifdef SQLITE_DEBUG */
215 
216 /*
217 ** Query to see if Btree handle p may obtain a lock of type eLock
218 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
219 ** SQLITE_OK if the lock may be obtained (by calling
220 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
221 */
222 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
223   BtShared *pBt = p->pBt;
224   BtLock *pIter;
225 
226   assert( sqlite3BtreeHoldsMutex(p) );
227   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
228   assert( p->db!=0 );
229   assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
230 
231   /* If requesting a write-lock, then the Btree must have an open write
232   ** transaction on this file. And, obviously, for this to be so there
233   ** must be an open write transaction on the file itself.
234   */
235   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
236   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
237 
238   /* This routine is a no-op if the shared-cache is not enabled */
239   if( !p->sharable ){
240     return SQLITE_OK;
241   }
242 
243   /* If some other connection is holding an exclusive lock, the
244   ** requested lock may not be obtained.
245   */
246   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
247     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
248     return SQLITE_LOCKED_SHAREDCACHE;
249   }
250 
251   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
252     /* The condition (pIter->eLock!=eLock) in the following if(...)
253     ** statement is a simplification of:
254     **
255     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
256     **
257     ** since we know that if eLock==WRITE_LOCK, then no other connection
258     ** may hold a WRITE_LOCK on any table in this file (since there can
259     ** only be a single writer).
260     */
261     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
262     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
263     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
264       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
265       if( eLock==WRITE_LOCK ){
266         assert( p==pBt->pWriter );
267         pBt->btsFlags |= BTS_PENDING;
268       }
269       return SQLITE_LOCKED_SHAREDCACHE;
270     }
271   }
272   return SQLITE_OK;
273 }
274 #endif /* !SQLITE_OMIT_SHARED_CACHE */
275 
276 #ifndef SQLITE_OMIT_SHARED_CACHE
277 /*
278 ** Add a lock on the table with root-page iTable to the shared-btree used
279 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
280 ** WRITE_LOCK.
281 **
282 ** This function assumes the following:
283 **
284 **   (a) The specified Btree object p is connected to a sharable
285 **       database (one with the BtShared.sharable flag set), and
286 **
287 **   (b) No other Btree objects hold a lock that conflicts
288 **       with the requested lock (i.e. querySharedCacheTableLock() has
289 **       already been called and returned SQLITE_OK).
290 **
291 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
292 ** is returned if a malloc attempt fails.
293 */
294 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
295   BtShared *pBt = p->pBt;
296   BtLock *pLock = 0;
297   BtLock *pIter;
298 
299   assert( sqlite3BtreeHoldsMutex(p) );
300   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
301   assert( p->db!=0 );
302 
303   /* A connection with the read-uncommitted flag set will never try to
304   ** obtain a read-lock using this function. The only read-lock obtained
305   ** by a connection in read-uncommitted mode is on the sqlite_master
306   ** table, and that lock is obtained in BtreeBeginTrans().  */
307   assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
308 
309   /* This function should only be called on a sharable b-tree after it
310   ** has been determined that no other b-tree holds a conflicting lock.  */
311   assert( p->sharable );
312   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
313 
314   /* First search the list for an existing lock on this table. */
315   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
316     if( pIter->iTable==iTable && pIter->pBtree==p ){
317       pLock = pIter;
318       break;
319     }
320   }
321 
322   /* If the above search did not find a BtLock struct associating Btree p
323   ** with table iTable, allocate one and link it into the list.
324   */
325   if( !pLock ){
326     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
327     if( !pLock ){
328       return SQLITE_NOMEM;
329     }
330     pLock->iTable = iTable;
331     pLock->pBtree = p;
332     pLock->pNext = pBt->pLock;
333     pBt->pLock = pLock;
334   }
335 
336   /* Set the BtLock.eLock variable to the maximum of the current lock
337   ** and the requested lock. This means if a write-lock was already held
338   ** and a read-lock requested, we don't incorrectly downgrade the lock.
339   */
340   assert( WRITE_LOCK>READ_LOCK );
341   if( eLock>pLock->eLock ){
342     pLock->eLock = eLock;
343   }
344 
345   return SQLITE_OK;
346 }
347 #endif /* !SQLITE_OMIT_SHARED_CACHE */
348 
349 #ifndef SQLITE_OMIT_SHARED_CACHE
350 /*
351 ** Release all the table locks (locks obtained via calls to
352 ** the setSharedCacheTableLock() procedure) held by Btree object p.
353 **
354 ** This function assumes that Btree p has an open read or write
355 ** transaction. If it does not, then the BTS_PENDING flag
356 ** may be incorrectly cleared.
357 */
358 static void clearAllSharedCacheTableLocks(Btree *p){
359   BtShared *pBt = p->pBt;
360   BtLock **ppIter = &pBt->pLock;
361 
362   assert( sqlite3BtreeHoldsMutex(p) );
363   assert( p->sharable || 0==*ppIter );
364   assert( p->inTrans>0 );
365 
366   while( *ppIter ){
367     BtLock *pLock = *ppIter;
368     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
369     assert( pLock->pBtree->inTrans>=pLock->eLock );
370     if( pLock->pBtree==p ){
371       *ppIter = pLock->pNext;
372       assert( pLock->iTable!=1 || pLock==&p->lock );
373       if( pLock->iTable!=1 ){
374         sqlite3_free(pLock);
375       }
376     }else{
377       ppIter = &pLock->pNext;
378     }
379   }
380 
381   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
382   if( pBt->pWriter==p ){
383     pBt->pWriter = 0;
384     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
385   }else if( pBt->nTransaction==2 ){
386     /* This function is called when Btree p is concluding its
387     ** transaction. If there currently exists a writer, and p is not
388     ** that writer, then the number of locks held by connections other
389     ** than the writer must be about to drop to zero. In this case
390     ** set the BTS_PENDING flag to 0.
391     **
392     ** If there is not currently a writer, then BTS_PENDING must
393     ** be zero already. So this next line is harmless in that case.
394     */
395     pBt->btsFlags &= ~BTS_PENDING;
396   }
397 }
398 
399 /*
400 ** This function changes all write-locks held by Btree p into read-locks.
401 */
402 static void downgradeAllSharedCacheTableLocks(Btree *p){
403   BtShared *pBt = p->pBt;
404   if( pBt->pWriter==p ){
405     BtLock *pLock;
406     pBt->pWriter = 0;
407     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
408     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
409       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
410       pLock->eLock = READ_LOCK;
411     }
412   }
413 }
414 
415 #endif /* SQLITE_OMIT_SHARED_CACHE */
416 
417 static void releasePage(MemPage *pPage);  /* Forward reference */
418 
/*
***** For use inside assert() only ****
**
** Return the result of sqlite3_mutex_held() for the mutex of the
** BtShared object that cursor p is attached to.
*/
#ifdef SQLITE_DEBUG
static int cursorHoldsMutex(BtCursor *p){
  BtShared *pBt = p->pBt;
  return sqlite3_mutex_held(pBt->mutex);
}
#endif
429 
430 
431 #ifndef SQLITE_OMIT_INCRBLOB
432 /*
433 ** Invalidate the overflow page-list cache for cursor pCur, if any.
434 */
435 static void invalidateOverflowCache(BtCursor *pCur){
436   assert( cursorHoldsMutex(pCur) );
437   sqlite3_free(pCur->aOverflow);
438   pCur->aOverflow = 0;
439 }
440 
441 /*
442 ** Invalidate the overflow page-list cache for all cursors opened
443 ** on the shared btree structure pBt.
444 */
445 static void invalidateAllOverflowCache(BtShared *pBt){
446   BtCursor *p;
447   assert( sqlite3_mutex_held(pBt->mutex) );
448   for(p=pBt->pCursor; p; p=p->pNext){
449     invalidateOverflowCache(p);
450   }
451 }
452 
453 /*
454 ** This function is called before modifying the contents of a table
455 ** to invalidate any incrblob cursors that are open on the
456 ** row or one of the rows being modified.
457 **
458 ** If argument isClearTable is true, then the entire contents of the
459 ** table is about to be deleted. In this case invalidate all incrblob
460 ** cursors open on any row within the table with root-page pgnoRoot.
461 **
462 ** Otherwise, if argument isClearTable is false, then the row with
463 ** rowid iRow is being replaced or deleted. In this case invalidate
464 ** only those incrblob cursors open on that specific row.
465 */
466 static void invalidateIncrblobCursors(
467   Btree *pBtree,          /* The database file to check */
468   i64 iRow,               /* The rowid that might be changing */
469   int isClearTable        /* True if all rows are being deleted */
470 ){
471   BtCursor *p;
472   BtShared *pBt = pBtree->pBt;
473   assert( sqlite3BtreeHoldsMutex(pBtree) );
474   for(p=pBt->pCursor; p; p=p->pNext){
475     if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){
476       p->eState = CURSOR_INVALID;
477     }
478   }
479 }
480 
481 #else
/* INCRBLOB support is omitted: the three helpers expand to nothing. */
#define invalidateOverflowCache(pCur)
#define invalidateAllOverflowCache(pBt)
#define invalidateIncrblobCursors(pBtree,iRow,isClearTable)
486 #endif /* SQLITE_OMIT_INCRBLOB */
487 
488 /*
489 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
490 ** when a page that previously contained data becomes a free-list leaf
491 ** page.
492 **
493 ** The BtShared.pHasContent bitvec exists to work around an obscure
494 ** bug caused by the interaction of two useful IO optimizations surrounding
495 ** free-list leaf pages:
496 **
497 **   1) When all data is deleted from a page and the page becomes
498 **      a free-list leaf page, the page is not written to the database
499 **      (as free-list leaf pages contain no meaningful data). Sometimes
500 **      such a page is not even journalled (as it will not be modified,
501 **      why bother journalling it?).
502 **
503 **   2) When a free-list leaf page is reused, its content is not read
504 **      from the database or written to the journal file (why should it
505 **      be, if it is not at all meaningful?).
506 **
507 ** By themselves, these optimizations work fine and provide a handy
508 ** performance boost to bulk delete or insert operations. However, if
509 ** a page is moved to the free-list and then reused within the same
510 ** transaction, a problem comes up. If the page is not journalled when
511 ** it is moved to the free-list and it is also not journalled when it
512 ** is extracted from the free-list and reused, then the original data
513 ** may be lost. In the event of a rollback, it may not be possible
514 ** to restore the database to its original configuration.
515 **
516 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
517 ** moved to become a free-list leaf page, the corresponding bit is
518 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
519 ** optimization 2 above is omitted if the corresponding bit is already
520 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
521 ** at the end of every transaction.
522 */
523 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
524   int rc = SQLITE_OK;
525   if( !pBt->pHasContent ){
526     assert( pgno<=pBt->nPage );
527     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
528     if( !pBt->pHasContent ){
529       rc = SQLITE_NOMEM;
530     }
531   }
532   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
533     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
534   }
535   return rc;
536 }
537 
538 /*
539 ** Query the BtShared.pHasContent vector.
540 **
541 ** This function is called when a free-list leaf page is removed from the
542 ** free-list for reuse. It returns false if it is safe to retrieve the
543 ** page from the pager layer with the 'no-content' flag set. True otherwise.
544 */
545 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
546   Bitvec *p = pBt->pHasContent;
547   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
548 }
549 
550 /*
551 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
552 ** invoked at the conclusion of each write-transaction.
553 */
554 static void btreeClearHasContent(BtShared *pBt){
555   sqlite3BitvecDestroy(pBt->pHasContent);
556   pBt->pHasContent = 0;
557 }
558 
559 /*
560 ** Save the current cursor position in the variables BtCursor.nKey
561 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
562 **
563 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
564 ** prior to calling this routine.
565 */
566 static int saveCursorPosition(BtCursor *pCur){
567   int rc;
568 
569   assert( CURSOR_VALID==pCur->eState );
570   assert( 0==pCur->pKey );
571   assert( cursorHoldsMutex(pCur) );
572 
573   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
574   assert( rc==SQLITE_OK );  /* KeySize() cannot fail */
575 
576   /* If this is an intKey table, then the above call to BtreeKeySize()
577   ** stores the integer key in pCur->nKey. In this case this value is
578   ** all that is required. Otherwise, if pCur is not open on an intKey
579   ** table, then malloc space for and store the pCur->nKey bytes of key
580   ** data.
581   */
582   if( 0==pCur->apPage[0]->intKey ){
583     void *pKey = sqlite3Malloc( (int)pCur->nKey );
584     if( pKey ){
585       rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
586       if( rc==SQLITE_OK ){
587         pCur->pKey = pKey;
588       }else{
589         sqlite3_free(pKey);
590       }
591     }else{
592       rc = SQLITE_NOMEM;
593     }
594   }
595   assert( !pCur->apPage[0]->intKey || !pCur->pKey );
596 
597   if( rc==SQLITE_OK ){
598     int i;
599     for(i=0; i<=pCur->iPage; i++){
600       releasePage(pCur->apPage[i]);
601       pCur->apPage[i] = 0;
602     }
603     pCur->iPage = -1;
604     pCur->eState = CURSOR_REQUIRESEEK;
605   }
606 
607   invalidateOverflowCache(pCur);
608   return rc;
609 }
610 
611 /*
612 ** Save the positions of all cursors (except pExcept) that are open on
613 ** the table  with root-page iRoot. Usually, this is called just before cursor
614 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
615 */
616 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
617   BtCursor *p;
618   assert( sqlite3_mutex_held(pBt->mutex) );
619   assert( pExcept==0 || pExcept->pBt==pBt );
620   for(p=pBt->pCursor; p; p=p->pNext){
621     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
622         p->eState==CURSOR_VALID ){
623       int rc = saveCursorPosition(p);
624       if( SQLITE_OK!=rc ){
625         return rc;
626       }
627     }
628   }
629   return SQLITE_OK;
630 }
631 
632 /*
633 ** Clear the current cursor position.
634 */
635 void sqlite3BtreeClearCursor(BtCursor *pCur){
636   assert( cursorHoldsMutex(pCur) );
637   sqlite3_free(pCur->pKey);
638   pCur->pKey = 0;
639   pCur->eState = CURSOR_INVALID;
640 }
641 
642 /*
643 ** In this version of BtreeMoveto, pKey is a packed index record
644 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
645 ** record and then call BtreeMovetoUnpacked() to do the work.
646 */
647 static int btreeMoveto(
648   BtCursor *pCur,     /* Cursor open on the btree to be searched */
649   const void *pKey,   /* Packed key if the btree is an index */
650   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
651   int bias,           /* Bias search to the high end */
652   int *pRes           /* Write search results here */
653 ){
654   int rc;                    /* Status code */
655   UnpackedRecord *pIdxKey;   /* Unpacked index key */
656   char aSpace[150];          /* Temp space for pIdxKey - to avoid a malloc */
657   char *pFree = 0;
658 
659   if( pKey ){
660     assert( nKey==(i64)(int)nKey );
661     pIdxKey = sqlite3VdbeAllocUnpackedRecord(
662         pCur->pKeyInfo, aSpace, sizeof(aSpace), &pFree
663     );
664     if( pIdxKey==0 ) return SQLITE_NOMEM;
665     sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
666   }else{
667     pIdxKey = 0;
668   }
669   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
670   if( pFree ){
671     sqlite3DbFree(pCur->pKeyInfo->db, pFree);
672   }
673   return rc;
674 }
675 
676 /*
677 ** Restore the cursor to the position it was in (or as close to as possible)
678 ** when saveCursorPosition() was called. Note that this call deletes the
679 ** saved position info stored by saveCursorPosition(), so there can be
680 ** at most one effective restoreCursorPosition() call after each
681 ** saveCursorPosition().
682 */
683 static int btreeRestoreCursorPosition(BtCursor *pCur){
684   int rc;
685   assert( cursorHoldsMutex(pCur) );
686   assert( pCur->eState>=CURSOR_REQUIRESEEK );
687   if( pCur->eState==CURSOR_FAULT ){
688     return pCur->skipNext;
689   }
690   pCur->eState = CURSOR_INVALID;
691   rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext);
692   if( rc==SQLITE_OK ){
693     sqlite3_free(pCur->pKey);
694     pCur->pKey = 0;
695     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
696   }
697   return rc;
698 }
699 
/*
** Restore cursor p to its saved position if, and only if, its state is
** CURSOR_REQUIRESEEK or greater.  Evaluates to the result of
** btreeRestoreCursorPosition() in that case and to SQLITE_OK otherwise.
**
** The parameter is parenthesized so that any pointer expression (for
** example "aCsr+1") expands correctly.  Note that p is evaluated twice
** when a restore is needed, so it must not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
704 
705 /*
706 ** Determine whether or not a cursor has moved from the position it
707 ** was last placed at.  Cursors can move when the row they are pointing
708 ** at is deleted out from under them.
709 **
710 ** This routine returns an error code if something goes wrong.  The
711 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
712 */
713 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
714   int rc;
715 
716   rc = restoreCursorPosition(pCur);
717   if( rc ){
718     *pHasMoved = 1;
719     return rc;
720   }
721   if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){
722     *pHasMoved = 1;
723   }else{
724     *pHasMoved = 0;
725   }
726   return SQLITE_OK;
727 }
728 
729 #ifndef SQLITE_OMIT_AUTOVACUUM
730 /*
731 ** Given a page number of a regular database page, return the page
732 ** number for the pointer-map page that contains the entry for the
733 ** input page number.
734 **
735 ** Return 0 (not a valid page) for pgno==1 since there is
736 ** no pointer map associated with page 1.  The integrity_check logic
737 ** requires that ptrmapPageno(*,1)!=1.
738 */
739 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
740   int nPagesPerMapPage;
741   Pgno iPtrMap, ret;
742   assert( sqlite3_mutex_held(pBt->mutex) );
743   if( pgno<2 ) return 0;
744   nPagesPerMapPage = (pBt->usableSize/5)+1;
745   iPtrMap = (pgno-2)/nPagesPerMapPage;
746   ret = (iPtrMap*nPagesPerMapPage) + 2;
747   if( ret==PENDING_BYTE_PAGE(pBt) ){
748     ret++;
749   }
750   return ret;
751 }
752 
753 /*
754 ** Write an entry into the pointer map.
755 **
756 ** This routine updates the pointer map entry for page number 'key'
757 ** so that it maps to type 'eType' and parent page number 'pgno'.
758 **
759 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
760 ** a no-op.  If an error occurs, the appropriate error code is written
761 ** into *pRC.
762 */
763 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
764   DbPage *pDbPage;  /* The pointer map page */
765   u8 *pPtrmap;      /* The pointer map data */
766   Pgno iPtrmap;     /* The pointer map page number */
767   int offset;       /* Offset in pointer map page */
768   int rc;           /* Return code from subfunctions */
769 
770   if( *pRC ) return;
771 
772   assert( sqlite3_mutex_held(pBt->mutex) );
773   /* The master-journal page number must never be used as a pointer map page */
774   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
775 
776   assert( pBt->autoVacuum );
777   if( key==0 ){
778     *pRC = SQLITE_CORRUPT_BKPT;
779     return;
780   }
781   iPtrmap = PTRMAP_PAGENO(pBt, key);
782   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
783   if( rc!=SQLITE_OK ){
784     *pRC = rc;
785     return;
786   }
787   offset = PTRMAP_PTROFFSET(iPtrmap, key);
788   if( offset<0 ){
789     *pRC = SQLITE_CORRUPT_BKPT;
790     goto ptrmap_exit;
791   }
792   assert( offset <= (int)pBt->usableSize-5 );
793   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
794 
795   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
796     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
797     *pRC= rc = sqlite3PagerWrite(pDbPage);
798     if( rc==SQLITE_OK ){
799       pPtrmap[offset] = eType;
800       put4byte(&pPtrmap[offset+1], parent);
801     }
802   }
803 
804 ptrmap_exit:
805   sqlite3PagerUnref(pDbPage);
806 }
807 
808 /*
809 ** Read an entry from the pointer map.
810 **
811 ** This routine retrieves the pointer map entry for page 'key', writing
812 ** the type and parent page number to *pEType and *pPgno respectively.
813 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
814 */
815 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
816   DbPage *pDbPage;   /* The pointer map page */
817   int iPtrmap;       /* Pointer map page index */
818   u8 *pPtrmap;       /* Pointer map page data */
819   int offset;        /* Offset of entry in pointer map */
820   int rc;
821 
822   assert( sqlite3_mutex_held(pBt->mutex) );
823 
824   iPtrmap = PTRMAP_PAGENO(pBt, key);
825   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
826   if( rc!=0 ){
827     return rc;
828   }
829   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
830 
831   offset = PTRMAP_PTROFFSET(iPtrmap, key);
832   if( offset<0 ){
833     sqlite3PagerUnref(pDbPage);
834     return SQLITE_CORRUPT_BKPT;
835   }
836   assert( offset <= (int)pBt->usableSize-5 );
837   assert( pEType!=0 );
838   *pEType = pPtrmap[offset];
839   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
840 
841   sqlite3PagerUnref(pDbPage);
842   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
843   return SQLITE_OK;
844 }
845 
846 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
/* Autovacuum is omitted: pointer-map writes expand to nothing and
** pointer-map reads always evaluate to SQLITE_OK. */
#define ptrmapPut(pBt,key,eType,parent,pRC)
#define ptrmapGet(pBt,key,pEType,pPgno) SQLITE_OK
#define ptrmapPutOvflPtr(pPage, pCell, pRC)
850 #endif
851 
/*
** findCell(P,I): given a btree page P and a cell index I (0 means the
** first cell on the page, 1 means the second cell, and so forth),
** return a pointer to the cell content.  The 2-byte entry for cell I is
** read from P->aCellIdx, masked with P->maskPage, and used as an offset
** from P->aData.
**
** findCellv2(D,M,O,I): the same computation from separate pieces: D is
** the page data pointer, M the offset mask, O the byte offset of the
** cell index array within D, and I the cell index.
**
** These macros work only for pages that do not contain overflow cells.
**
** Every parameter is parenthesized in the expansion so that arguments
** which are expressions (for example a mask written as "a|b") are not
** re-associated by the surrounding operators.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byte(&(P)->aCellIdx[2*(I)])))
#define findCellv2(D,M,O,I) ((D)+((M)&get2byte((D)+((O)+2*(I)))))
862 
863 
864 /*
865 ** This a more complex version of findCell() that works for
866 ** pages that do contain overflow cells.
867 */
868 static u8 *findOverflowCell(MemPage *pPage, int iCell){
869   int i;
870   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
871   for(i=pPage->nOverflow-1; i>=0; i--){
872     int k;
873     k = pPage->aiOvfl[i];
874     if( k<=iCell ){
875       if( k==iCell ){
876         return pPage->apOvfl[i];
877       }
878       iCell--;
879     }
880   }
881   return findCell(pPage, iCell);
882 }
883 
884 /*
885 ** Parse a cell content block and fill in the CellInfo structure.  There
886 ** are two versions of this function.  btreeParseCell() takes a
887 ** cell index as the second argument and btreeParseCellPtr()
888 ** takes a pointer to the body of the cell as its second argument.
889 **
890 ** Within this file, the parseCell() macro can be called instead of
891 ** btreeParseCellPtr(). Using some compilers, this will be faster.
892 */
893 static void btreeParseCellPtr(
894   MemPage *pPage,         /* Page containing the cell */
895   u8 *pCell,              /* Pointer to the cell text. */
896   CellInfo *pInfo         /* Fill in this structure */
897 ){
898   u16 n;                  /* Number bytes in cell content header */
899   u32 nPayload;           /* Number of bytes of cell payload */
900 
901   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
902 
903   pInfo->pCell = pCell;
904   assert( pPage->leaf==0 || pPage->leaf==1 );
905   n = pPage->childPtrSize;
906   assert( n==4-4*pPage->leaf );
907   if( pPage->intKey ){
908     if( pPage->hasData ){
909       n += getVarint32(&pCell[n], nPayload);
910     }else{
911       nPayload = 0;
912     }
913     n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
914     pInfo->nData = nPayload;
915   }else{
916     pInfo->nData = 0;
917     n += getVarint32(&pCell[n], nPayload);
918     pInfo->nKey = nPayload;
919   }
920   pInfo->nPayload = nPayload;
921   pInfo->nHeader = n;
922   testcase( nPayload==pPage->maxLocal );
923   testcase( nPayload==pPage->maxLocal+1 );
924   if( likely(nPayload<=pPage->maxLocal) ){
925     /* This is the (easy) common case where the entire payload fits
926     ** on the local page.  No overflow is required.
927     */
928     if( (pInfo->nSize = (u16)(n+nPayload))<4 ) pInfo->nSize = 4;
929     pInfo->nLocal = (u16)nPayload;
930     pInfo->iOverflow = 0;
931   }else{
932     /* If the payload will not fit completely on the local page, we have
933     ** to decide how much to store locally and how much to spill onto
934     ** overflow pages.  The strategy is to minimize the amount of unused
935     ** space on overflow pages while keeping the amount of local storage
936     ** in between minLocal and maxLocal.
937     **
938     ** Warning:  changing the way overflow payload is distributed in any
939     ** way will result in an incompatible file format.
940     */
941     int minLocal;  /* Minimum amount of payload held locally */
942     int maxLocal;  /* Maximum amount of payload held locally */
943     int surplus;   /* Overflow payload available for local storage */
944 
945     minLocal = pPage->minLocal;
946     maxLocal = pPage->maxLocal;
947     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
948     testcase( surplus==maxLocal );
949     testcase( surplus==maxLocal+1 );
950     if( surplus <= maxLocal ){
951       pInfo->nLocal = (u16)surplus;
952     }else{
953       pInfo->nLocal = (u16)minLocal;
954     }
955     pInfo->iOverflow = (u16)(pInfo->nLocal + n);
956     pInfo->nSize = pInfo->iOverflow + 4;
957   }
958 }
959 #define parseCell(pPage, iCell, pInfo) \
960   btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
961 static void btreeParseCell(
962   MemPage *pPage,         /* Page containing the cell */
963   int iCell,              /* The cell index.  First cell is 0 */
964   CellInfo *pInfo         /* Fill in this structure */
965 ){
966   parseCell(pPage, iCell, pInfo);
967 }
968 
969 /*
970 ** Compute the total number of bytes that a Cell needs in the cell
971 ** data area of the btree-page.  The return number includes the cell
972 ** data header and the local payload, but not any overflow page or
973 ** the space used by the cell pointer.
974 */
975 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
976   u8 *pIter = &pCell[pPage->childPtrSize];
977   u32 nSize;
978 
979 #ifdef SQLITE_DEBUG
980   /* The value returned by this function should always be the same as
981   ** the (CellInfo.nSize) value found by doing a full parse of the
982   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
983   ** this function verifies that this invariant is not violated. */
984   CellInfo debuginfo;
985   btreeParseCellPtr(pPage, pCell, &debuginfo);
986 #endif
987 
988   if( pPage->intKey ){
989     u8 *pEnd;
990     if( pPage->hasData ){
991       pIter += getVarint32(pIter, nSize);
992     }else{
993       nSize = 0;
994     }
995 
996     /* pIter now points at the 64-bit integer key value, a variable length
997     ** integer. The following block moves pIter to point at the first byte
998     ** past the end of the key value. */
999     pEnd = &pIter[9];
1000     while( (*pIter++)&0x80 && pIter<pEnd );
1001   }else{
1002     pIter += getVarint32(pIter, nSize);
1003   }
1004 
1005   testcase( nSize==pPage->maxLocal );
1006   testcase( nSize==pPage->maxLocal+1 );
1007   if( nSize>pPage->maxLocal ){
1008     int minLocal = pPage->minLocal;
1009     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1010     testcase( nSize==pPage->maxLocal );
1011     testcase( nSize==pPage->maxLocal+1 );
1012     if( nSize>pPage->maxLocal ){
1013       nSize = minLocal;
1014     }
1015     nSize += 4;
1016   }
1017   nSize += (u32)(pIter - pCell);
1018 
1019   /* The minimum size of any cell is 4 bytes. */
1020   if( nSize<4 ){
1021     nSize = 4;
1022   }
1023 
1024   assert( nSize==debuginfo.nSize );
1025   return (u16)nSize;
1026 }
1027 
#ifdef SQLITE_DEBUG
/* Index-based variant of cellSizePtr().  It is compiled only in
** SQLITE_DEBUG builds and is intended for use inside assert()
** statements. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return cellSizePtr(pPage, pCell);
}
#endif
1035 
1036 #ifndef SQLITE_OMIT_AUTOVACUUM
1037 /*
1038 ** If the cell pCell, part of page pPage contains a pointer
1039 ** to an overflow page, insert an entry into the pointer-map
1040 ** for the overflow page.
1041 */
1042 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1043   CellInfo info;
1044   if( *pRC ) return;
1045   assert( pCell!=0 );
1046   btreeParseCellPtr(pPage, pCell, &info);
1047   assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1048   if( info.iOverflow ){
1049     Pgno ovfl = get4byte(&pCell[info.iOverflow]);
1050     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1051   }
1052 }
1053 #endif
1054 
1055 
1056 /*
1057 ** Defragment the page given.  All Cells are moved to the
1058 ** end of the page and all free space is collected into one
1059 ** big FreeBlk that occurs in between the header and cell
1060 ** pointer array and the cell content area.
1061 */
1062 static int defragmentPage(MemPage *pPage){
1063   int i;                     /* Loop counter */
1064   int pc;                    /* Address of a i-th cell */
1065   int hdr;                   /* Offset to the page header */
1066   int size;                  /* Size of a cell */
1067   int usableSize;            /* Number of usable bytes on a page */
1068   int cellOffset;            /* Offset to the cell pointer array */
1069   int cbrk;                  /* Offset to the cell content area */
1070   int nCell;                 /* Number of cells on the page */
1071   unsigned char *data;       /* The page data */
1072   unsigned char *temp;       /* Temp area for cell content */
1073   int iCellFirst;            /* First allowable cell index */
1074   int iCellLast;             /* Last possible cell index */
1075 
1076 
1077   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1078   assert( pPage->pBt!=0 );
1079   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1080   assert( pPage->nOverflow==0 );
1081   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1082   temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1083   data = pPage->aData;
1084   hdr = pPage->hdrOffset;
1085   cellOffset = pPage->cellOffset;
1086   nCell = pPage->nCell;
1087   assert( nCell==get2byte(&data[hdr+3]) );
1088   usableSize = pPage->pBt->usableSize;
1089   cbrk = get2byte(&data[hdr+5]);
1090   memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
1091   cbrk = usableSize;
1092   iCellFirst = cellOffset + 2*nCell;
1093   iCellLast = usableSize - 4;
1094   for(i=0; i<nCell; i++){
1095     u8 *pAddr;     /* The i-th cell pointer */
1096     pAddr = &data[cellOffset + i*2];
1097     pc = get2byte(pAddr);
1098     testcase( pc==iCellFirst );
1099     testcase( pc==iCellLast );
1100 #if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1101     /* These conditions have already been verified in btreeInitPage()
1102     ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined
1103     */
1104     if( pc<iCellFirst || pc>iCellLast ){
1105       return SQLITE_CORRUPT_BKPT;
1106     }
1107 #endif
1108     assert( pc>=iCellFirst && pc<=iCellLast );
1109     size = cellSizePtr(pPage, &temp[pc]);
1110     cbrk -= size;
1111 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1112     if( cbrk<iCellFirst ){
1113       return SQLITE_CORRUPT_BKPT;
1114     }
1115 #else
1116     if( cbrk<iCellFirst || pc+size>usableSize ){
1117       return SQLITE_CORRUPT_BKPT;
1118     }
1119 #endif
1120     assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
1121     testcase( cbrk+size==usableSize );
1122     testcase( pc+size==usableSize );
1123     memcpy(&data[cbrk], &temp[pc], size);
1124     put2byte(pAddr, cbrk);
1125   }
1126   assert( cbrk>=iCellFirst );
1127   put2byte(&data[hdr+5], cbrk);
1128   data[hdr+1] = 0;
1129   data[hdr+2] = 0;
1130   data[hdr+7] = 0;
1131   memset(&data[iCellFirst], 0, cbrk-iCellFirst);
1132   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1133   if( cbrk-iCellFirst!=pPage->nFree ){
1134     return SQLITE_CORRUPT_BKPT;
1135   }
1136   return SQLITE_OK;
1137 }
1138 
1139 /*
1140 ** Allocate nByte bytes of space from within the B-Tree page passed
1141 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1142 ** of the first byte of allocated space. Return either SQLITE_OK or
1143 ** an error code (usually SQLITE_CORRUPT).
1144 **
1145 ** The caller guarantees that there is sufficient space to make the
1146 ** allocation.  This routine might need to defragment in order to bring
1147 ** all the space together, however.  This routine will avoid using
1148 ** the first two bytes past the cell pointer area since presumably this
1149 ** allocation is being made in order to insert a new cell, so we will
1150 ** also end up needing a new cell pointer.
1151 */
1152 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1153   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
1154   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
1155   int nFrag;                           /* Number of fragmented bytes on pPage */
1156   int top;                             /* First byte of cell content area */
1157   int gap;        /* First byte of gap between cell pointers and cell content */
1158   int rc;         /* Integer return code */
1159   int usableSize; /* Usable size of the page */
1160 
1161   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1162   assert( pPage->pBt );
1163   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1164   assert( nByte>=0 );  /* Minimum cell size is 4 */
1165   assert( pPage->nFree>=nByte );
1166   assert( pPage->nOverflow==0 );
1167   usableSize = pPage->pBt->usableSize;
1168   assert( nByte < usableSize-8 );
1169 
1170   nFrag = data[hdr+7];
1171   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1172   gap = pPage->cellOffset + 2*pPage->nCell;
1173   top = get2byteNotZero(&data[hdr+5]);
1174   if( gap>top ) return SQLITE_CORRUPT_BKPT;
1175   testcase( gap+2==top );
1176   testcase( gap+1==top );
1177   testcase( gap==top );
1178 
1179   if( nFrag>=60 ){
1180     /* Always defragment highly fragmented pages */
1181     rc = defragmentPage(pPage);
1182     if( rc ) return rc;
1183     top = get2byteNotZero(&data[hdr+5]);
1184   }else if( gap+2<=top ){
1185     /* Search the freelist looking for a free slot big enough to satisfy
1186     ** the request. The allocation is made from the first free slot in
1187     ** the list that is large enough to accomadate it.
1188     */
1189     int pc, addr;
1190     for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
1191       int size;            /* Size of the free slot */
1192       if( pc>usableSize-4 || pc<addr+4 ){
1193         return SQLITE_CORRUPT_BKPT;
1194       }
1195       size = get2byte(&data[pc+2]);
1196       if( size>=nByte ){
1197         int x = size - nByte;
1198         testcase( x==4 );
1199         testcase( x==3 );
1200         if( x<4 ){
1201           /* Remove the slot from the free-list. Update the number of
1202           ** fragmented bytes within the page. */
1203           memcpy(&data[addr], &data[pc], 2);
1204           data[hdr+7] = (u8)(nFrag + x);
1205         }else if( size+pc > usableSize ){
1206           return SQLITE_CORRUPT_BKPT;
1207         }else{
1208           /* The slot remains on the free-list. Reduce its size to account
1209           ** for the portion used by the new allocation. */
1210           put2byte(&data[pc+2], x);
1211         }
1212         *pIdx = pc + x;
1213         return SQLITE_OK;
1214       }
1215     }
1216   }
1217 
1218   /* Check to make sure there is enough space in the gap to satisfy
1219   ** the allocation.  If not, defragment.
1220   */
1221   testcase( gap+2+nByte==top );
1222   if( gap+2+nByte>top ){
1223     rc = defragmentPage(pPage);
1224     if( rc ) return rc;
1225     top = get2byteNotZero(&data[hdr+5]);
1226     assert( gap+nByte<=top );
1227   }
1228 
1229 
1230   /* Allocate memory from the gap in between the cell pointer array
1231   ** and the cell content area.  The btreeInitPage() call has already
1232   ** validated the freelist.  Given that the freelist is valid, there
1233   ** is no way that the allocation can extend off the end of the page.
1234   ** The assert() below verifies the previous sentence.
1235   */
1236   top -= nByte;
1237   put2byte(&data[hdr+5], top);
1238   assert( top+nByte <= (int)pPage->pBt->usableSize );
1239   *pIdx = top;
1240   return SQLITE_OK;
1241 }
1242 
1243 /*
1244 ** Return a section of the pPage->aData to the freelist.
1245 ** The first byte of the new free block is pPage->aDisk[start]
1246 ** and the size of the block is "size" bytes.
1247 **
1248 ** Most of the effort here is involved in coalesing adjacent
1249 ** free blocks into a single big free block.
1250 */
1251 static int freeSpace(MemPage *pPage, int start, int size){
1252   int addr, pbegin, hdr;
1253   int iLast;                        /* Largest possible freeblock offset */
1254   unsigned char *data = pPage->aData;
1255 
1256   assert( pPage->pBt!=0 );
1257   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1258   assert( start>=pPage->hdrOffset+6+pPage->childPtrSize );
1259   assert( (start + size) <= (int)pPage->pBt->usableSize );
1260   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1261   assert( size>=0 );   /* Minimum cell size is 4 */
1262 
1263   if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
1264     /* Overwrite deleted information with zeros when the secure_delete
1265     ** option is enabled */
1266     memset(&data[start], 0, size);
1267   }
1268 
1269   /* Add the space back into the linked list of freeblocks.  Note that
1270   ** even though the freeblock list was checked by btreeInitPage(),
1271   ** btreeInitPage() did not detect overlapping cells or
1272   ** freeblocks that overlapped cells.   Nor does it detect when the
1273   ** cell content area exceeds the value in the page header.  If these
1274   ** situations arise, then subsequent insert operations might corrupt
1275   ** the freelist.  So we do need to check for corruption while scanning
1276   ** the freelist.
1277   */
1278   hdr = pPage->hdrOffset;
1279   addr = hdr + 1;
1280   iLast = pPage->pBt->usableSize - 4;
1281   assert( start<=iLast );
1282   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
1283     if( pbegin<addr+4 ){
1284       return SQLITE_CORRUPT_BKPT;
1285     }
1286     addr = pbegin;
1287   }
1288   if( pbegin>iLast ){
1289     return SQLITE_CORRUPT_BKPT;
1290   }
1291   assert( pbegin>addr || pbegin==0 );
1292   put2byte(&data[addr], start);
1293   put2byte(&data[start], pbegin);
1294   put2byte(&data[start+2], size);
1295   pPage->nFree = pPage->nFree + (u16)size;
1296 
1297   /* Coalesce adjacent free blocks */
1298   addr = hdr + 1;
1299   while( (pbegin = get2byte(&data[addr]))>0 ){
1300     int pnext, psize, x;
1301     assert( pbegin>addr );
1302     assert( pbegin <= (int)pPage->pBt->usableSize-4 );
1303     pnext = get2byte(&data[pbegin]);
1304     psize = get2byte(&data[pbegin+2]);
1305     if( pbegin + psize + 3 >= pnext && pnext>0 ){
1306       int frag = pnext - (pbegin+psize);
1307       if( (frag<0) || (frag>(int)data[hdr+7]) ){
1308         return SQLITE_CORRUPT_BKPT;
1309       }
1310       data[hdr+7] -= (u8)frag;
1311       x = get2byte(&data[pnext]);
1312       put2byte(&data[pbegin], x);
1313       x = pnext + get2byte(&data[pnext+2]) - pbegin;
1314       put2byte(&data[pbegin+2], x);
1315     }else{
1316       addr = pbegin;
1317     }
1318   }
1319 
1320   /* If the cell content area begins with a freeblock, remove it. */
1321   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
1322     int top;
1323     pbegin = get2byte(&data[hdr+1]);
1324     memcpy(&data[hdr+1], &data[pbegin], 2);
1325     top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
1326     put2byte(&data[hdr+5], top);
1327   }
1328   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1329   return SQLITE_OK;
1330 }
1331 
1332 /*
1333 ** Decode the flags byte (the first byte of the header) for a page
1334 ** and initialize fields of the MemPage structure accordingly.
1335 **
1336 ** Only the following combinations are supported.  Anything different
1337 ** indicates a corrupt database files:
1338 **
1339 **         PTF_ZERODATA
1340 **         PTF_ZERODATA | PTF_LEAF
1341 **         PTF_LEAFDATA | PTF_INTKEY
1342 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1343 */
1344 static int decodeFlags(MemPage *pPage, int flagByte){
1345   BtShared *pBt;     /* A copy of pPage->pBt */
1346 
1347   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1348   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1349   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1350   flagByte &= ~PTF_LEAF;
1351   pPage->childPtrSize = 4-4*pPage->leaf;
1352   pBt = pPage->pBt;
1353   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1354     pPage->intKey = 1;
1355     pPage->hasData = pPage->leaf;
1356     pPage->maxLocal = pBt->maxLeaf;
1357     pPage->minLocal = pBt->minLeaf;
1358   }else if( flagByte==PTF_ZERODATA ){
1359     pPage->intKey = 0;
1360     pPage->hasData = 0;
1361     pPage->maxLocal = pBt->maxLocal;
1362     pPage->minLocal = pBt->minLocal;
1363   }else{
1364     return SQLITE_CORRUPT_BKPT;
1365   }
1366   pPage->max1bytePayload = pBt->max1bytePayload;
1367   return SQLITE_OK;
1368 }
1369 
1370 /*
1371 ** Initialize the auxiliary information for a disk block.
1372 **
1373 ** Return SQLITE_OK on success.  If we see that the page does
1374 ** not contain a well-formed database page, then return
1375 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1376 ** guarantee that the page is well-formed.  It only shows that
1377 ** we failed to detect any corruption.
1378 */
1379 static int btreeInitPage(MemPage *pPage){
1380 
1381   assert( pPage->pBt!=0 );
1382   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1383   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1384   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1385   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1386 
1387   if( !pPage->isInit ){
1388     u16 pc;            /* Address of a freeblock within pPage->aData[] */
1389     u8 hdr;            /* Offset to beginning of page header */
1390     u8 *data;          /* Equal to pPage->aData */
1391     BtShared *pBt;        /* The main btree structure */
1392     int usableSize;    /* Amount of usable space on each page */
1393     u16 cellOffset;    /* Offset from start of page to first cell pointer */
1394     int nFree;         /* Number of unused bytes on the page */
1395     int top;           /* First byte of the cell content area */
1396     int iCellFirst;    /* First allowable cell or freeblock offset */
1397     int iCellLast;     /* Last possible cell or freeblock offset */
1398 
1399     pBt = pPage->pBt;
1400 
1401     hdr = pPage->hdrOffset;
1402     data = pPage->aData;
1403     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1404     assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1405     pPage->maskPage = (u16)(pBt->pageSize - 1);
1406     pPage->nOverflow = 0;
1407     usableSize = pBt->usableSize;
1408     pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1409     pPage->aDataEnd = &data[usableSize];
1410     pPage->aCellIdx = &data[cellOffset];
1411     top = get2byteNotZero(&data[hdr+5]);
1412     pPage->nCell = get2byte(&data[hdr+3]);
1413     if( pPage->nCell>MX_CELL(pBt) ){
1414       /* To many cells for a single page.  The page must be corrupt */
1415       return SQLITE_CORRUPT_BKPT;
1416     }
1417     testcase( pPage->nCell==MX_CELL(pBt) );
1418 
1419     /* A malformed database page might cause us to read past the end
1420     ** of page when parsing a cell.
1421     **
1422     ** The following block of code checks early to see if a cell extends
1423     ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1424     ** returned if it does.
1425     */
1426     iCellFirst = cellOffset + 2*pPage->nCell;
1427     iCellLast = usableSize - 4;
1428 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1429     {
1430       int i;            /* Index into the cell pointer array */
1431       int sz;           /* Size of a cell */
1432 
1433       if( !pPage->leaf ) iCellLast--;
1434       for(i=0; i<pPage->nCell; i++){
1435         pc = get2byte(&data[cellOffset+i*2]);
1436         testcase( pc==iCellFirst );
1437         testcase( pc==iCellLast );
1438         if( pc<iCellFirst || pc>iCellLast ){
1439           return SQLITE_CORRUPT_BKPT;
1440         }
1441         sz = cellSizePtr(pPage, &data[pc]);
1442         testcase( pc+sz==usableSize );
1443         if( pc+sz>usableSize ){
1444           return SQLITE_CORRUPT_BKPT;
1445         }
1446       }
1447       if( !pPage->leaf ) iCellLast++;
1448     }
1449 #endif
1450 
1451     /* Compute the total free space on the page */
1452     pc = get2byte(&data[hdr+1]);
1453     nFree = data[hdr+7] + top;
1454     while( pc>0 ){
1455       u16 next, size;
1456       if( pc<iCellFirst || pc>iCellLast ){
1457         /* Start of free block is off the page */
1458         return SQLITE_CORRUPT_BKPT;
1459       }
1460       next = get2byte(&data[pc]);
1461       size = get2byte(&data[pc+2]);
1462       if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
1463         /* Free blocks must be in ascending order. And the last byte of
1464 	** the free-block must lie on the database page.  */
1465         return SQLITE_CORRUPT_BKPT;
1466       }
1467       nFree = nFree + size;
1468       pc = next;
1469     }
1470 
1471     /* At this point, nFree contains the sum of the offset to the start
1472     ** of the cell-content area plus the number of free bytes within
1473     ** the cell-content area. If this is greater than the usable-size
1474     ** of the page, then the page must be corrupted. This check also
1475     ** serves to verify that the offset to the start of the cell-content
1476     ** area, according to the page header, lies within the page.
1477     */
1478     if( nFree>usableSize ){
1479       return SQLITE_CORRUPT_BKPT;
1480     }
1481     pPage->nFree = (u16)(nFree - iCellFirst);
1482     pPage->isInit = 1;
1483   }
1484   return SQLITE_OK;
1485 }
1486 
1487 /*
1488 ** Set up a raw page so that it looks like a database page holding
1489 ** no entries.
1490 */
1491 static void zeroPage(MemPage *pPage, int flags){
1492   unsigned char *data = pPage->aData;
1493   BtShared *pBt = pPage->pBt;
1494   u8 hdr = pPage->hdrOffset;
1495   u16 first;
1496 
1497   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1498   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1499   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1500   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1501   assert( sqlite3_mutex_held(pBt->mutex) );
1502   if( pBt->btsFlags & BTS_SECURE_DELETE ){
1503     memset(&data[hdr], 0, pBt->usableSize - hdr);
1504   }
1505   data[hdr] = (char)flags;
1506   first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0);
1507   memset(&data[hdr+1], 0, 4);
1508   data[hdr+7] = 0;
1509   put2byte(&data[hdr+5], pBt->usableSize);
1510   pPage->nFree = (u16)(pBt->usableSize - first);
1511   decodeFlags(pPage, flags);
1512   pPage->hdrOffset = hdr;
1513   pPage->cellOffset = first;
1514   pPage->aDataEnd = &data[pBt->usableSize];
1515   pPage->aCellIdx = &data[first];
1516   pPage->nOverflow = 0;
1517   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1518   pPage->maskPage = (u16)(pBt->pageSize - 1);
1519   pPage->nCell = 0;
1520   pPage->isInit = 1;
1521 }
1522 
1523 
1524 /*
1525 ** Convert a DbPage obtained from the pager into a MemPage used by
1526 ** the btree layer.
1527 */
1528 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1529   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1530   pPage->aData = sqlite3PagerGetData(pDbPage);
1531   pPage->pDbPage = pDbPage;
1532   pPage->pBt = pBt;
1533   pPage->pgno = pgno;
1534   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1535   return pPage;
1536 }
1537 
1538 /*
1539 ** Get a page from the pager.  Initialize the MemPage.pBt and
1540 ** MemPage.aData elements if needed.
1541 **
1542 ** If the noContent flag is set, it means that we do not care about
1543 ** the content of the page at this time.  So do not go to the disk
1544 ** to fetch the content.  Just fill in the content with zeros for now.
1545 ** If in the future we call sqlite3PagerWrite() on this page, that
1546 ** means we have started to be concerned about content and the disk
1547 ** read should occur at that point.
1548 */
1549 static int btreeGetPage(
1550   BtShared *pBt,       /* The btree */
1551   Pgno pgno,           /* Number of the page to fetch */
1552   MemPage **ppPage,    /* Return the page in this parameter */
1553   int noContent        /* Do not load page content if true */
1554 ){
1555   int rc;
1556   DbPage *pDbPage;
1557 
1558   assert( sqlite3_mutex_held(pBt->mutex) );
1559   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1560   if( rc ) return rc;
1561   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1562   return SQLITE_OK;
1563 }
1564 
1565 /*
1566 ** Retrieve a page from the pager cache. If the requested page is not
1567 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1568 ** MemPage.aData elements if needed.
1569 */
1570 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1571   DbPage *pDbPage;
1572   assert( sqlite3_mutex_held(pBt->mutex) );
1573   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1574   if( pDbPage ){
1575     return btreePageFromDbPage(pDbPage, pgno, pBt);
1576   }
1577   return 0;
1578 }
1579 
1580 /*
1581 ** Return the size of the database file in pages. If there is any kind of
1582 ** error, return ((unsigned int)-1).
1583 */
1584 static Pgno btreePagecount(BtShared *pBt){
1585   return pBt->nPage;
1586 }
1587 u32 sqlite3BtreeLastPage(Btree *p){
1588   assert( sqlite3BtreeHoldsMutex(p) );
1589   assert( ((p->pBt->nPage)&0x8000000)==0 );
1590   return (int)btreePagecount(p->pBt);
1591 }
1592 
1593 /*
1594 ** Get a page from the pager and initialize it.  This routine is just a
1595 ** convenience wrapper around separate calls to btreeGetPage() and
1596 ** btreeInitPage().
1597 **
1598 ** If an error occurs, then the value *ppPage is set to is undefined. It
1599 ** may remain unchanged, or it may be set to an invalid value.
1600 */
1601 static int getAndInitPage(
1602   BtShared *pBt,          /* The database file */
1603   Pgno pgno,           /* Number of the page to get */
1604   MemPage **ppPage     /* Write the page pointer here */
1605 ){
1606   int rc;
1607   assert( sqlite3_mutex_held(pBt->mutex) );
1608 
1609   if( pgno>btreePagecount(pBt) ){
1610     rc = SQLITE_CORRUPT_BKPT;
1611   }else{
1612     rc = btreeGetPage(pBt, pgno, ppPage, 0);
1613     if( rc==SQLITE_OK ){
1614       rc = btreeInitPage(*ppPage);
1615       if( rc!=SQLITE_OK ){
1616         releasePage(*ppPage);
1617       }
1618     }
1619   }
1620 
1621   testcase( pgno==0 );
1622   assert( pgno!=0 || rc==SQLITE_CORRUPT );
1623   return rc;
1624 }
1625 
1626 /*
1627 ** Release a MemPage.  This should be called once for each prior
1628 ** call to btreeGetPage.
1629 */
1630 static void releasePage(MemPage *pPage){
1631   if( pPage ){
1632     assert( pPage->aData );
1633     assert( pPage->pBt );
1634     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1635     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1636     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1637     sqlite3PagerUnref(pPage->pDbPage);
1638   }
1639 }
1640 
1641 /*
1642 ** During a rollback, when the pager reloads information into the cache
1643 ** so that the cache is restored to its original state at the start of
1644 ** the transaction, for each page restored this routine is called.
1645 **
1646 ** This routine needs to reset the extra data section at the end of the
1647 ** page to agree with the restored data.
1648 */
1649 static void pageReinit(DbPage *pData){
1650   MemPage *pPage;
1651   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1652   assert( sqlite3PagerPageRefcount(pData)>0 );
1653   if( pPage->isInit ){
1654     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1655     pPage->isInit = 0;
1656     if( sqlite3PagerPageRefcount(pData)>1 ){
1657       /* pPage might not be a btree page;  it might be an overflow page
1658       ** or ptrmap page or a free page.  In those cases, the following
1659       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
1660       ** But no harm is done by this.  And it is very important that
1661       ** btreeInitPage() be called on every btree page so we make
1662       ** the call for every page that comes in for re-initing. */
1663       btreeInitPage(pPage);
1664     }
1665   }
1666 }
1667 
1668 /*
1669 ** Invoke the busy handler for a btree.
1670 */
1671 static int btreeInvokeBusyHandler(void *pArg){
1672   BtShared *pBt = (BtShared*)pArg;
1673   assert( pBt->db );
1674   assert( sqlite3_mutex_held(pBt->db->mutex) );
1675   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1676 }
1677 
1678 /*
1679 ** Open a database file.
1680 **
1681 ** zFilename is the name of the database file.  If zFilename is NULL
1682 ** then an ephemeral database is created.  The ephemeral database might
1683 ** be exclusively in memory, or it might use a disk-based memory cache.
1684 ** Either way, the ephemeral database will be automatically deleted
1685 ** when sqlite3BtreeClose() is called.
1686 **
1687 ** If zFilename is ":memory:" then an in-memory database is created
1688 ** that is automatically destroyed when it is closed.
1689 **
1690 ** The "flags" parameter is a bitmask that might contain bits like
1691 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
1692 **
1693 ** If the database is already opened in the same database connection
1694 ** and we are in shared cache mode, then the open will fail with an
1695 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
1696 ** objects in the same database connection since doing so will lead
1697 ** to problems with locking.
1698 */
1699 int sqlite3BtreeOpen(
1700   sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
1701   const char *zFilename,  /* Name of the file containing the BTree database */
1702   sqlite3 *db,            /* Associated database handle */
1703   Btree **ppBtree,        /* Pointer to new Btree object written here */
1704   int flags,              /* Options */
1705   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
1706 ){
1707   BtShared *pBt = 0;             /* Shared part of btree structure */
1708   Btree *p;                      /* Handle to return */
1709   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
1710   int rc = SQLITE_OK;            /* Result code from this function */
1711   u8 nReserve;                   /* Byte of unused space on each page */
1712   unsigned char zDbHeader[100];  /* Database header content */
1713 
1714   /* True if opening an ephemeral, temporary database */
1715   const int isTempDb = zFilename==0 || zFilename[0]==0;
1716 
1717   /* Set the variable isMemdb to true for an in-memory database, or
1718   ** false for a file-based database.
1719   */
1720 #ifdef SQLITE_OMIT_MEMORYDB
1721   const int isMemdb = 0;
1722 #else
1723   const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
1724                        || (isTempDb && sqlite3TempInMemory(db));
1725 #endif
1726 
1727   assert( db!=0 );
1728   assert( pVfs!=0 );
1729   assert( sqlite3_mutex_held(db->mutex) );
1730   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
1731 
1732   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
1733   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
1734 
1735   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
1736   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
1737 
1738   if( isMemdb ){
1739     flags |= BTREE_MEMORY;
1740   }
1741   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
1742     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
1743   }
1744   p = sqlite3MallocZero(sizeof(Btree));
1745   if( !p ){
1746     return SQLITE_NOMEM;
1747   }
1748   p->inTrans = TRANS_NONE;
1749   p->db = db;
1750 #ifndef SQLITE_OMIT_SHARED_CACHE
1751   p->lock.pBtree = p;
1752   p->lock.iTable = 1;
1753 #endif
1754 
1755 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1756   /*
1757   ** If this Btree is a candidate for shared cache, try to find an
1758   ** existing BtShared object that we can share with
1759   */
1760   if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
1761     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
1762       int nFullPathname = pVfs->mxPathname+1;
1763       char *zFullPathname = sqlite3Malloc(nFullPathname);
1764       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
1765       p->sharable = 1;
1766       if( !zFullPathname ){
1767         sqlite3_free(p);
1768         return SQLITE_NOMEM;
1769       }
1770       if( isMemdb ){
1771         memcpy(zFullPathname, zFilename, sqlite3Strlen30(zFilename)+1);
1772       }else{
1773         rc = sqlite3OsFullPathname(pVfs, zFilename,
1774                                    nFullPathname, zFullPathname);
1775         if( rc ){
1776           sqlite3_free(zFullPathname);
1777           sqlite3_free(p);
1778           return rc;
1779         }
1780       }
1781 #if SQLITE_THREADSAFE
1782       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1783       sqlite3_mutex_enter(mutexOpen);
1784       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1785       sqlite3_mutex_enter(mutexShared);
1786 #endif
1787       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1788         assert( pBt->nRef>0 );
1789         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
1790                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1791           int iDb;
1792           for(iDb=db->nDb-1; iDb>=0; iDb--){
1793             Btree *pExisting = db->aDb[iDb].pBt;
1794             if( pExisting && pExisting->pBt==pBt ){
1795               sqlite3_mutex_leave(mutexShared);
1796               sqlite3_mutex_leave(mutexOpen);
1797               sqlite3_free(zFullPathname);
1798               sqlite3_free(p);
1799               return SQLITE_CONSTRAINT;
1800             }
1801           }
1802           p->pBt = pBt;
1803           pBt->nRef++;
1804           break;
1805         }
1806       }
1807       sqlite3_mutex_leave(mutexShared);
1808       sqlite3_free(zFullPathname);
1809     }
1810 #ifdef SQLITE_DEBUG
1811     else{
1812       /* In debug mode, we mark all persistent databases as sharable
1813       ** even when they are not.  This exercises the locking code and
1814       ** gives more opportunity for asserts(sqlite3_mutex_held())
1815       ** statements to find locking problems.
1816       */
1817       p->sharable = 1;
1818     }
1819 #endif
1820   }
1821 #endif
1822   if( pBt==0 ){
1823     /*
1824     ** The following asserts make sure that structures used by the btree are
1825     ** the right size.  This is to guard against size changes that result
1826     ** when compiling on a different architecture.
1827     */
1828     assert( sizeof(i64)==8 || sizeof(i64)==4 );
1829     assert( sizeof(u64)==8 || sizeof(u64)==4 );
1830     assert( sizeof(u32)==4 );
1831     assert( sizeof(u16)==2 );
1832     assert( sizeof(Pgno)==4 );
1833 
1834     pBt = sqlite3MallocZero( sizeof(*pBt) );
1835     if( pBt==0 ){
1836       rc = SQLITE_NOMEM;
1837       goto btree_open_out;
1838     }
1839     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1840                           EXTRA_SIZE, flags, vfsFlags, pageReinit);
1841     if( rc==SQLITE_OK ){
1842       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1843     }
1844     if( rc!=SQLITE_OK ){
1845       goto btree_open_out;
1846     }
1847     pBt->openFlags = (u8)flags;
1848     pBt->db = db;
1849     sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
1850     p->pBt = pBt;
1851 
1852     pBt->pCursor = 0;
1853     pBt->pPage1 = 0;
1854     if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
1855 #ifdef SQLITE_SECURE_DELETE
1856     pBt->btsFlags |= BTS_SECURE_DELETE;
1857 #endif
1858     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
1859     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1860          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1861       pBt->pageSize = 0;
1862 #ifndef SQLITE_OMIT_AUTOVACUUM
1863       /* If the magic name ":memory:" will create an in-memory database, then
1864       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1865       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1866       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1867       ** regular file-name. In this case the auto-vacuum applies as per normal.
1868       */
1869       if( zFilename && !isMemdb ){
1870         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1871         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1872       }
1873 #endif
1874       nReserve = 0;
1875     }else{
1876       nReserve = zDbHeader[20];
1877       pBt->btsFlags |= BTS_PAGESIZE_FIXED;
1878 #ifndef SQLITE_OMIT_AUTOVACUUM
1879       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1880       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1881 #endif
1882     }
1883     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
1884     if( rc ) goto btree_open_out;
1885     pBt->usableSize = pBt->pageSize - nReserve;
1886     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
1887 
1888 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1889     /* Add the new BtShared object to the linked list sharable BtShareds.
1890     */
1891     if( p->sharable ){
1892       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
1893       pBt->nRef = 1;
1894       MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
1895       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1896         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1897         if( pBt->mutex==0 ){
1898           rc = SQLITE_NOMEM;
1899           db->mallocFailed = 0;
1900           goto btree_open_out;
1901         }
1902       }
1903       sqlite3_mutex_enter(mutexShared);
1904       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1905       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1906       sqlite3_mutex_leave(mutexShared);
1907     }
1908 #endif
1909   }
1910 
1911 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1912   /* If the new Btree uses a sharable pBtShared, then link the new
1913   ** Btree into the list of all sharable Btrees for the same connection.
1914   ** The list is kept in ascending order by pBt address.
1915   */
1916   if( p->sharable ){
1917     int i;
1918     Btree *pSib;
1919     for(i=0; i<db->nDb; i++){
1920       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1921         while( pSib->pPrev ){ pSib = pSib->pPrev; }
1922         if( p->pBt<pSib->pBt ){
1923           p->pNext = pSib;
1924           p->pPrev = 0;
1925           pSib->pPrev = p;
1926         }else{
1927           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1928             pSib = pSib->pNext;
1929           }
1930           p->pNext = pSib->pNext;
1931           p->pPrev = pSib;
1932           if( p->pNext ){
1933             p->pNext->pPrev = p;
1934           }
1935           pSib->pNext = p;
1936         }
1937         break;
1938       }
1939     }
1940   }
1941 #endif
1942   *ppBtree = p;
1943 
1944 btree_open_out:
1945   if( rc!=SQLITE_OK ){
1946     if( pBt && pBt->pPager ){
1947       sqlite3PagerClose(pBt->pPager);
1948     }
1949     sqlite3_free(pBt);
1950     sqlite3_free(p);
1951     *ppBtree = 0;
1952   }else{
1953     /* If the B-Tree was successfully opened, set the pager-cache size to the
1954     ** default value. Except, when opening on an existing shared pager-cache,
1955     ** do not change the pager-cache size.
1956     */
1957     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
1958       sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
1959     }
1960   }
1961   if( mutexOpen ){
1962     assert( sqlite3_mutex_held(mutexOpen) );
1963     sqlite3_mutex_leave(mutexOpen);
1964   }
1965   return rc;
1966 }
1967 
1968 /*
1969 ** Decrement the BtShared.nRef counter.  When it reaches zero,
1970 ** remove the BtShared structure from the sharing list.  Return
1971 ** true if the BtShared.nRef counter reaches zero and return
1972 ** false if it is still positive.
1973 */
1974 static int removeFromSharingList(BtShared *pBt){
1975 #ifndef SQLITE_OMIT_SHARED_CACHE
1976   MUTEX_LOGIC( sqlite3_mutex *pMaster; )
1977   BtShared *pList;
1978   int removed = 0;
1979 
1980   assert( sqlite3_mutex_notheld(pBt->mutex) );
1981   MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
1982   sqlite3_mutex_enter(pMaster);
1983   pBt->nRef--;
1984   if( pBt->nRef<=0 ){
1985     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1986       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
1987     }else{
1988       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
1989       while( ALWAYS(pList) && pList->pNext!=pBt ){
1990         pList=pList->pNext;
1991       }
1992       if( ALWAYS(pList) ){
1993         pList->pNext = pBt->pNext;
1994       }
1995     }
1996     if( SQLITE_THREADSAFE ){
1997       sqlite3_mutex_free(pBt->mutex);
1998     }
1999     removed = 1;
2000   }
2001   sqlite3_mutex_leave(pMaster);
2002   return removed;
2003 #else
2004   return 1;
2005 #endif
2006 }
2007 
2008 /*
2009 ** Make sure pBt->pTmpSpace points to an allocation of
2010 ** MX_CELL_SIZE(pBt) bytes.
2011 */
2012 static void allocateTempSpace(BtShared *pBt){
2013   if( !pBt->pTmpSpace ){
2014     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2015   }
2016 }
2017 
2018 /*
2019 ** Free the pBt->pTmpSpace allocation
2020 */
2021 static void freeTempSpace(BtShared *pBt){
2022   sqlite3PageFree( pBt->pTmpSpace);
2023   pBt->pTmpSpace = 0;
2024 }
2025 
2026 /*
2027 ** Close an open database and invalidate all cursors.
2028 */
2029 int sqlite3BtreeClose(Btree *p){
2030   BtShared *pBt = p->pBt;
2031   BtCursor *pCur;
2032 
2033   /* Close all cursors opened via this handle.  */
2034   assert( sqlite3_mutex_held(p->db->mutex) );
2035   sqlite3BtreeEnter(p);
2036   pCur = pBt->pCursor;
2037   while( pCur ){
2038     BtCursor *pTmp = pCur;
2039     pCur = pCur->pNext;
2040     if( pTmp->pBtree==p ){
2041       sqlite3BtreeCloseCursor(pTmp);
2042     }
2043   }
2044 
2045   /* Rollback any active transaction and free the handle structure.
2046   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2047   ** this handle.
2048   */
2049   sqlite3BtreeRollback(p, SQLITE_OK);
2050   sqlite3BtreeLeave(p);
2051 
2052   /* If there are still other outstanding references to the shared-btree
2053   ** structure, return now. The remainder of this procedure cleans
2054   ** up the shared-btree.
2055   */
2056   assert( p->wantToLock==0 && p->locked==0 );
2057   if( !p->sharable || removeFromSharingList(pBt) ){
2058     /* The pBt is no longer on the sharing list, so we can access
2059     ** it without having to hold the mutex.
2060     **
2061     ** Clean out and delete the BtShared object.
2062     */
2063     assert( !pBt->pCursor );
2064     sqlite3PagerClose(pBt->pPager);
2065     if( pBt->xFreeSchema && pBt->pSchema ){
2066       pBt->xFreeSchema(pBt->pSchema);
2067     }
2068     sqlite3DbFree(0, pBt->pSchema);
2069     freeTempSpace(pBt);
2070     sqlite3_free(pBt);
2071   }
2072 
2073 #ifndef SQLITE_OMIT_SHARED_CACHE
2074   assert( p->wantToLock==0 );
2075   assert( p->locked==0 );
2076   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2077   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2078 #endif
2079 
2080   sqlite3_free(p);
2081   return SQLITE_OK;
2082 }
2083 
2084 /*
2085 ** Change the limit on the number of pages allowed in the cache.
2086 **
2087 ** The maximum number of cache pages is set to the absolute
2088 ** value of mxPage.  If mxPage is negative, the pager will
2089 ** operate asynchronously - it will not stop to do fsync()s
2090 ** to insure data is written to the disk surface before
2091 ** continuing.  Transactions still work if synchronous is off,
2092 ** and the database cannot be corrupted if this program
2093 ** crashes.  But if the operating system crashes or there is
2094 ** an abrupt power failure when synchronous is off, the database
2095 ** could be left in an inconsistent and unrecoverable state.
2096 ** Synchronous is on by default so database corruption is not
2097 ** normally a worry.
2098 */
2099 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2100   BtShared *pBt = p->pBt;
2101   assert( sqlite3_mutex_held(p->db->mutex) );
2102   sqlite3BtreeEnter(p);
2103   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2104   sqlite3BtreeLeave(p);
2105   return SQLITE_OK;
2106 }
2107 
2108 /*
2109 ** Change the way data is synced to disk in order to increase or decrease
2110 ** how well the database resists damage due to OS crashes and power
2111 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2112 ** there is a high probability of damage)  Level 2 is the default.  There
2113 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2114 ** probability of damage to near zero but with a write performance reduction.
2115 */
2116 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2117 int sqlite3BtreeSetSafetyLevel(
2118   Btree *p,              /* The btree to set the safety level on */
2119   int level,             /* PRAGMA synchronous.  1=OFF, 2=NORMAL, 3=FULL */
2120   int fullSync,          /* PRAGMA fullfsync. */
2121   int ckptFullSync       /* PRAGMA checkpoint_fullfync */
2122 ){
2123   BtShared *pBt = p->pBt;
2124   assert( sqlite3_mutex_held(p->db->mutex) );
2125   assert( level>=1 && level<=3 );
2126   sqlite3BtreeEnter(p);
2127   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync, ckptFullSync);
2128   sqlite3BtreeLeave(p);
2129   return SQLITE_OK;
2130 }
2131 #endif
2132 
2133 /*
2134 ** Return TRUE if the given btree is set to safety level 1.  In other
2135 ** words, return TRUE if no sync() occurs on the disk files.
2136 */
2137 int sqlite3BtreeSyncDisabled(Btree *p){
2138   BtShared *pBt = p->pBt;
2139   int rc;
2140   assert( sqlite3_mutex_held(p->db->mutex) );
2141   sqlite3BtreeEnter(p);
2142   assert( pBt && pBt->pPager );
2143   rc = sqlite3PagerNosync(pBt->pPager);
2144   sqlite3BtreeLeave(p);
2145   return rc;
2146 }
2147 
2148 /*
2149 ** Change the default pages size and the number of reserved bytes per page.
2150 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2151 ** without changing anything.
2152 **
2153 ** The page size must be a power of 2 between 512 and 65536.  If the page
2154 ** size supplied does not meet this constraint then the page size is not
2155 ** changed.
2156 **
2157 ** Page sizes are constrained to be a power of two so that the region
2158 ** of the database file used for locking (beginning at PENDING_BYTE,
2159 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2160 ** at the beginning of a page.
2161 **
2162 ** If parameter nReserve is less than zero, then the number of reserved
2163 ** bytes per page is left unchanged.
2164 **
2165 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2166 ** and autovacuum mode can no longer be changed.
2167 */
2168 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2169   int rc = SQLITE_OK;
2170   BtShared *pBt = p->pBt;
2171   assert( nReserve>=-1 && nReserve<=255 );
2172   sqlite3BtreeEnter(p);
2173   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2174     sqlite3BtreeLeave(p);
2175     return SQLITE_READONLY;
2176   }
2177   if( nReserve<0 ){
2178     nReserve = pBt->pageSize - pBt->usableSize;
2179   }
2180   assert( nReserve>=0 && nReserve<=255 );
2181   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2182         ((pageSize-1)&pageSize)==0 ){
2183     assert( (pageSize & 7)==0 );
2184     assert( !pBt->pPage1 && !pBt->pCursor );
2185     pBt->pageSize = (u32)pageSize;
2186     freeTempSpace(pBt);
2187   }
2188   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2189   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2190   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2191   sqlite3BtreeLeave(p);
2192   return rc;
2193 }
2194 
2195 /*
2196 ** Return the currently defined page size
2197 */
2198 int sqlite3BtreeGetPageSize(Btree *p){
2199   return p->pBt->pageSize;
2200 }
2201 
2202 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
2203 /*
2204 ** Return the number of bytes of space at the end of every page that
2205 ** are intentually left unused.  This is the "reserved" space that is
2206 ** sometimes used by extensions.
2207 */
2208 int sqlite3BtreeGetReserve(Btree *p){
2209   int n;
2210   sqlite3BtreeEnter(p);
2211   n = p->pBt->pageSize - p->pBt->usableSize;
2212   sqlite3BtreeLeave(p);
2213   return n;
2214 }
2215 
2216 /*
2217 ** Set the maximum page count for a database if mxPage is positive.
2218 ** No changes are made if mxPage is 0 or negative.
2219 ** Regardless of the value of mxPage, return the maximum page count.
2220 */
2221 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2222   int n;
2223   sqlite3BtreeEnter(p);
2224   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2225   sqlite3BtreeLeave(p);
2226   return n;
2227 }
2228 
2229 /*
2230 ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1.  If newFlag is -1,
2231 ** then make no changes.  Always return the value of the BTS_SECURE_DELETE
2232 ** setting after the change.
2233 */
2234 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2235   int b;
2236   if( p==0 ) return 0;
2237   sqlite3BtreeEnter(p);
2238   if( newFlag>=0 ){
2239     p->pBt->btsFlags &= ~BTS_SECURE_DELETE;
2240     if( newFlag ) p->pBt->btsFlags |= BTS_SECURE_DELETE;
2241   }
2242   b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;
2243   sqlite3BtreeLeave(p);
2244   return b;
2245 }
2246 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
2247 
2248 /*
2249 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2250 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2251 ** is disabled. The default value for the auto-vacuum property is
2252 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2253 */
2254 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2255 #ifdef SQLITE_OMIT_AUTOVACUUM
2256   return SQLITE_READONLY;
2257 #else
2258   BtShared *pBt = p->pBt;
2259   int rc = SQLITE_OK;
2260   u8 av = (u8)autoVacuum;
2261 
2262   sqlite3BtreeEnter(p);
2263   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
2264     rc = SQLITE_READONLY;
2265   }else{
2266     pBt->autoVacuum = av ?1:0;
2267     pBt->incrVacuum = av==2 ?1:0;
2268   }
2269   sqlite3BtreeLeave(p);
2270   return rc;
2271 #endif
2272 }
2273 
2274 /*
2275 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2276 ** enabled 1 is returned. Otherwise 0.
2277 */
2278 int sqlite3BtreeGetAutoVacuum(Btree *p){
2279 #ifdef SQLITE_OMIT_AUTOVACUUM
2280   return BTREE_AUTOVACUUM_NONE;
2281 #else
2282   int rc;
2283   sqlite3BtreeEnter(p);
2284   rc = (
2285     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2286     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2287     BTREE_AUTOVACUUM_INCR
2288   );
2289   sqlite3BtreeLeave(p);
2290   return rc;
2291 #endif
2292 }
2293 
2294 
2295 /*
2296 ** Get a reference to pPage1 of the database file.  This will
2297 ** also acquire a readlock on that file.
2298 **
2299 ** SQLITE_OK is returned on success.  If the file is not a
2300 ** well-formed database file, then SQLITE_CORRUPT is returned.
2301 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
2302 ** is returned if we run out of memory.
2303 */
2304 static int lockBtree(BtShared *pBt){
2305   int rc;              /* Result code from subfunctions */
2306   MemPage *pPage1;     /* Page 1 of the database file */
2307   int nPage;           /* Number of pages in the database */
2308   int nPageFile = 0;   /* Number of pages in the database file */
2309   int nPageHeader;     /* Number of pages in the database according to hdr */
2310 
2311   assert( sqlite3_mutex_held(pBt->mutex) );
2312   assert( pBt->pPage1==0 );
2313   rc = sqlite3PagerSharedLock(pBt->pPager);
2314   if( rc!=SQLITE_OK ) return rc;
2315   rc = btreeGetPage(pBt, 1, &pPage1, 0);
2316   if( rc!=SQLITE_OK ) return rc;
2317 
2318   /* Do some checking to help insure the file we opened really is
2319   ** a valid database file.
2320   */
2321   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
2322   sqlite3PagerPagecount(pBt->pPager, &nPageFile);
2323   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
2324     nPage = nPageFile;
2325   }
2326   if( nPage>0 ){
2327     u32 pageSize;
2328     u32 usableSize;
2329     u8 *page1 = pPage1->aData;
2330     rc = SQLITE_NOTADB;
2331     if( memcmp(page1, zMagicHeader, 16)!=0 ){
2332       goto page1_init_failed;
2333     }
2334 
2335 #ifdef SQLITE_OMIT_WAL
2336     if( page1[18]>1 ){
2337       pBt->btsFlags |= BTS_READ_ONLY;
2338     }
2339     if( page1[19]>1 ){
2340       goto page1_init_failed;
2341     }
2342 #else
2343     if( page1[18]>2 ){
2344       pBt->btsFlags |= BTS_READ_ONLY;
2345     }
2346     if( page1[19]>2 ){
2347       goto page1_init_failed;
2348     }
2349 
2350     /* If the write version is set to 2, this database should be accessed
2351     ** in WAL mode. If the log is not already open, open it now. Then
2352     ** return SQLITE_OK and return without populating BtShared.pPage1.
2353     ** The caller detects this and calls this function again. This is
2354     ** required as the version of page 1 currently in the page1 buffer
2355     ** may not be the latest version - there may be a newer one in the log
2356     ** file.
2357     */
2358     if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
2359       int isOpen = 0;
2360       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
2361       if( rc!=SQLITE_OK ){
2362         goto page1_init_failed;
2363       }else if( isOpen==0 ){
2364         releasePage(pPage1);
2365         return SQLITE_OK;
2366       }
2367       rc = SQLITE_NOTADB;
2368     }
2369 #endif
2370 
2371     /* The maximum embedded fraction must be exactly 25%.  And the minimum
2372     ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
2373     ** The original design allowed these amounts to vary, but as of
2374     ** version 3.6.0, we require them to be fixed.
2375     */
2376     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2377       goto page1_init_failed;
2378     }
2379     pageSize = (page1[16]<<8) | (page1[17]<<16);
2380     if( ((pageSize-1)&pageSize)!=0
2381      || pageSize>SQLITE_MAX_PAGE_SIZE
2382      || pageSize<=256
2383     ){
2384       goto page1_init_failed;
2385     }
2386     assert( (pageSize & 7)==0 );
2387     usableSize = pageSize - page1[20];
2388     if( (u32)pageSize!=pBt->pageSize ){
2389       /* After reading the first page of the database assuming a page size
2390       ** of BtShared.pageSize, we have discovered that the page-size is
2391       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2392       ** zero and return SQLITE_OK. The caller will call this function
2393       ** again with the correct page-size.
2394       */
2395       releasePage(pPage1);
2396       pBt->usableSize = usableSize;
2397       pBt->pageSize = pageSize;
2398       freeTempSpace(pBt);
2399       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2400                                    pageSize-usableSize);
2401       return rc;
2402     }
2403     if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
2404       rc = SQLITE_CORRUPT_BKPT;
2405       goto page1_init_failed;
2406     }
2407     if( usableSize<480 ){
2408       goto page1_init_failed;
2409     }
2410     pBt->pageSize = pageSize;
2411     pBt->usableSize = usableSize;
2412 #ifndef SQLITE_OMIT_AUTOVACUUM
2413     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2414     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2415 #endif
2416   }
2417 
2418   /* maxLocal is the maximum amount of payload to store locally for
2419   ** a cell.  Make sure it is small enough so that at least minFanout
2420   ** cells can will fit on one page.  We assume a 10-byte page header.
2421   ** Besides the payload, the cell must store:
2422   **     2-byte pointer to the cell
2423   **     4-byte child pointer
2424   **     9-byte nKey value
2425   **     4-byte nData value
2426   **     4-byte overflow page pointer
2427   ** So a cell consists of a 2-byte pointer, a header which is as much as
2428   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2429   ** page pointer.
2430   */
2431   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
2432   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
2433   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
2434   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
2435   if( pBt->maxLocal>127 ){
2436     pBt->max1bytePayload = 127;
2437   }else{
2438     pBt->max1bytePayload = (u8)pBt->maxLocal;
2439   }
2440   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2441   pBt->pPage1 = pPage1;
2442   pBt->nPage = nPage;
2443   return SQLITE_OK;
2444 
2445 page1_init_failed:
2446   releasePage(pPage1);
2447   pBt->pPage1 = 0;
2448   return rc;
2449 }
2450 
2451 /*
2452 ** If there are no outstanding cursors and we are not in the middle
2453 ** of a transaction but there is a read lock on the database, then
2454 ** this routine unrefs the first page of the database file which
2455 ** has the effect of releasing the read lock.
2456 **
2457 ** If there is a transaction in progress, this routine is a no-op.
2458 */
2459 static void unlockBtreeIfUnused(BtShared *pBt){
2460   assert( sqlite3_mutex_held(pBt->mutex) );
2461   assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE );
2462   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
2463     assert( pBt->pPage1->aData );
2464     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
2465     assert( pBt->pPage1->aData );
2466     releasePage(pBt->pPage1);
2467     pBt->pPage1 = 0;
2468   }
2469 }
2470 
2471 /*
2472 ** If pBt points to an empty file then convert that empty file
2473 ** into a new empty database by initializing the first page of
2474 ** the database.
2475 */
2476 static int newDatabase(BtShared *pBt){
2477   MemPage *pP1;
2478   unsigned char *data;
2479   int rc;
2480 
2481   assert( sqlite3_mutex_held(pBt->mutex) );
2482   if( pBt->nPage>0 ){
2483     return SQLITE_OK;
2484   }
2485   pP1 = pBt->pPage1;
2486   assert( pP1!=0 );
2487   data = pP1->aData;
2488   rc = sqlite3PagerWrite(pP1->pDbPage);
2489   if( rc ) return rc;
2490   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
2491   assert( sizeof(zMagicHeader)==16 );
2492   data[16] = (u8)((pBt->pageSize>>8)&0xff);
2493   data[17] = (u8)((pBt->pageSize>>16)&0xff);
2494   data[18] = 1;
2495   data[19] = 1;
2496   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
2497   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
2498   data[21] = 64;
2499   data[22] = 32;
2500   data[23] = 32;
2501   memset(&data[24], 0, 100-24);
2502   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
2503   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2504 #ifndef SQLITE_OMIT_AUTOVACUUM
2505   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
2506   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
2507   put4byte(&data[36 + 4*4], pBt->autoVacuum);
2508   put4byte(&data[36 + 7*4], pBt->incrVacuum);
2509 #endif
2510   pBt->nPage = 1;
2511   data[31] = 1;
2512   return SQLITE_OK;
2513 }
2514 
2515 /*
2516 ** Attempt to start a new transaction. A write-transaction
2517 ** is started if the second argument is nonzero, otherwise a read-
2518 ** transaction.  If the second argument is 2 or more and exclusive
2519 ** transaction is started, meaning that no other process is allowed
2520 ** to access the database.  A preexisting transaction may not be
2521 ** upgraded to exclusive by calling this routine a second time - the
2522 ** exclusivity flag only works for a new transaction.
2523 **
2524 ** A write-transaction must be started before attempting any
2525 ** changes to the database.  None of the following routines
2526 ** will work unless a transaction is started first:
2527 **
2528 **      sqlite3BtreeCreateTable()
2529 **      sqlite3BtreeCreateIndex()
2530 **      sqlite3BtreeClearTable()
2531 **      sqlite3BtreeDropTable()
2532 **      sqlite3BtreeInsert()
2533 **      sqlite3BtreeDelete()
2534 **      sqlite3BtreeUpdateMeta()
2535 **
2536 ** If an initial attempt to acquire the lock fails because of lock contention
2537 ** and the database was previously unlocked, then invoke the busy handler
2538 ** if there is one.  But if there was previously a read-lock, do not
2539 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
2540 ** returned when there is already a read-lock in order to avoid a deadlock.
2541 **
2542 ** Suppose there are two processes A and B.  A has a read lock and B has
2543 ** a reserved lock.  B tries to promote to exclusive but is blocked because
2544 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
2545 ** One or the other of the two processes must give way or there can be
2546 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
2547 ** when A already has a read lock, we encourage A to give up and let B
2548 ** proceed.
2549 */
2550 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
2551   sqlite3 *pBlock = 0;
2552   BtShared *pBt = p->pBt;
2553   int rc = SQLITE_OK;
2554 
2555   sqlite3BtreeEnter(p);
2556   btreeIntegrity(p);
2557 
2558   /* If the btree is already in a write-transaction, or it
2559   ** is already in a read-transaction and a read-transaction
2560   ** is requested, this is a no-op.
2561   */
2562   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
2563     goto trans_begun;
2564   }
2565 
2566   /* Write transactions are not possible on a read-only database */
2567   if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
2568     rc = SQLITE_READONLY;
2569     goto trans_begun;
2570   }
2571 
2572 #ifndef SQLITE_OMIT_SHARED_CACHE
2573   /* If another database handle has already opened a write transaction
2574   ** on this shared-btree structure and a second write transaction is
2575   ** requested, return SQLITE_LOCKED.
2576   */
2577   if( (wrflag && pBt->inTransaction==TRANS_WRITE)
2578    || (pBt->btsFlags & BTS_PENDING)!=0
2579   ){
2580     pBlock = pBt->pWriter->db;
2581   }else if( wrflag>1 ){
2582     BtLock *pIter;
2583     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
2584       if( pIter->pBtree!=p ){
2585         pBlock = pIter->pBtree->db;
2586         break;
2587       }
2588     }
2589   }
2590   if( pBlock ){
2591     sqlite3ConnectionBlocked(p->db, pBlock);
2592     rc = SQLITE_LOCKED_SHAREDCACHE;
2593     goto trans_begun;
2594   }
2595 #endif
2596 
2597   /* Any read-only or read-write transaction implies a read-lock on
2598   ** page 1. So if some other shared-cache client already has a write-lock
2599   ** on page 1, the transaction cannot be opened. */
2600   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
2601   if( SQLITE_OK!=rc ) goto trans_begun;
2602 
2603   pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
2604   if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
2605   do {
2606     /* Call lockBtree() until either pBt->pPage1 is populated or
2607     ** lockBtree() returns something other than SQLITE_OK. lockBtree()
2608     ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
2609     ** reading page 1 it discovers that the page-size of the database
2610     ** file is not pBt->pageSize. In this case lockBtree() will update
2611     ** pBt->pageSize to the page-size of the file on disk.
2612     */
2613     while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
2614 
2615     if( rc==SQLITE_OK && wrflag ){
2616       if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
2617         rc = SQLITE_READONLY;
2618       }else{
2619         rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
2620         if( rc==SQLITE_OK ){
2621           rc = newDatabase(pBt);
2622         }
2623       }
2624     }
2625 
2626     if( rc!=SQLITE_OK ){
2627       unlockBtreeIfUnused(pBt);
2628     }
2629   }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
2630           btreeInvokeBusyHandler(pBt) );
2631 
2632   if( rc==SQLITE_OK ){
2633     if( p->inTrans==TRANS_NONE ){
2634       pBt->nTransaction++;
2635 #ifndef SQLITE_OMIT_SHARED_CACHE
2636       if( p->sharable ){
2637 	assert( p->lock.pBtree==p && p->lock.iTable==1 );
2638         p->lock.eLock = READ_LOCK;
2639         p->lock.pNext = pBt->pLock;
2640         pBt->pLock = &p->lock;
2641       }
2642 #endif
2643     }
2644     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2645     if( p->inTrans>pBt->inTransaction ){
2646       pBt->inTransaction = p->inTrans;
2647     }
2648     if( wrflag ){
2649       MemPage *pPage1 = pBt->pPage1;
2650 #ifndef SQLITE_OMIT_SHARED_CACHE
2651       assert( !pBt->pWriter );
2652       pBt->pWriter = p;
2653       pBt->btsFlags &= ~BTS_EXCLUSIVE;
2654       if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
2655 #endif
2656 
2657       /* If the db-size header field is incorrect (as it may be if an old
2658       ** client has been writing the database file), update it now. Doing
2659       ** this sooner rather than later means the database size can safely
2660       ** re-read the database size from page 1 if a savepoint or transaction
2661       ** rollback occurs within the transaction.
2662       */
2663       if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
2664         rc = sqlite3PagerWrite(pPage1->pDbPage);
2665         if( rc==SQLITE_OK ){
2666           put4byte(&pPage1->aData[28], pBt->nPage);
2667         }
2668       }
2669     }
2670   }
2671 
2672 
2673 trans_begun:
2674   if( rc==SQLITE_OK && wrflag ){
2675     /* This call makes sure that the pager has the correct number of
2676     ** open savepoints. If the second parameter is greater than 0 and
2677     ** the sub-journal is not already open, then it will be opened here.
2678     */
2679     rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
2680   }
2681 
2682   btreeIntegrity(p);
2683   sqlite3BtreeLeave(p);
2684   return rc;
2685 }
2686 
2687 #ifndef SQLITE_OMIT_AUTOVACUUM
2688 
2689 /*
2690 ** Set the pointer-map entries for all children of page pPage. Also, if
2691 ** pPage contains cells that point to overflow pages, set the pointer
2692 ** map entries for the overflow pages as well.
2693 */
2694 static int setChildPtrmaps(MemPage *pPage){
2695   int i;                             /* Counter variable */
2696   int nCell;                         /* Number of cells in page pPage */
2697   int rc;                            /* Return code */
2698   BtShared *pBt = pPage->pBt;
2699   u8 isInitOrig = pPage->isInit;
2700   Pgno pgno = pPage->pgno;
2701 
2702   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2703   rc = btreeInitPage(pPage);
2704   if( rc!=SQLITE_OK ){
2705     goto set_child_ptrmaps_out;
2706   }
2707   nCell = pPage->nCell;
2708 
2709   for(i=0; i<nCell; i++){
2710     u8 *pCell = findCell(pPage, i);
2711 
2712     ptrmapPutOvflPtr(pPage, pCell, &rc);
2713 
2714     if( !pPage->leaf ){
2715       Pgno childPgno = get4byte(pCell);
2716       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2717     }
2718   }
2719 
2720   if( !pPage->leaf ){
2721     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2722     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2723   }
2724 
2725 set_child_ptrmaps_out:
2726   pPage->isInit = isInitOrig;
2727   return rc;
2728 }
2729 
2730 /*
2731 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
2732 ** that it points to iTo. Parameter eType describes the type of pointer to
2733 ** be modified, as  follows:
2734 **
2735 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
2736 **                   page of pPage.
2737 **
2738 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2739 **                   page pointed to by one of the cells on pPage.
2740 **
2741 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2742 **                   overflow page in the list.
2743 */
2744 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2745   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2746   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2747   if( eType==PTRMAP_OVERFLOW2 ){
2748     /* The pointer is always the first 4 bytes of the page in this case.  */
2749     if( get4byte(pPage->aData)!=iFrom ){
2750       return SQLITE_CORRUPT_BKPT;
2751     }
2752     put4byte(pPage->aData, iTo);
2753   }else{
2754     u8 isInitOrig = pPage->isInit;
2755     int i;
2756     int nCell;
2757 
2758     btreeInitPage(pPage);
2759     nCell = pPage->nCell;
2760 
2761     for(i=0; i<nCell; i++){
2762       u8 *pCell = findCell(pPage, i);
2763       if( eType==PTRMAP_OVERFLOW1 ){
2764         CellInfo info;
2765         btreeParseCellPtr(pPage, pCell, &info);
2766         if( info.iOverflow
2767          && pCell+info.iOverflow+3<=pPage->aData+pPage->maskPage
2768          && iFrom==get4byte(&pCell[info.iOverflow])
2769         ){
2770           put4byte(&pCell[info.iOverflow], iTo);
2771           break;
2772         }
2773       }else{
2774         if( get4byte(pCell)==iFrom ){
2775           put4byte(pCell, iTo);
2776           break;
2777         }
2778       }
2779     }
2780 
2781     if( i==nCell ){
2782       if( eType!=PTRMAP_BTREE ||
2783           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2784         return SQLITE_CORRUPT_BKPT;
2785       }
2786       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2787     }
2788 
2789     pPage->isInit = isInitOrig;
2790   }
2791   return SQLITE_OK;
2792 }
2793 
2794 
2795 /*
2796 ** Move the open database page pDbPage to location iFreePage in the
2797 ** database. The pDbPage reference remains valid.
2798 **
2799 ** The isCommit flag indicates that there is no need to remember that
2800 ** the journal needs to be sync()ed before database page pDbPage->pgno
2801 ** can be written to. The caller has already promised not to write to that
2802 ** page.
2803 */
2804 static int relocatePage(
2805   BtShared *pBt,           /* Btree */
2806   MemPage *pDbPage,        /* Open page to move */
2807   u8 eType,                /* Pointer map 'type' entry for pDbPage */
2808   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
2809   Pgno iFreePage,          /* The location to move pDbPage to */
2810   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
2811 ){
2812   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
2813   Pgno iDbPage = pDbPage->pgno;
2814   Pager *pPager = pBt->pPager;
2815   int rc;
2816 
2817   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2818       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2819   assert( sqlite3_mutex_held(pBt->mutex) );
2820   assert( pDbPage->pBt==pBt );
2821 
2822   /* Move page iDbPage from its current location to page number iFreePage */
2823   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2824       iDbPage, iFreePage, iPtrPage, eType));
2825   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
2826   if( rc!=SQLITE_OK ){
2827     return rc;
2828   }
2829   pDbPage->pgno = iFreePage;
2830 
2831   /* If pDbPage was a btree-page, then it may have child pages and/or cells
2832   ** that point to overflow pages. The pointer map entries for all these
2833   ** pages need to be changed.
2834   **
2835   ** If pDbPage is an overflow page, then the first 4 bytes may store a
2836   ** pointer to a subsequent overflow page. If this is the case, then
2837   ** the pointer map needs to be updated for the subsequent overflow page.
2838   */
2839   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2840     rc = setChildPtrmaps(pDbPage);
2841     if( rc!=SQLITE_OK ){
2842       return rc;
2843     }
2844   }else{
2845     Pgno nextOvfl = get4byte(pDbPage->aData);
2846     if( nextOvfl!=0 ){
2847       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
2848       if( rc!=SQLITE_OK ){
2849         return rc;
2850       }
2851     }
2852   }
2853 
2854   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2855   ** that it points at iFreePage. Also fix the pointer map entry for
2856   ** iPtrPage.
2857   */
2858   if( eType!=PTRMAP_ROOTPAGE ){
2859     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2860     if( rc!=SQLITE_OK ){
2861       return rc;
2862     }
2863     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2864     if( rc!=SQLITE_OK ){
2865       releasePage(pPtrPage);
2866       return rc;
2867     }
2868     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2869     releasePage(pPtrPage);
2870     if( rc==SQLITE_OK ){
2871       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
2872     }
2873   }
2874   return rc;
2875 }
2876 
2877 /* Forward declaration required by incrVacuumStep(). */
2878 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2879 
2880 /*
2881 ** Perform a single step of an incremental-vacuum. If successful,
2882 ** return SQLITE_OK. If there is no work to do (and therefore no
2883 ** point in calling this function again), return SQLITE_DONE.
2884 **
2885 ** More specificly, this function attempts to re-organize the
2886 ** database so that the last page of the file currently in use
2887 ** is no longer in use.
2888 **
2889 ** If the nFin parameter is non-zero, this function assumes
2890 ** that the caller will keep calling incrVacuumStep() until
2891 ** it returns SQLITE_DONE or an error, and that nFin is the
2892 ** number of pages the database file will contain after this
2893 ** process is complete.  If nFin is zero, it is assumed that
2894 ** incrVacuumStep() will be called a finite amount of times
2895 ** which may or may not empty the freelist.  A full autovacuum
2896 ** has nFin>0.  A "PRAGMA incremental_vacuum" has nFin==0.
2897 */
2898 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){
2899   Pgno nFreeList;           /* Number of pages still on the free-list */
2900   int rc;
2901 
2902   assert( sqlite3_mutex_held(pBt->mutex) );
2903   assert( iLastPg>nFin );
2904 
2905   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2906     u8 eType;
2907     Pgno iPtrPage;
2908 
2909     nFreeList = get4byte(&pBt->pPage1->aData[36]);
2910     if( nFreeList==0 ){
2911       return SQLITE_DONE;
2912     }
2913 
2914     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2915     if( rc!=SQLITE_OK ){
2916       return rc;
2917     }
2918     if( eType==PTRMAP_ROOTPAGE ){
2919       return SQLITE_CORRUPT_BKPT;
2920     }
2921 
2922     if( eType==PTRMAP_FREEPAGE ){
2923       if( nFin==0 ){
2924         /* Remove the page from the files free-list. This is not required
2925         ** if nFin is non-zero. In that case, the free-list will be
2926         ** truncated to zero after this function returns, so it doesn't
2927         ** matter if it still contains some garbage entries.
2928         */
2929         Pgno iFreePg;
2930         MemPage *pFreePg;
2931         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2932         if( rc!=SQLITE_OK ){
2933           return rc;
2934         }
2935         assert( iFreePg==iLastPg );
2936         releasePage(pFreePg);
2937       }
2938     } else {
2939       Pgno iFreePg;             /* Index of free page to move pLastPg to */
2940       MemPage *pLastPg;
2941 
2942       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
2943       if( rc!=SQLITE_OK ){
2944         return rc;
2945       }
2946 
2947       /* If nFin is zero, this loop runs exactly once and page pLastPg
2948       ** is swapped with the first free page pulled off the free list.
2949       **
2950       ** On the other hand, if nFin is greater than zero, then keep
2951       ** looping until a free-page located within the first nFin pages
2952       ** of the file is found.
2953       */
2954       do {
2955         MemPage *pFreePg;
2956         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2957         if( rc!=SQLITE_OK ){
2958           releasePage(pLastPg);
2959           return rc;
2960         }
2961         releasePage(pFreePg);
2962       }while( nFin!=0 && iFreePg>nFin );
2963       assert( iFreePg<iLastPg );
2964 
2965       rc = sqlite3PagerWrite(pLastPg->pDbPage);
2966       if( rc==SQLITE_OK ){
2967         rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
2968       }
2969       releasePage(pLastPg);
2970       if( rc!=SQLITE_OK ){
2971         return rc;
2972       }
2973     }
2974   }
2975 
2976   if( nFin==0 ){
2977     iLastPg--;
2978     while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){
2979       if( PTRMAP_ISPAGE(pBt, iLastPg) ){
2980         MemPage *pPg;
2981         rc = btreeGetPage(pBt, iLastPg, &pPg, 0);
2982         if( rc!=SQLITE_OK ){
2983           return rc;
2984         }
2985         rc = sqlite3PagerWrite(pPg->pDbPage);
2986         releasePage(pPg);
2987         if( rc!=SQLITE_OK ){
2988           return rc;
2989         }
2990       }
2991       iLastPg--;
2992     }
2993     sqlite3PagerTruncateImage(pBt->pPager, iLastPg);
2994     pBt->nPage = iLastPg;
2995   }
2996   return SQLITE_OK;
2997 }
2998 
2999 /*
3000 ** A write-transaction must be opened before calling this function.
3001 ** It performs a single unit of work towards an incremental vacuum.
3002 **
3003 ** If the incremental vacuum is finished after this function has run,
3004 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3005 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3006 */
3007 int sqlite3BtreeIncrVacuum(Btree *p){
3008   int rc;
3009   BtShared *pBt = p->pBt;
3010 
3011   sqlite3BtreeEnter(p);
3012   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3013   if( !pBt->autoVacuum ){
3014     rc = SQLITE_DONE;
3015   }else{
3016     invalidateAllOverflowCache(pBt);
3017     rc = incrVacuumStep(pBt, 0, btreePagecount(pBt));
3018     if( rc==SQLITE_OK ){
3019       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3020       put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3021     }
3022   }
3023   sqlite3BtreeLeave(p);
3024   return rc;
3025 }
3026 
3027 /*
3028 ** This routine is called prior to sqlite3PagerCommit when a transaction
3029 ** is commited for an auto-vacuum database.
3030 **
3031 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3032 ** the database file should be truncated to during the commit process.
3033 ** i.e. the database has been reorganized so that only the first *pnTrunc
3034 ** pages are in use.
3035 */
3036 static int autoVacuumCommit(BtShared *pBt){
3037   int rc = SQLITE_OK;
3038   Pager *pPager = pBt->pPager;
3039   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
3040 
3041   assert( sqlite3_mutex_held(pBt->mutex) );
3042   invalidateAllOverflowCache(pBt);
3043   assert(pBt->autoVacuum);
3044   if( !pBt->incrVacuum ){
3045     Pgno nFin;         /* Number of pages in database after autovacuuming */
3046     Pgno nFree;        /* Number of pages on the freelist initially */
3047     Pgno nPtrmap;      /* Number of PtrMap pages to be freed */
3048     Pgno iFree;        /* The next page to be freed */
3049     int nEntry;        /* Number of entries on one ptrmap page */
3050     Pgno nOrig;        /* Database size before freeing */
3051 
3052     nOrig = btreePagecount(pBt);
3053     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3054       /* It is not possible to create a database for which the final page
3055       ** is either a pointer-map page or the pending-byte page. If one
3056       ** is encountered, this indicates corruption.
3057       */
3058       return SQLITE_CORRUPT_BKPT;
3059     }
3060 
3061     nFree = get4byte(&pBt->pPage1->aData[36]);
3062     nEntry = pBt->usableSize/5;
3063     nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3064     nFin = nOrig - nFree - nPtrmap;
3065     if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3066       nFin--;
3067     }
3068     while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3069       nFin--;
3070     }
3071     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3072 
3073     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3074       rc = incrVacuumStep(pBt, nFin, iFree);
3075     }
3076     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3077       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3078       put4byte(&pBt->pPage1->aData[32], 0);
3079       put4byte(&pBt->pPage1->aData[36], 0);
3080       put4byte(&pBt->pPage1->aData[28], nFin);
3081       sqlite3PagerTruncateImage(pBt->pPager, nFin);
3082       pBt->nPage = nFin;
3083     }
3084     if( rc!=SQLITE_OK ){
3085       sqlite3PagerRollback(pPager);
3086     }
3087   }
3088 
3089   assert( nRef==sqlite3PagerRefcount(pPager) );
3090   return rc;
3091 }
3092 
3093 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
/* When auto-vacuum support is compiled out (SQLITE_OMIT_AUTOVACUUM), any
** use of setChildPtrmaps() expands to the constant SQLITE_OK. */
# define setChildPtrmaps(x) SQLITE_OK
3095 #endif
3096 
3097 /*
3098 ** This routine does the first phase of a two-phase commit.  This routine
3099 ** causes a rollback journal to be created (if it does not already exist)
3100 ** and populated with enough information so that if a power loss occurs
3101 ** the database can be restored to its original state by playing back
3102 ** the journal.  Then the contents of the journal are flushed out to
3103 ** the disk.  After the journal is safely on oxide, the changes to the
3104 ** database are written into the database file and flushed to oxide.
3105 ** At the end of this call, the rollback journal still exists on the
3106 ** disk and we are still holding all locks, so the transaction has not
3107 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3108 ** commit process.
3109 **
3110 ** This call is a no-op if no write-transaction is currently active on pBt.
3111 **
3112 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3113 ** the name of a master journal file that should be written into the
3114 ** individual journal file, or is NULL, indicating no master journal file
3115 ** (single database transaction).
3116 **
3117 ** When this is called, the master journal should already have been
3118 ** created, populated with this journal pointer and synced to disk.
3119 **
3120 ** Once this is routine has returned, the only thing required to commit
3121 ** the write-transaction for this database file is to delete the journal.
3122 */
3123 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3124   int rc = SQLITE_OK;
3125   if( p->inTrans==TRANS_WRITE ){
3126     BtShared *pBt = p->pBt;
3127     sqlite3BtreeEnter(p);
3128 #ifndef SQLITE_OMIT_AUTOVACUUM
3129     if( pBt->autoVacuum ){
3130       rc = autoVacuumCommit(pBt);
3131       if( rc!=SQLITE_OK ){
3132         sqlite3BtreeLeave(p);
3133         return rc;
3134       }
3135     }
3136 #endif
3137     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3138     sqlite3BtreeLeave(p);
3139   }
3140   return rc;
3141 }
3142 
3143 /*
3144 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3145 ** at the conclusion of a transaction.
3146 */
3147 static void btreeEndTransaction(Btree *p){
3148   BtShared *pBt = p->pBt;
3149   assert( sqlite3BtreeHoldsMutex(p) );
3150 
3151   btreeClearHasContent(pBt);
3152   if( p->inTrans>TRANS_NONE && p->db->activeVdbeCnt>1 ){
3153     /* If there are other active statements that belong to this database
3154     ** handle, downgrade to a read-only transaction. The other statements
3155     ** may still be reading from the database.  */
3156     downgradeAllSharedCacheTableLocks(p);
3157     p->inTrans = TRANS_READ;
3158   }else{
3159     /* If the handle had any kind of transaction open, decrement the
3160     ** transaction count of the shared btree. If the transaction count
3161     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
3162     ** call below will unlock the pager.  */
3163     if( p->inTrans!=TRANS_NONE ){
3164       clearAllSharedCacheTableLocks(p);
3165       pBt->nTransaction--;
3166       if( 0==pBt->nTransaction ){
3167         pBt->inTransaction = TRANS_NONE;
3168       }
3169     }
3170 
3171     /* Set the current transaction state to TRANS_NONE and unlock the
3172     ** pager if this call closed the only read or write transaction.  */
3173     p->inTrans = TRANS_NONE;
3174     unlockBtreeIfUnused(pBt);
3175   }
3176 
3177   btreeIntegrity(p);
3178 }
3179 
3180 /*
3181 ** Commit the transaction currently in progress.
3182 **
3183 ** This routine implements the second phase of a 2-phase commit.  The
3184 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3185 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
3186 ** routine did all the work of writing information out to disk and flushing the
3187 ** contents so that they are written onto the disk platter.  All this
3188 ** routine has to do is delete or truncate or zero the header in the
3189 ** the rollback journal (which causes the transaction to commit) and
3190 ** drop locks.
3191 **
3192 ** Normally, if an error occurs while the pager layer is attempting to
3193 ** finalize the underlying journal file, this function returns an error and
3194 ** the upper layer will attempt a rollback. However, if the second argument
3195 ** is non-zero then this b-tree transaction is part of a multi-file
3196 ** transaction. In this case, the transaction has already been committed
3197 ** (by deleting a master journal file) and the caller will ignore this
3198 ** functions return code. So, even if an error occurs in the pager layer,
3199 ** reset the b-tree objects internal state to indicate that the write
3200 ** transaction has been closed. This is quite safe, as the pager will have
3201 ** transitioned to the error state.
3202 **
3203 ** This will release the write lock on the database file.  If there
3204 ** are no active cursors, it also releases the read lock.
3205 */
3206 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
3207 
3208   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
3209   sqlite3BtreeEnter(p);
3210   btreeIntegrity(p);
3211 
3212   /* If the handle has a write-transaction open, commit the shared-btrees
3213   ** transaction and set the shared state to TRANS_READ.
3214   */
3215   if( p->inTrans==TRANS_WRITE ){
3216     int rc;
3217     BtShared *pBt = p->pBt;
3218     assert( pBt->inTransaction==TRANS_WRITE );
3219     assert( pBt->nTransaction>0 );
3220     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3221     if( rc!=SQLITE_OK && bCleanup==0 ){
3222       sqlite3BtreeLeave(p);
3223       return rc;
3224     }
3225     pBt->inTransaction = TRANS_READ;
3226   }
3227 
3228   btreeEndTransaction(p);
3229   sqlite3BtreeLeave(p);
3230   return SQLITE_OK;
3231 }
3232 
3233 /*
3234 ** Do both phases of a commit.
3235 */
3236 int sqlite3BtreeCommit(Btree *p){
3237   int rc;
3238   sqlite3BtreeEnter(p);
3239   rc = sqlite3BtreeCommitPhaseOne(p, 0);
3240   if( rc==SQLITE_OK ){
3241     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
3242   }
3243   sqlite3BtreeLeave(p);
3244   return rc;
3245 }
3246 
3247 #ifndef NDEBUG
3248 /*
3249 ** Return the number of write-cursors open on this handle. This is for use
3250 ** in assert() expressions, so it is only compiled if NDEBUG is not
3251 ** defined.
3252 **
3253 ** For the purposes of this routine, a write-cursor is any cursor that
3254 ** is capable of writing to the databse.  That means the cursor was
3255 ** originally opened for writing and the cursor has not be disabled
3256 ** by having its state changed to CURSOR_FAULT.
3257 */
3258 static int countWriteCursors(BtShared *pBt){
3259   BtCursor *pCur;
3260   int r = 0;
3261   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3262     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
3263   }
3264   return r;
3265 }
3266 #endif
3267 
3268 /*
3269 ** This routine sets the state to CURSOR_FAULT and the error
3270 ** code to errCode for every cursor on BtShared that pBtree
3271 ** references.
3272 **
3273 ** Every cursor is tripped, including cursors that belong
3274 ** to other database connections that happen to be sharing
3275 ** the cache with pBtree.
3276 **
3277 ** This routine gets called when a rollback occurs.
3278 ** All cursors using the same cache must be tripped
3279 ** to prevent them from trying to use the btree after
3280 ** the rollback.  The rollback may have deleted tables
3281 ** or moved root pages, so it is not sufficient to
3282 ** save the state of the cursor.  The cursor must be
3283 ** invalidated.
3284 */
3285 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
3286   BtCursor *p;
3287   if( pBtree==0 ) return;
3288   sqlite3BtreeEnter(pBtree);
3289   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
3290     int i;
3291     sqlite3BtreeClearCursor(p);
3292     p->eState = CURSOR_FAULT;
3293     p->skipNext = errCode;
3294     for(i=0; i<=p->iPage; i++){
3295       releasePage(p->apPage[i]);
3296       p->apPage[i] = 0;
3297     }
3298   }
3299   sqlite3BtreeLeave(pBtree);
3300 }
3301 
3302 /*
3303 ** Rollback the transaction in progress.  All cursors will be
3304 ** invalided by this operation.  Any attempt to use a cursor
3305 ** that was open at the beginning of this operation will result
3306 ** in an error.
3307 **
3308 ** This will release the write lock on the database file.  If there
3309 ** are no active cursors, it also releases the read lock.
3310 */
3311 int sqlite3BtreeRollback(Btree *p, int tripCode){
3312   int rc;
3313   BtShared *pBt = p->pBt;
3314   MemPage *pPage1;
3315 
3316   sqlite3BtreeEnter(p);
3317   if( tripCode==SQLITE_OK ){
3318     rc = tripCode = saveAllCursors(pBt, 0, 0);
3319   }else{
3320     rc = SQLITE_OK;
3321   }
3322   if( tripCode ){
3323     sqlite3BtreeTripAllCursors(p, tripCode);
3324   }
3325   btreeIntegrity(p);
3326 
3327   if( p->inTrans==TRANS_WRITE ){
3328     int rc2;
3329 
3330     assert( TRANS_WRITE==pBt->inTransaction );
3331     rc2 = sqlite3PagerRollback(pBt->pPager);
3332     if( rc2!=SQLITE_OK ){
3333       rc = rc2;
3334     }
3335 
3336     /* The rollback may have destroyed the pPage1->aData value.  So
3337     ** call btreeGetPage() on page 1 again to make
3338     ** sure pPage1->aData is set correctly. */
3339     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
3340       int nPage = get4byte(28+(u8*)pPage1->aData);
3341       testcase( nPage==0 );
3342       if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
3343       testcase( pBt->nPage!=nPage );
3344       pBt->nPage = nPage;
3345       releasePage(pPage1);
3346     }
3347     assert( countWriteCursors(pBt)==0 );
3348     pBt->inTransaction = TRANS_READ;
3349   }
3350 
3351   btreeEndTransaction(p);
3352   sqlite3BtreeLeave(p);
3353   return rc;
3354 }
3355 
3356 /*
3357 ** Start a statement subtransaction. The subtransaction can can be rolled
3358 ** back independently of the main transaction. You must start a transaction
3359 ** before starting a subtransaction. The subtransaction is ended automatically
3360 ** if the main transaction commits or rolls back.
3361 **
3362 ** Statement subtransactions are used around individual SQL statements
3363 ** that are contained within a BEGIN...COMMIT block.  If a constraint
3364 ** error occurs within the statement, the effect of that one statement
3365 ** can be rolled back without having to rollback the entire transaction.
3366 **
3367 ** A statement sub-transaction is implemented as an anonymous savepoint. The
3368 ** value passed as the second parameter is the total number of savepoints,
3369 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3370 ** are no active savepoints and no other statement-transactions open,
3371 ** iStatement is 1. This anonymous savepoint can be released or rolled back
3372 ** using the sqlite3BtreeSavepoint() function.
3373 */
3374 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
3375   int rc;
3376   BtShared *pBt = p->pBt;
3377   sqlite3BtreeEnter(p);
3378   assert( p->inTrans==TRANS_WRITE );
3379   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
3380   assert( iStatement>0 );
3381   assert( iStatement>p->db->nSavepoint );
3382   assert( pBt->inTransaction==TRANS_WRITE );
3383   /* At the pager level, a statement transaction is a savepoint with
3384   ** an index greater than all savepoints created explicitly using
3385   ** SQL statements. It is illegal to open, release or rollback any
3386   ** such savepoints while the statement transaction savepoint is active.
3387   */
3388   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
3389   sqlite3BtreeLeave(p);
3390   return rc;
3391 }
3392 
3393 /*
3394 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
3395 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
3396 ** savepoint identified by parameter iSavepoint, depending on the value
3397 ** of op.
3398 **
3399 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
3400 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
3401 ** contents of the entire transaction are rolled back. This is different
3402 ** from a normal transaction rollback, as no locks are released and the
3403 ** transaction remains open.
3404 */
3405 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
3406   int rc = SQLITE_OK;
3407   if( p && p->inTrans==TRANS_WRITE ){
3408     BtShared *pBt = p->pBt;
3409     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
3410     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
3411     sqlite3BtreeEnter(p);
3412     rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
3413     if( rc==SQLITE_OK ){
3414       if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
3415         pBt->nPage = 0;
3416       }
3417       rc = newDatabase(pBt);
3418       pBt->nPage = get4byte(28 + pBt->pPage1->aData);
3419 
3420       /* The database size was written into the offset 28 of the header
3421       ** when the transaction started, so we know that the value at offset
3422       ** 28 is nonzero. */
3423       assert( pBt->nPage>0 );
3424     }
3425     sqlite3BtreeLeave(p);
3426   }
3427   return rc;
3428 }
3429 
3430 /*
3431 ** Create a new cursor for the BTree whose root is on the page
3432 ** iTable. If a read-only cursor is requested, it is assumed that
3433 ** the caller already has at least a read-only transaction open
** on the database. If a write-cursor is requested, then
3435 ** the caller is assumed to have an open write transaction.
3436 **
3437 ** If wrFlag==0, then the cursor can only be used for reading.
3438 ** If wrFlag==1, then the cursor can be used for reading or for
3439 ** writing if other conditions for writing are also met.  These
3440 ** are the conditions that must be met in order for writing to
3441 ** be allowed:
3442 **
3443 ** 1:  The cursor must have been opened with wrFlag==1
3444 **
3445 ** 2:  Other database connections that share the same pager cache
3446 **     but which are not in the READ_UNCOMMITTED state may not have
3447 **     cursors open with wrFlag==0 on the same table.  Otherwise
3448 **     the changes made by this write cursor would be visible to
3449 **     the read cursors in the other database connection.
3450 **
3451 ** 3:  The database must be writable (not on read-only media)
3452 **
3453 ** 4:  There must be an active transaction.
3454 **
3455 ** No checking is done to make sure that page iTable really is the
3456 ** root page of a b-tree.  If it is not, then the cursor acquired
3457 ** will not work correctly.
3458 **
3459 ** It is assumed that the sqlite3BtreeCursorZero() has been called
3460 ** on pCur to initialize the memory space prior to invoking this routine.
3461 */
3462 static int btreeCursor(
3463   Btree *p,                              /* The btree */
3464   int iTable,                            /* Root page of table to open */
3465   int wrFlag,                            /* 1 to write. 0 read-only */
3466   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
3467   BtCursor *pCur                         /* Space for new cursor */
3468 ){
3469   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
3470 
3471   assert( sqlite3BtreeHoldsMutex(p) );
3472   assert( wrFlag==0 || wrFlag==1 );
3473 
3474   /* The following assert statements verify that if this is a sharable
3475   ** b-tree database, the connection is holding the required table locks,
3476   ** and that no other connection has any open cursor that conflicts with
3477   ** this lock.  */
3478   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );
3479   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
3480 
3481   /* Assert that the caller has opened the required transaction. */
3482   assert( p->inTrans>TRANS_NONE );
3483   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
3484   assert( pBt->pPage1 && pBt->pPage1->aData );
3485 
3486   if( NEVER(wrFlag && (pBt->btsFlags & BTS_READ_ONLY)!=0) ){
3487     return SQLITE_READONLY;
3488   }
3489   if( iTable==1 && btreePagecount(pBt)==0 ){
3490     assert( wrFlag==0 );
3491     iTable = 0;
3492   }
3493 
3494   /* Now that no other errors can occur, finish filling in the BtCursor
3495   ** variables and link the cursor into the BtShared list.  */
3496   pCur->pgnoRoot = (Pgno)iTable;
3497   pCur->iPage = -1;
3498   pCur->pKeyInfo = pKeyInfo;
3499   pCur->pBtree = p;
3500   pCur->pBt = pBt;
3501   pCur->wrFlag = (u8)wrFlag;
3502   pCur->pNext = pBt->pCursor;
3503   if( pCur->pNext ){
3504     pCur->pNext->pPrev = pCur;
3505   }
3506   pBt->pCursor = pCur;
3507   pCur->eState = CURSOR_INVALID;
3508   pCur->cachedRowid = 0;
3509   return SQLITE_OK;
3510 }
3511 int sqlite3BtreeCursor(
3512   Btree *p,                                   /* The btree */
3513   int iTable,                                 /* Root page of table to open */
3514   int wrFlag,                                 /* 1 to write. 0 read-only */
3515   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
3516   BtCursor *pCur                              /* Write new cursor here */
3517 ){
3518   int rc;
3519   sqlite3BtreeEnter(p);
3520   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
3521   sqlite3BtreeLeave(p);
3522   return rc;
3523 }
3524 
3525 /*
3526 ** Return the size of a BtCursor object in bytes.
3527 **
** This interface is needed so that users of cursors can preallocate
3529 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
3530 ** to users so they cannot do the sizeof() themselves - they must call
3531 ** this routine.
3532 */
3533 int sqlite3BtreeCursorSize(void){
3534   return ROUND8(sizeof(BtCursor));
3535 }
3536 
3537 /*
3538 ** Initialize memory that will be converted into a BtCursor object.
3539 **
3540 ** The simple approach here would be to memset() the entire object
3541 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
3542 ** do not need to be zeroed and they are large, so we can save a lot
3543 ** of run-time by skipping the initialization of those elements.
3544 */
3545 void sqlite3BtreeCursorZero(BtCursor *p){
3546   memset(p, 0, offsetof(BtCursor, iPage));
3547 }
3548 
3549 /*
3550 ** Set the cached rowid value of every cursor in the same database file
3551 ** as pCur and having the same root page number as pCur.  The value is
3552 ** set to iRowid.
3553 **
3554 ** Only positive rowid values are considered valid for this cache.
3555 ** The cache is initialized to zero, indicating an invalid cache.
3556 ** A btree will work fine with zero or negative rowids.  We just cannot
3557 ** cache zero or negative rowids, which means tables that use zero or
3558 ** negative rowids might run a little slower.  But in practice, zero
3559 ** or negative rowids are very uncommon so this should not be a problem.
3560 */
3561 void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){
3562   BtCursor *p;
3563   for(p=pCur->pBt->pCursor; p; p=p->pNext){
3564     if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid;
3565   }
3566   assert( pCur->cachedRowid==iRowid );
3567 }
3568 
3569 /*
3570 ** Return the cached rowid for the given cursor.  A negative or zero
3571 ** return value indicates that the rowid cache is invalid and should be
3572 ** ignored.  If the rowid cache has never before been set, then a
3573 ** zero is returned.
3574 */
3575 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){
3576   return pCur->cachedRowid;
3577 }
3578 
3579 /*
3580 ** Close a cursor.  The read lock on the database file is released
3581 ** when the last cursor is closed.
3582 */
3583 int sqlite3BtreeCloseCursor(BtCursor *pCur){
3584   Btree *pBtree = pCur->pBtree;
3585   if( pBtree ){
3586     int i;
3587     BtShared *pBt = pCur->pBt;
3588     sqlite3BtreeEnter(pBtree);
3589     sqlite3BtreeClearCursor(pCur);
3590     if( pCur->pPrev ){
3591       pCur->pPrev->pNext = pCur->pNext;
3592     }else{
3593       pBt->pCursor = pCur->pNext;
3594     }
3595     if( pCur->pNext ){
3596       pCur->pNext->pPrev = pCur->pPrev;
3597     }
3598     for(i=0; i<=pCur->iPage; i++){
3599       releasePage(pCur->apPage[i]);
3600     }
3601     unlockBtreeIfUnused(pBt);
3602     invalidateOverflowCache(pCur);
3603     /* sqlite3_free(pCur); */
3604     sqlite3BtreeLeave(pBtree);
3605   }
3606   return SQLITE_OK;
3607 }
3608 
3609 /*
3610 ** Make sure the BtCursor* given in the argument has a valid
3611 ** BtCursor.info structure.  If it is not already valid, call
3612 ** btreeParseCell() to fill it in.
3613 **
3614 ** BtCursor.info is a cache of the information in the current cell.
3615 ** Using this cache reduces the number of calls to btreeParseCell().
3616 **
** 2007-06-25:  There is a bug in some versions of MSVC that causes the
** compiler to crash when getCellInfo() is implemented as a macro.
** But there is a measurable speed advantage to using the macro on gcc
** (when fewer compiler optimizations like -Os or -O0 are used and the
** compiler is not doing aggressive inlining.)  So we use a real function
** for MSVC and a macro for everything else.  Ticket #2457.
3623 */
3624 #ifndef NDEBUG
3625   static void assertCellInfo(BtCursor *pCur){
3626     CellInfo info;
3627     int iPage = pCur->iPage;
3628     memset(&info, 0, sizeof(info));
3629     btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
3630     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
3631   }
3632 #else
3633   #define assertCellInfo(x)
3634 #endif
3635 #ifdef _MSC_VER
3636   /* Use a real function in MSVC to work around bugs in that compiler. */
3637   static void getCellInfo(BtCursor *pCur){
3638     if( pCur->info.nSize==0 ){
3639       int iPage = pCur->iPage;
3640       btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
3641       pCur->validNKey = 1;
3642     }else{
3643       assertCellInfo(pCur);
3644     }
3645   }
3646 #else /* if not _MSC_VER */
3647   /* Use a macro in all other compilers so that the function is inlined */
3648 #define getCellInfo(pCur)                                                      \
3649   if( pCur->info.nSize==0 ){                                                   \
3650     int iPage = pCur->iPage;                                                   \
3651     btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
3652     pCur->validNKey = 1;                                                       \
3653   }else{                                                                       \
3654     assertCellInfo(pCur);                                                      \
3655   }
3656 #endif /* _MSC_VER */
3657 
3658 #ifndef NDEBUG  /* The next routine used only within assert() statements */
3659 /*
3660 ** Return true if the given BtCursor is valid.  A valid cursor is one
3661 ** that is currently pointing to a row in a (non-empty) table.
** This verification routine is used only within assert() statements.
3663 */
3664 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
3665   return pCur && pCur->eState==CURSOR_VALID;
3666 }
3667 #endif /* NDEBUG */
3668 
3669 /*
3670 ** Set *pSize to the size of the buffer needed to hold the value of
3671 ** the key for the current entry.  If the cursor is not pointing
3672 ** to a valid entry, *pSize is set to 0.
3673 **
3674 ** For a table with the INTKEY flag set, this routine returns the key
3675 ** itself, not the number of bytes in the key.
3676 **
3677 ** The caller must position the cursor prior to invoking this routine.
3678 **
3679 ** This routine cannot fail.  It always returns SQLITE_OK.
3680 */
3681 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
3682   assert( cursorHoldsMutex(pCur) );
3683   assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3684   if( pCur->eState!=CURSOR_VALID ){
3685     *pSize = 0;
3686   }else{
3687     getCellInfo(pCur);
3688     *pSize = pCur->info.nKey;
3689   }
3690   return SQLITE_OK;
3691 }
3692 
3693 /*
3694 ** Set *pSize to the number of bytes of data in the entry the
3695 ** cursor currently points to.
3696 **
3697 ** The caller must guarantee that the cursor is pointing to a non-NULL
3698 ** valid entry.  In other words, the calling procedure must guarantee
3699 ** that the cursor has Cursor.eState==CURSOR_VALID.
3700 **
3701 ** Failure is not possible.  This function always returns SQLITE_OK.
3702 ** It might just as well be a procedure (returning void) but we continue
3703 ** to return an integer result code for historical reasons.
3704 */
3705 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
3706   assert( cursorHoldsMutex(pCur) );
3707   assert( pCur->eState==CURSOR_VALID );
3708   getCellInfo(pCur);
3709   *pSize = pCur->info.nData;
3710   return SQLITE_OK;
3711 }
3712 
3713 /*
3714 ** Given the page number of an overflow page in the database (parameter
3715 ** ovfl), this function finds the page number of the next page in the
3716 ** linked list of overflow pages. If possible, it uses the auto-vacuum
3717 ** pointer-map data instead of reading the content of page ovfl to do so.
3718 **
3719 ** If an error occurs an SQLite error code is returned. Otherwise:
3720 **
3721 ** The page number of the next overflow page in the linked list is
3722 ** written to *pPgnoNext. If page ovfl is the last page in its linked
3723 ** list, *pPgnoNext is set to zero.
3724 **
** If ppPage is not NULL, and a reference to the MemPage object corresponding
** to page number ovfl was obtained, then *ppPage is set to point to that
** reference. It is the responsibility of the caller to call releasePage()
** on *ppPage to free the reference. If no reference was obtained (because
** the pointer-map was used to obtain the value for *pPgnoNext), then
** *ppPage is set to zero.
3731 */
3732 static int getOverflowPage(
3733   BtShared *pBt,               /* The database file */
3734   Pgno ovfl,                   /* Current overflow page number */
3735   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
3736   Pgno *pPgnoNext              /* OUT: Next overflow page number */
3737 ){
3738   Pgno next = 0;
3739   MemPage *pPage = 0;
3740   int rc = SQLITE_OK;
3741 
3742   assert( sqlite3_mutex_held(pBt->mutex) );
3743   assert(pPgnoNext);
3744 
3745 #ifndef SQLITE_OMIT_AUTOVACUUM
3746   /* Try to find the next page in the overflow list using the
3747   ** autovacuum pointer-map pages. Guess that the next page in
3748   ** the overflow list is page number (ovfl+1). If that guess turns
3749   ** out to be wrong, fall back to loading the data of page
3750   ** number ovfl to determine the next page number.
3751   */
3752   if( pBt->autoVacuum ){
3753     Pgno pgno;
3754     Pgno iGuess = ovfl+1;
3755     u8 eType;
3756 
3757     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3758       iGuess++;
3759     }
3760 
3761     if( iGuess<=btreePagecount(pBt) ){
3762       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3763       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3764         next = iGuess;
3765         rc = SQLITE_DONE;
3766       }
3767     }
3768   }
3769 #endif
3770 
3771   assert( next==0 || rc==SQLITE_DONE );
3772   if( rc==SQLITE_OK ){
3773     rc = btreeGetPage(pBt, ovfl, &pPage, 0);
3774     assert( rc==SQLITE_OK || pPage==0 );
3775     if( rc==SQLITE_OK ){
3776       next = get4byte(pPage->aData);
3777     }
3778   }
3779 
3780   *pPgnoNext = next;
3781   if( ppPage ){
3782     *ppPage = pPage;
3783   }else{
3784     releasePage(pPage);
3785   }
3786   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
3787 }
3788 
3789 /*
3790 ** Copy data from a buffer to a page, or from a page to a buffer.
3791 **
3792 ** pPayload is a pointer to data stored on database page pDbPage.
3793 ** If argument eOp is false, then nByte bytes of data are copied
3794 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3795 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3796 ** of data are copied from the buffer pBuf to pPayload.
3797 **
3798 ** SQLITE_OK is returned on success, otherwise an error code.
3799 */
3800 static int copyPayload(
3801   void *pPayload,           /* Pointer to page data */
3802   void *pBuf,               /* Pointer to buffer */
3803   int nByte,                /* Number of bytes to copy */
3804   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
3805   DbPage *pDbPage           /* Page containing pPayload */
3806 ){
3807   if( eOp ){
3808     /* Copy data from buffer to page (a write operation) */
3809     int rc = sqlite3PagerWrite(pDbPage);
3810     if( rc!=SQLITE_OK ){
3811       return rc;
3812     }
3813     memcpy(pPayload, pBuf, nByte);
3814   }else{
3815     /* Copy data from page to buffer (a read operation) */
3816     memcpy(pBuf, pPayload, nByte);
3817   }
3818   return SQLITE_OK;
3819 }
3820 
3821 /*
3822 ** This function is used to read or overwrite payload information
3823 ** for the entry that the pCur cursor is pointing to. If the eOp
3824 ** parameter is 0, this is a read operation (data copied into
3825 ** buffer pBuf). If it is non-zero, a write (data copied from
3826 ** buffer pBuf).
3827 **
3828 ** A total of "amt" bytes are read or written beginning at "offset".
3829 ** Data is read to or from the buffer pBuf.
3830 **
3831 ** The content being read or written might appear on the main page
3832 ** or be scattered out on multiple overflow pages.
3833 **
3834 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3835 ** cursor entry uses one or more overflow pages, this function
** allocates space for and lazily populates the overflow page-list
3837 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3838 ** cache to make seeking to the supplied offset more efficient.
3839 **
3840 ** Once an overflow page-list cache has been allocated, it may be
3841 ** invalidated if some other cursor writes to the same table, or if
3842 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3843 ** mode, the following events may invalidate an overflow page-list cache.
3844 **
3845 **   * An incremental vacuum,
3846 **   * A commit in auto_vacuum="full" mode,
3847 **   * Creating a table (may require moving an overflow page).
3848 */
3849 static int accessPayload(
3850   BtCursor *pCur,      /* Cursor pointing to entry to read from */
3851   u32 offset,          /* Begin reading this far into payload */
3852   u32 amt,             /* Read this many bytes */
3853   unsigned char *pBuf, /* Write the bytes into this buffer */
3854   int eOp              /* zero to read. non-zero to write. */
3855 ){
3856   unsigned char *aPayload;
3857   int rc = SQLITE_OK;
3858   u32 nKey;
3859   int iIdx = 0;
3860   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3861   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
3862 
3863   assert( pPage );
3864   assert( pCur->eState==CURSOR_VALID );
3865   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3866   assert( cursorHoldsMutex(pCur) );
3867 
3868   getCellInfo(pCur);
3869   aPayload = pCur->info.pCell + pCur->info.nHeader;
3870   nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey);
3871 
3872   if( NEVER(offset+amt > nKey+pCur->info.nData)
3873    || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
3874   ){
3875     /* Trying to read or write past the end of the data is an error */
3876     return SQLITE_CORRUPT_BKPT;
3877   }
3878 
3879   /* Check if data must be read/written to/from the btree page itself. */
3880   if( offset<pCur->info.nLocal ){
3881     int a = amt;
3882     if( a+offset>pCur->info.nLocal ){
3883       a = pCur->info.nLocal - offset;
3884     }
3885     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3886     offset = 0;
3887     pBuf += a;
3888     amt -= a;
3889   }else{
3890     offset -= pCur->info.nLocal;
3891   }
3892 
3893   if( rc==SQLITE_OK && amt>0 ){
3894     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
3895     Pgno nextPage;
3896 
3897     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3898 
3899 #ifndef SQLITE_OMIT_INCRBLOB
3900     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3901     ** has not been allocated, allocate it now. The array is sized at
3902     ** one entry for each overflow page in the overflow chain. The
3903     ** page number of the first overflow page is stored in aOverflow[0],
3904     ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3905     ** (the cache is lazily populated).
3906     */
3907     if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3908       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3909       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3910       /* nOvfl is always positive.  If it were zero, fetchPayload would have
3911       ** been used instead of this routine. */
3912       if( ALWAYS(nOvfl) && !pCur->aOverflow ){
3913         rc = SQLITE_NOMEM;
3914       }
3915     }
3916 
3917     /* If the overflow page-list cache has been allocated and the
3918     ** entry for the first required overflow page is valid, skip
3919     ** directly to it.
3920     */
3921     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3922       iIdx = (offset/ovflSize);
3923       nextPage = pCur->aOverflow[iIdx];
3924       offset = (offset%ovflSize);
3925     }
3926 #endif
3927 
3928     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3929 
3930 #ifndef SQLITE_OMIT_INCRBLOB
3931       /* If required, populate the overflow page-list cache. */
3932       if( pCur->aOverflow ){
3933         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3934         pCur->aOverflow[iIdx] = nextPage;
3935       }
3936 #endif
3937 
3938       if( offset>=ovflSize ){
3939         /* The only reason to read this page is to obtain the page
3940         ** number for the next page in the overflow chain. The page
3941         ** data is not required. So first try to lookup the overflow
3942         ** page-list cache, if any, then fall back to the getOverflowPage()
3943         ** function.
3944         */
3945 #ifndef SQLITE_OMIT_INCRBLOB
3946         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3947           nextPage = pCur->aOverflow[iIdx+1];
3948         } else
3949 #endif
3950           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3951         offset -= ovflSize;
3952       }else{
3953         /* Need to read this page properly. It contains some of the
3954         ** range of data that is being read (eOp==0) or written (eOp!=0).
3955         */
3956 #ifdef SQLITE_DIRECT_OVERFLOW_READ
3957         sqlite3_file *fd;
3958 #endif
3959         int a = amt;
3960         if( a + offset > ovflSize ){
3961           a = ovflSize - offset;
3962         }
3963 
3964 #ifdef SQLITE_DIRECT_OVERFLOW_READ
3965         /* If all the following are true:
3966         **
3967         **   1) this is a read operation, and
3968         **   2) data is required from the start of this overflow page, and
3969         **   3) the database is file-backed, and
3970         **   4) there is no open write-transaction, and
3971         **   5) the database is not a WAL database,
3972         **
3973         ** then data can be read directly from the database file into the
3974         ** output buffer, bypassing the page-cache altogether. This speeds
3975         ** up loading large records that span many overflow pages.
3976         */
3977         if( eOp==0                                             /* (1) */
3978          && offset==0                                          /* (2) */
3979          && pBt->inTransaction==TRANS_READ                     /* (4) */
3980          && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (3) */
3981          && pBt->pPage1->aData[19]==0x01                       /* (5) */
3982         ){
3983           u8 aSave[4];
3984           u8 *aWrite = &pBuf[-4];
3985           memcpy(aSave, aWrite, 4);
3986           rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
3987           nextPage = get4byte(aWrite);
3988           memcpy(aWrite, aSave, 4);
3989         }else
3990 #endif
3991 
3992         {
3993           DbPage *pDbPage;
3994           rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3995           if( rc==SQLITE_OK ){
3996             aPayload = sqlite3PagerGetData(pDbPage);
3997             nextPage = get4byte(aPayload);
3998             rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3999             sqlite3PagerUnref(pDbPage);
4000             offset = 0;
4001           }
4002         }
4003         amt -= a;
4004         pBuf += a;
4005       }
4006     }
4007   }
4008 
4009   if( rc==SQLITE_OK && amt>0 ){
4010     return SQLITE_CORRUPT_BKPT;
4011   }
4012   return rc;
4013 }
4014 
4015 /*
4016 ** Read part of the key associated with cursor pCur.  Exactly
** "amt" bytes will be transferred into pBuf[].  The transfer
4018 ** begins at "offset".
4019 **
4020 ** The caller must ensure that pCur is pointing to a valid row
4021 ** in the table.
4022 **
4023 ** Return SQLITE_OK on success or an error code if anything goes
4024 ** wrong.  An error is returned if "offset+amt" is larger than
4025 ** the available payload.
4026 */
4027 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4028   assert( cursorHoldsMutex(pCur) );
4029   assert( pCur->eState==CURSOR_VALID );
4030   assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4031   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4032   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
4033 }
4034 
4035 /*
4036 ** Read part of the data associated with cursor pCur.  Exactly
** "amt" bytes will be transferred into pBuf[].  The transfer
4038 ** begins at "offset".
4039 **
4040 ** Return SQLITE_OK on success or an error code if anything goes
4041 ** wrong.  An error is returned if "offset+amt" is larger than
4042 ** the available payload.
4043 */
4044 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4045   int rc;
4046 
4047 #ifndef SQLITE_OMIT_INCRBLOB
4048   if ( pCur->eState==CURSOR_INVALID ){
4049     return SQLITE_ABORT;
4050   }
4051 #endif
4052 
4053   assert( cursorHoldsMutex(pCur) );
4054   rc = restoreCursorPosition(pCur);
4055   if( rc==SQLITE_OK ){
4056     assert( pCur->eState==CURSOR_VALID );
4057     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4058     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4059     rc = accessPayload(pCur, offset, amt, pBuf, 0);
4060   }
4061   return rc;
4062 }
4063 
4064 /*
4065 ** Return a pointer to payload information from the entry that the
4066 ** pCur cursor is pointing to.  The pointer is to the beginning of
4067 ** the key if skipKey==0 and it points to the beginning of data if
4068 ** skipKey==1.  The number of bytes of available key/data is written
4069 ** into *pAmt.  If *pAmt==0, then the value returned will not be
4070 ** a valid pointer.
4071 **
4072 ** This routine is an optimization.  It is common for the entire key
4073 ** and data to fit on the local page and for there to be no overflow
4074 ** pages.  When that is so, this routine can be used to access the
4075 ** key and data without making a copy.  If the key and/or data spills
4076 ** onto overflow pages, then accessPayload() must be used to reassemble
4077 ** the key/data and copy it into a preallocated buffer.
4078 **
4079 ** The pointer returned by this routine looks directly into the cached
4080 ** page of the database.  The data might change or move the next time
4081 ** any btree routine is called.
4082 */
4083 static const unsigned char *fetchPayload(
4084   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4085   int *pAmt,           /* Write the number of available bytes here */
4086   int skipKey          /* read beginning at data if this is true */
4087 ){
4088   unsigned char *aPayload;
4089   MemPage *pPage;
4090   u32 nKey;
4091   u32 nLocal;
4092 
4093   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
4094   assert( pCur->eState==CURSOR_VALID );
4095   assert( cursorHoldsMutex(pCur) );
4096   pPage = pCur->apPage[pCur->iPage];
4097   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4098   if( NEVER(pCur->info.nSize==0) ){
4099     btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage],
4100                    &pCur->info);
4101   }
4102   aPayload = pCur->info.pCell;
4103   aPayload += pCur->info.nHeader;
4104   if( pPage->intKey ){
4105     nKey = 0;
4106   }else{
4107     nKey = (int)pCur->info.nKey;
4108   }
4109   if( skipKey ){
4110     aPayload += nKey;
4111     nLocal = pCur->info.nLocal - nKey;
4112   }else{
4113     nLocal = pCur->info.nLocal;
4114     assert( nLocal<=nKey );
4115   }
4116   *pAmt = nLocal;
4117   return aPayload;
4118 }
4119 
4120 
4121 /*
** For the entry that cursor pCur is pointing to, return as
4123 ** many bytes of the key or data as are available on the local
4124 ** b-tree page.  Write the number of available bytes into *pAmt.
4125 **
4126 ** The pointer returned is ephemeral.  The key/data may move
4127 ** or be destroyed on the next call to any Btree routine,
4128 ** including calls from other threads against the same cache.
4129 ** Hence, a mutex on the BtShared should be held prior to calling
4130 ** this routine.
4131 **
** These routines are used to get quick access to key and data
4133 ** in the common case where no overflow pages are used.
4134 */
4135 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
4136   const void *p = 0;
4137   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4138   assert( cursorHoldsMutex(pCur) );
4139   if( ALWAYS(pCur->eState==CURSOR_VALID) ){
4140     p = (const void*)fetchPayload(pCur, pAmt, 0);
4141   }
4142   return p;
4143 }
4144 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
4145   const void *p = 0;
4146   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4147   assert( cursorHoldsMutex(pCur) );
4148   if( ALWAYS(pCur->eState==CURSOR_VALID) ){
4149     p = (const void*)fetchPayload(pCur, pAmt, 1);
4150   }
4151   return p;
4152 }
4153 
4154 
4155 /*
4156 ** Move the cursor down to a new child page.  The newPgno argument is the
4157 ** page number of the child page to move to.
4158 **
4159 ** This function returns SQLITE_CORRUPT if the page-header flags field of
4160 ** the new child page does not match the flags field of the parent (i.e.
4161 ** if an intkey page appears to be the parent of a non-intkey page, or
4162 ** vice-versa).
4163 */
4164 static int moveToChild(BtCursor *pCur, u32 newPgno){
4165   int rc;
4166   int i = pCur->iPage;
4167   MemPage *pNewPage;
4168   BtShared *pBt = pCur->pBt;
4169 
4170   assert( cursorHoldsMutex(pCur) );
4171   assert( pCur->eState==CURSOR_VALID );
4172   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4173   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
4174     return SQLITE_CORRUPT_BKPT;
4175   }
4176   rc = getAndInitPage(pBt, newPgno, &pNewPage);
4177   if( rc ) return rc;
4178   pCur->apPage[i+1] = pNewPage;
4179   pCur->aiIdx[i+1] = 0;
4180   pCur->iPage++;
4181 
4182   pCur->info.nSize = 0;
4183   pCur->validNKey = 0;
4184   if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){
4185     return SQLITE_CORRUPT_BKPT;
4186   }
4187   return SQLITE_OK;
4188 }
4189 
#if 0
/*
** pParent is an internal (non-leaf) tree page.  Assert that page number
** iChild is the left-child of the iIdx'th cell on pParent or, when iIdx
** equals the number of cells on pParent, that iChild is the right-child
** of pParent.
**
** This routine is currently compiled out; the empty macro in the #else
** branch stands in for it.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  assert( iIdx<=pParent->nCell );
  /* The child pointer is read either from the right-child slot of the
  ** page header (offset 8) or from the first four bytes of cell iIdx. */
  assert( iChild==get4byte( iIdx==pParent->nCell
                              ? &pParent->aData[pParent->hdrOffset+8]
                              : findCell(pParent, iIdx) ) );
}
#else
#  define assertParentIndex(x,y,z)
#endif
4209 
4210 /*
4211 ** Move the cursor up to the parent page.
4212 **
4213 ** pCur->idx is set to the cell index that contains the pointer
4214 ** to the page we are coming from.  If we are coming from the
4215 ** right-most child page then pCur->idx is set to one more than
4216 ** the largest cell index.
4217 */
4218 static void moveToParent(BtCursor *pCur){
4219   assert( cursorHoldsMutex(pCur) );
4220   assert( pCur->eState==CURSOR_VALID );
4221   assert( pCur->iPage>0 );
4222   assert( pCur->apPage[pCur->iPage] );
4223 
4224   /* UPDATE: It is actually possible for the condition tested by the assert
4225   ** below to be untrue if the database file is corrupt. This can occur if
4226   ** one cursor has modified page pParent while a reference to it is held
4227   ** by a second cursor. Which can only happen if a single page is linked
4228   ** into more than one b-tree structure in a corrupt database.  */
4229 #if 0
4230   assertParentIndex(
4231     pCur->apPage[pCur->iPage-1],
4232     pCur->aiIdx[pCur->iPage-1],
4233     pCur->apPage[pCur->iPage]->pgno
4234   );
4235 #endif
4236   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
4237 
4238   releasePage(pCur->apPage[pCur->iPage]);
4239   pCur->iPage--;
4240   pCur->info.nSize = 0;
4241   pCur->validNKey = 0;
4242 }
4243 
4244 /*
4245 ** Move the cursor to point to the root page of its b-tree structure.
4246 **
4247 ** If the table has a virtual root page, then the cursor is moved to point
4248 ** to the virtual root page instead of the actual root page. A table has a
4249 ** virtual root page when the actual root page contains no cells and a
4250 ** single child page. This can only happen with the table rooted at page 1.
4251 **
4252 ** If the b-tree structure is empty, the cursor state is set to
4253 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
4254 ** cell located on the root (or virtual root) page and the cursor state
4255 ** is set to CURSOR_VALID.
4256 **
4257 ** If this function returns successfully, it may be assumed that the
4258 ** page-header flags indicate that the [virtual] root-page is the expected
4259 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
4260 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4261 ** indicating a table b-tree, or if the caller did specify a KeyInfo
4262 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4263 ** b-tree).
4264 */
4265 static int moveToRoot(BtCursor *pCur){
4266   MemPage *pRoot;
4267   int rc = SQLITE_OK;
4268   Btree *p = pCur->pBtree;
4269   BtShared *pBt = p->pBt;
4270 
4271   assert( cursorHoldsMutex(pCur) );
4272   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4273   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
4274   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
4275   if( pCur->eState>=CURSOR_REQUIRESEEK ){
4276     if( pCur->eState==CURSOR_FAULT ){
4277       assert( pCur->skipNext!=SQLITE_OK );
4278       return pCur->skipNext;
4279     }
4280     sqlite3BtreeClearCursor(pCur);
4281   }
4282 
4283   if( pCur->iPage>=0 ){
4284     int i;
4285     for(i=1; i<=pCur->iPage; i++){
4286       releasePage(pCur->apPage[i]);
4287     }
4288     pCur->iPage = 0;
4289   }else if( pCur->pgnoRoot==0 ){
4290     pCur->eState = CURSOR_INVALID;
4291     return SQLITE_OK;
4292   }else{
4293     rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
4294     if( rc!=SQLITE_OK ){
4295       pCur->eState = CURSOR_INVALID;
4296       return rc;
4297     }
4298     pCur->iPage = 0;
4299 
4300     /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4301     ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4302     ** NULL, the caller expects a table b-tree. If this is not the case,
4303     ** return an SQLITE_CORRUPT error.  */
4304     assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 );
4305     if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){
4306       return SQLITE_CORRUPT_BKPT;
4307     }
4308   }
4309 
4310   /* Assert that the root page is of the correct type. This must be the
4311   ** case as the call to this function that loaded the root-page (either
4312   ** this call or a previous invocation) would have detected corruption
4313   ** if the assumption were not true, and it is not possible for the flags
4314   ** byte to have been modified while this cursor is holding a reference
4315   ** to the page.  */
4316   pRoot = pCur->apPage[0];
4317   assert( pRoot->pgno==pCur->pgnoRoot );
4318   assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey );
4319 
4320   pCur->aiIdx[0] = 0;
4321   pCur->info.nSize = 0;
4322   pCur->atLast = 0;
4323   pCur->validNKey = 0;
4324 
4325   if( pRoot->nCell==0 && !pRoot->leaf ){
4326     Pgno subpage;
4327     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
4328     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
4329     pCur->eState = CURSOR_VALID;
4330     rc = moveToChild(pCur, subpage);
4331   }else{
4332     pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
4333   }
4334   return rc;
4335 }
4336 
4337 /*
4338 ** Move the cursor down to the left-most leaf entry beneath the
4339 ** entry to which it is currently pointing.
4340 **
4341 ** The left-most leaf is the one with the smallest key - the first
4342 ** in ascending order.
4343 */
4344 static int moveToLeftmost(BtCursor *pCur){
4345   Pgno pgno;
4346   int rc = SQLITE_OK;
4347   MemPage *pPage;
4348 
4349   assert( cursorHoldsMutex(pCur) );
4350   assert( pCur->eState==CURSOR_VALID );
4351   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4352     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4353     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
4354     rc = moveToChild(pCur, pgno);
4355   }
4356   return rc;
4357 }
4358 
4359 /*
4360 ** Move the cursor down to the right-most leaf entry beneath the
4361 ** page to which it is currently pointing.  Notice the difference
4362 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
4363 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4364 ** finds the right-most entry beneath the *page*.
4365 **
4366 ** The right-most entry is the one with the largest key - the last
4367 ** key in ascending order.
4368 */
4369 static int moveToRightmost(BtCursor *pCur){
4370   Pgno pgno;
4371   int rc = SQLITE_OK;
4372   MemPage *pPage = 0;
4373 
4374   assert( cursorHoldsMutex(pCur) );
4375   assert( pCur->eState==CURSOR_VALID );
4376   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4377     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4378     pCur->aiIdx[pCur->iPage] = pPage->nCell;
4379     rc = moveToChild(pCur, pgno);
4380   }
4381   if( rc==SQLITE_OK ){
4382     pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
4383     pCur->info.nSize = 0;
4384     pCur->validNKey = 0;
4385   }
4386   return rc;
4387 }
4388 
4389 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
4390 ** on success.  Set *pRes to 0 if the cursor actually points to something
4391 ** or set *pRes to 1 if the table is empty.
4392 */
4393 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
4394   int rc;
4395 
4396   assert( cursorHoldsMutex(pCur) );
4397   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4398   rc = moveToRoot(pCur);
4399   if( rc==SQLITE_OK ){
4400     if( pCur->eState==CURSOR_INVALID ){
4401       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4402       *pRes = 1;
4403     }else{
4404       assert( pCur->apPage[pCur->iPage]->nCell>0 );
4405       *pRes = 0;
4406       rc = moveToLeftmost(pCur);
4407     }
4408   }
4409   return rc;
4410 }
4411 
4412 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
4413 ** on success.  Set *pRes to 0 if the cursor actually points to something
4414 ** or set *pRes to 1 if the table is empty.
4415 */
4416 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
4417   int rc;
4418 
4419   assert( cursorHoldsMutex(pCur) );
4420   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4421 
4422   /* If the cursor already points to the last entry, this is a no-op. */
4423   if( CURSOR_VALID==pCur->eState && pCur->atLast ){
4424 #ifdef SQLITE_DEBUG
4425     /* This block serves to assert() that the cursor really does point
4426     ** to the last entry in the b-tree. */
4427     int ii;
4428     for(ii=0; ii<pCur->iPage; ii++){
4429       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
4430     }
4431     assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
4432     assert( pCur->apPage[pCur->iPage]->leaf );
4433 #endif
4434     return SQLITE_OK;
4435   }
4436 
4437   rc = moveToRoot(pCur);
4438   if( rc==SQLITE_OK ){
4439     if( CURSOR_INVALID==pCur->eState ){
4440       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4441       *pRes = 1;
4442     }else{
4443       assert( pCur->eState==CURSOR_VALID );
4444       *pRes = 0;
4445       rc = moveToRightmost(pCur);
4446       pCur->atLast = rc==SQLITE_OK ?1:0;
4447     }
4448   }
4449   return rc;
4450 }
4451 
4452 /* Move the cursor so that it points to an entry near the key
4453 ** specified by pIdxKey or intKey.   Return a success code.
4454 **
4455 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
4456 ** must be NULL.  For index tables, pIdxKey is used and intKey
4457 ** is ignored.
4458 **
4459 ** If an exact match is not found, then the cursor is always
4460 ** left pointing at a leaf page which would hold the entry if it
4461 ** were present.  The cursor might point to an entry that comes
4462 ** before or after the key.
4463 **
4464 ** An integer is written into *pRes which is the result of
4465 ** comparing the key with the entry to which the cursor is
4466 ** pointing.  The meaning of the integer written into
4467 ** *pRes is as follows:
4468 **
4469 **     *pRes<0      The cursor is left pointing at an entry that
4470 **                  is smaller than intKey/pIdxKey or if the table is empty
4471 **                  and the cursor is therefore left point to nothing.
4472 **
4473 **     *pRes==0     The cursor is left pointing at an entry that
4474 **                  exactly matches intKey/pIdxKey.
4475 **
4476 **     *pRes>0      The cursor is left pointing at an entry that
4477 **                  is larger than intKey/pIdxKey.
4478 **
4479 */
4480 int sqlite3BtreeMovetoUnpacked(
4481   BtCursor *pCur,          /* The cursor to be moved */
4482   UnpackedRecord *pIdxKey, /* Unpacked index key */
4483   i64 intKey,              /* The table key */
4484   int biasRight,           /* If true, bias the search to the high end */
4485   int *pRes                /* Write search results here */
4486 ){
4487   int rc;
4488 
4489   assert( cursorHoldsMutex(pCur) );
4490   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4491   assert( pRes );
4492   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
4493 
4494   /* If the cursor is already positioned at the point we are trying
4495   ** to move to, then just return without doing any work */
4496   if( pCur->eState==CURSOR_VALID && pCur->validNKey
4497    && pCur->apPage[0]->intKey
4498   ){
4499     if( pCur->info.nKey==intKey ){
4500       *pRes = 0;
4501       return SQLITE_OK;
4502     }
4503     if( pCur->atLast && pCur->info.nKey<intKey ){
4504       *pRes = -1;
4505       return SQLITE_OK;
4506     }
4507   }
4508 
4509   rc = moveToRoot(pCur);
4510   if( rc ){
4511     return rc;
4512   }
4513   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage] );
4514   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->isInit );
4515   assert( pCur->eState==CURSOR_INVALID || pCur->apPage[pCur->iPage]->nCell>0 );
4516   if( pCur->eState==CURSOR_INVALID ){
4517     *pRes = -1;
4518     assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4519     return SQLITE_OK;
4520   }
4521   assert( pCur->apPage[0]->intKey || pIdxKey );
4522   for(;;){
4523     int lwr, upr, idx;
4524     Pgno chldPg;
4525     MemPage *pPage = pCur->apPage[pCur->iPage];
4526     int c;
4527 
4528     /* pPage->nCell must be greater than zero. If this is the root-page
4529     ** the cursor would have been INVALID above and this for(;;) loop
4530     ** not run. If this is not the root-page, then the moveToChild() routine
4531     ** would have already detected db corruption. Similarly, pPage must
4532     ** be the right kind (index or table) of b-tree page. Otherwise
4533     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
4534     assert( pPage->nCell>0 );
4535     assert( pPage->intKey==(pIdxKey==0) );
4536     lwr = 0;
4537     upr = pPage->nCell-1;
4538     if( biasRight ){
4539       pCur->aiIdx[pCur->iPage] = (u16)(idx = upr);
4540     }else{
4541       pCur->aiIdx[pCur->iPage] = (u16)(idx = (upr+lwr)/2);
4542     }
4543     for(;;){
4544       u8 *pCell;                          /* Pointer to current cell in pPage */
4545 
4546       assert( idx==pCur->aiIdx[pCur->iPage] );
4547       pCur->info.nSize = 0;
4548       pCell = findCell(pPage, idx) + pPage->childPtrSize;
4549       if( pPage->intKey ){
4550         i64 nCellKey;
4551         if( pPage->hasData ){
4552           u32 dummy;
4553           pCell += getVarint32(pCell, dummy);
4554         }
4555         getVarint(pCell, (u64*)&nCellKey);
4556         if( nCellKey==intKey ){
4557           c = 0;
4558         }else if( nCellKey<intKey ){
4559           c = -1;
4560         }else{
4561           assert( nCellKey>intKey );
4562           c = +1;
4563         }
4564         pCur->validNKey = 1;
4565         pCur->info.nKey = nCellKey;
4566       }else{
4567         /* The maximum supported page-size is 65536 bytes. This means that
4568         ** the maximum number of record bytes stored on an index B-Tree
4569         ** page is less than 16384 bytes and may be stored as a 2-byte
4570         ** varint. This information is used to attempt to avoid parsing
4571         ** the entire cell by checking for the cases where the record is
4572         ** stored entirely within the b-tree page by inspecting the first
4573         ** 2 bytes of the cell.
4574         */
4575         int nCell = pCell[0];
4576         if( nCell<=pPage->max1bytePayload
4577          /* && (pCell+nCell)<pPage->aDataEnd */
4578         ){
4579           /* This branch runs if the record-size field of the cell is a
4580           ** single byte varint and the record fits entirely on the main
4581           ** b-tree page.  */
4582           testcase( pCell+nCell+1==pPage->aDataEnd );
4583           c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
4584         }else if( !(pCell[1] & 0x80)
4585           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
4586           /* && (pCell+nCell+2)<=pPage->aDataEnd */
4587         ){
4588           /* The record-size field is a 2 byte varint and the record
4589           ** fits entirely on the main b-tree page.  */
4590           testcase( pCell+nCell+2==pPage->aDataEnd );
4591           c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
4592         }else{
4593           /* The record flows over onto one or more overflow pages. In
4594           ** this case the whole cell needs to be parsed, a buffer allocated
4595           ** and accessPayload() used to retrieve the record into the
4596           ** buffer before VdbeRecordCompare() can be called. */
4597           void *pCellKey;
4598           u8 * const pCellBody = pCell - pPage->childPtrSize;
4599           btreeParseCellPtr(pPage, pCellBody, &pCur->info);
4600           nCell = (int)pCur->info.nKey;
4601           pCellKey = sqlite3Malloc( nCell );
4602           if( pCellKey==0 ){
4603             rc = SQLITE_NOMEM;
4604             goto moveto_finish;
4605           }
4606           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
4607           if( rc ){
4608             sqlite3_free(pCellKey);
4609             goto moveto_finish;
4610           }
4611           c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
4612           sqlite3_free(pCellKey);
4613         }
4614       }
4615       if( c==0 ){
4616         if( pPage->intKey && !pPage->leaf ){
4617           lwr = idx;
4618           break;
4619         }else{
4620           *pRes = 0;
4621           rc = SQLITE_OK;
4622           goto moveto_finish;
4623         }
4624       }
4625       if( c<0 ){
4626         lwr = idx+1;
4627       }else{
4628         upr = idx-1;
4629       }
4630       if( lwr>upr ){
4631         break;
4632       }
4633       pCur->aiIdx[pCur->iPage] = (u16)(idx = (lwr+upr)/2);
4634     }
4635     assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
4636     assert( pPage->isInit );
4637     if( pPage->leaf ){
4638       chldPg = 0;
4639     }else if( lwr>=pPage->nCell ){
4640       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4641     }else{
4642       chldPg = get4byte(findCell(pPage, lwr));
4643     }
4644     if( chldPg==0 ){
4645       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4646       *pRes = c;
4647       rc = SQLITE_OK;
4648       goto moveto_finish;
4649     }
4650     pCur->aiIdx[pCur->iPage] = (u16)lwr;
4651     pCur->info.nSize = 0;
4652     pCur->validNKey = 0;
4653     rc = moveToChild(pCur, chldPg);
4654     if( rc ) goto moveto_finish;
4655   }
4656 moveto_finish:
4657   return rc;
4658 }
4659 
4660 
4661 /*
4662 ** Return TRUE if the cursor is not pointing at an entry of the table.
4663 **
4664 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
4665 ** past the last entry in the table or sqlite3BtreePrev() moves past
4666 ** the first entry.  TRUE is also returned if the table is empty.
4667 */
4668 int sqlite3BtreeEof(BtCursor *pCur){
4669   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
4670   ** have been deleted? This API will need to change to return an error code
4671   ** as well as the boolean result value.
4672   */
4673   return (CURSOR_VALID!=pCur->eState);
4674 }
4675 
4676 /*
4677 ** Advance the cursor to the next entry in the database.  If
4678 ** successful then set *pRes=0.  If the cursor
4679 ** was already pointing to the last entry in the database before
4680 ** this routine was called, then set *pRes=1.
4681 */
4682 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
4683   int rc;
4684   int idx;
4685   MemPage *pPage;
4686 
4687   assert( cursorHoldsMutex(pCur) );
4688   rc = restoreCursorPosition(pCur);
4689   if( rc!=SQLITE_OK ){
4690     return rc;
4691   }
4692   assert( pRes!=0 );
4693   if( CURSOR_INVALID==pCur->eState ){
4694     *pRes = 1;
4695     return SQLITE_OK;
4696   }
4697   if( pCur->skipNext>0 ){
4698     pCur->skipNext = 0;
4699     *pRes = 0;
4700     return SQLITE_OK;
4701   }
4702   pCur->skipNext = 0;
4703 
4704   pPage = pCur->apPage[pCur->iPage];
4705   idx = ++pCur->aiIdx[pCur->iPage];
4706   assert( pPage->isInit );
4707 
4708   /* If the database file is corrupt, it is possible for the value of idx
4709   ** to be invalid here. This can only occur if a second cursor modifies
4710   ** the page while cursor pCur is holding a reference to it. Which can
4711   ** only happen if the database is corrupt in such a way as to link the
4712   ** page into more than one b-tree structure. */
4713   testcase( idx>pPage->nCell );
4714 
4715   pCur->info.nSize = 0;
4716   pCur->validNKey = 0;
4717   if( idx>=pPage->nCell ){
4718     if( !pPage->leaf ){
4719       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
4720       if( rc ) return rc;
4721       rc = moveToLeftmost(pCur);
4722       *pRes = 0;
4723       return rc;
4724     }
4725     do{
4726       if( pCur->iPage==0 ){
4727         *pRes = 1;
4728         pCur->eState = CURSOR_INVALID;
4729         return SQLITE_OK;
4730       }
4731       moveToParent(pCur);
4732       pPage = pCur->apPage[pCur->iPage];
4733     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
4734     *pRes = 0;
4735     if( pPage->intKey ){
4736       rc = sqlite3BtreeNext(pCur, pRes);
4737     }else{
4738       rc = SQLITE_OK;
4739     }
4740     return rc;
4741   }
4742   *pRes = 0;
4743   if( pPage->leaf ){
4744     return SQLITE_OK;
4745   }
4746   rc = moveToLeftmost(pCur);
4747   return rc;
4748 }
4749 
4750 
4751 /*
4752 ** Step the cursor to the back to the previous entry in the database.  If
4753 ** successful then set *pRes=0.  If the cursor
4754 ** was already pointing to the first entry in the database before
4755 ** this routine was called, then set *pRes=1.
4756 */
4757 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
4758   int rc;
4759   MemPage *pPage;
4760 
4761   assert( cursorHoldsMutex(pCur) );
4762   rc = restoreCursorPosition(pCur);
4763   if( rc!=SQLITE_OK ){
4764     return rc;
4765   }
4766   pCur->atLast = 0;
4767   if( CURSOR_INVALID==pCur->eState ){
4768     *pRes = 1;
4769     return SQLITE_OK;
4770   }
4771   if( pCur->skipNext<0 ){
4772     pCur->skipNext = 0;
4773     *pRes = 0;
4774     return SQLITE_OK;
4775   }
4776   pCur->skipNext = 0;
4777 
4778   pPage = pCur->apPage[pCur->iPage];
4779   assert( pPage->isInit );
4780   if( !pPage->leaf ){
4781     int idx = pCur->aiIdx[pCur->iPage];
4782     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
4783     if( rc ){
4784       return rc;
4785     }
4786     rc = moveToRightmost(pCur);
4787   }else{
4788     while( pCur->aiIdx[pCur->iPage]==0 ){
4789       if( pCur->iPage==0 ){
4790         pCur->eState = CURSOR_INVALID;
4791         *pRes = 1;
4792         return SQLITE_OK;
4793       }
4794       moveToParent(pCur);
4795     }
4796     pCur->info.nSize = 0;
4797     pCur->validNKey = 0;
4798 
4799     pCur->aiIdx[pCur->iPage]--;
4800     pPage = pCur->apPage[pCur->iPage];
4801     if( pPage->intKey && !pPage->leaf ){
4802       rc = sqlite3BtreePrevious(pCur, pRes);
4803     }else{
4804       rc = SQLITE_OK;
4805     }
4806   }
4807   *pRes = 0;
4808   return rc;
4809 }
4810 
4811 /*
4812 ** Allocate a new page from the database file.
4813 **
4814 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
4815 ** has already been called on the new page.)  The new page has also
4816 ** been referenced and the calling routine is responsible for calling
4817 ** sqlite3PagerUnref() on the new page when it is done.
4818 **
4819 ** SQLITE_OK is returned on success.  Any other return value indicates
4820 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
4821 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
4822 **
4823 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4824 ** locate a page close to the page number "nearby".  This can be used in an
4825 ** attempt to keep related pages close to each other in the database file,
4826 ** which in turn can make database access faster.
4827 **
4828 ** If the "exact" parameter is not 0, and the page-number nearby exists
4829 ** anywhere on the free-list, then it is guaranteed to be returned. This
4830 ** is only used by auto-vacuum databases when allocating a new table.
4831 */
4832 static int allocateBtreePage(
4833   BtShared *pBt,
4834   MemPage **ppPage,
4835   Pgno *pPgno,
4836   Pgno nearby,
4837   u8 exact
4838 ){
4839   MemPage *pPage1;
4840   int rc;
4841   u32 n;     /* Number of pages on the freelist */
4842   u32 k;     /* Number of leaves on the trunk of the freelist */
4843   MemPage *pTrunk = 0;
4844   MemPage *pPrevTrunk = 0;
4845   Pgno mxPage;     /* Total size of the database file */
4846 
4847   assert( sqlite3_mutex_held(pBt->mutex) );
4848   pPage1 = pBt->pPage1;
4849   mxPage = btreePagecount(pBt);
4850   n = get4byte(&pPage1->aData[36]);
4851   testcase( n==mxPage-1 );
4852   if( n>=mxPage ){
4853     return SQLITE_CORRUPT_BKPT;
4854   }
4855   if( n>0 ){
4856     /* There are pages on the freelist.  Reuse one of those pages. */
4857     Pgno iTrunk;
4858     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4859 
4860     /* If the 'exact' parameter was true and a query of the pointer-map
4861     ** shows that the page 'nearby' is somewhere on the free-list, then
4862     ** the entire-list will be searched for that page.
4863     */
4864 #ifndef SQLITE_OMIT_AUTOVACUUM
4865     if( exact && nearby<=mxPage ){
4866       u8 eType;
4867       assert( nearby>0 );
4868       assert( pBt->autoVacuum );
4869       rc = ptrmapGet(pBt, nearby, &eType, 0);
4870       if( rc ) return rc;
4871       if( eType==PTRMAP_FREEPAGE ){
4872         searchList = 1;
4873       }
4874       *pPgno = nearby;
4875     }
4876 #endif
4877 
4878     /* Decrement the free-list count by 1. Set iTrunk to the index of the
4879     ** first free-list trunk page. iPrevTrunk is initially 1.
4880     */
4881     rc = sqlite3PagerWrite(pPage1->pDbPage);
4882     if( rc ) return rc;
4883     put4byte(&pPage1->aData[36], n-1);
4884 
4885     /* The code within this loop is run only once if the 'searchList' variable
4886     ** is not true. Otherwise, it runs once for each trunk-page on the
4887     ** free-list until the page 'nearby' is located.
4888     */
4889     do {
4890       pPrevTrunk = pTrunk;
4891       if( pPrevTrunk ){
4892         iTrunk = get4byte(&pPrevTrunk->aData[0]);
4893       }else{
4894         iTrunk = get4byte(&pPage1->aData[32]);
4895       }
4896       testcase( iTrunk==mxPage );
4897       if( iTrunk>mxPage ){
4898         rc = SQLITE_CORRUPT_BKPT;
4899       }else{
4900         rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
4901       }
4902       if( rc ){
4903         pTrunk = 0;
4904         goto end_allocate_page;
4905       }
4906       assert( pTrunk!=0 );
4907       assert( pTrunk->aData!=0 );
4908 
4909       k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */
4910       if( k==0 && !searchList ){
4911         /* The trunk has no leaves and the list is not being searched.
4912         ** So extract the trunk page itself and use it as the newly
4913         ** allocated page */
4914         assert( pPrevTrunk==0 );
4915         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4916         if( rc ){
4917           goto end_allocate_page;
4918         }
4919         *pPgno = iTrunk;
4920         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4921         *ppPage = pTrunk;
4922         pTrunk = 0;
4923         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4924       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
4925         /* Value of k is out of range.  Database corruption */
4926         rc = SQLITE_CORRUPT_BKPT;
4927         goto end_allocate_page;
4928 #ifndef SQLITE_OMIT_AUTOVACUUM
4929       }else if( searchList && nearby==iTrunk ){
4930         /* The list is being searched and this trunk page is the page
4931         ** to allocate, regardless of whether it has leaves.
4932         */
4933         assert( *pPgno==iTrunk );
4934         *ppPage = pTrunk;
4935         searchList = 0;
4936         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4937         if( rc ){
4938           goto end_allocate_page;
4939         }
4940         if( k==0 ){
4941           if( !pPrevTrunk ){
4942             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4943           }else{
4944             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4945             if( rc!=SQLITE_OK ){
4946               goto end_allocate_page;
4947             }
4948             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4949           }
4950         }else{
4951           /* The trunk page is required by the caller but it contains
4952           ** pointers to free-list leaves. The first leaf becomes a trunk
4953           ** page in this case.
4954           */
4955           MemPage *pNewTrunk;
4956           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
4957           if( iNewTrunk>mxPage ){
4958             rc = SQLITE_CORRUPT_BKPT;
4959             goto end_allocate_page;
4960           }
4961           testcase( iNewTrunk==mxPage );
4962           rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
4963           if( rc!=SQLITE_OK ){
4964             goto end_allocate_page;
4965           }
4966           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
4967           if( rc!=SQLITE_OK ){
4968             releasePage(pNewTrunk);
4969             goto end_allocate_page;
4970           }
4971           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4972           put4byte(&pNewTrunk->aData[4], k-1);
4973           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
4974           releasePage(pNewTrunk);
4975           if( !pPrevTrunk ){
4976             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
4977             put4byte(&pPage1->aData[32], iNewTrunk);
4978           }else{
4979             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4980             if( rc ){
4981               goto end_allocate_page;
4982             }
4983             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4984           }
4985         }
4986         pTrunk = 0;
4987         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4988 #endif
4989       }else if( k>0 ){
4990         /* Extract a leaf from the trunk */
4991         u32 closest;
4992         Pgno iPage;
4993         unsigned char *aData = pTrunk->aData;
4994         if( nearby>0 ){
4995           u32 i;
4996           int dist;
4997           closest = 0;
4998           dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
4999           for(i=1; i<k; i++){
5000             int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
5001             if( d2<dist ){
5002               closest = i;
5003               dist = d2;
5004             }
5005           }
5006         }else{
5007           closest = 0;
5008         }
5009 
5010         iPage = get4byte(&aData[8+closest*4]);
5011         testcase( iPage==mxPage );
5012         if( iPage>mxPage ){
5013           rc = SQLITE_CORRUPT_BKPT;
5014           goto end_allocate_page;
5015         }
5016         testcase( iPage==mxPage );
5017         if( !searchList || iPage==nearby ){
5018           int noContent;
5019           *pPgno = iPage;
5020           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
5021                  ": %d more free pages\n",
5022                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
5023           rc = sqlite3PagerWrite(pTrunk->pDbPage);
5024           if( rc ) goto end_allocate_page;
5025           if( closest<k-1 ){
5026             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
5027           }
5028           put4byte(&aData[4], k-1);
5029           noContent = !btreeGetHasContent(pBt, *pPgno);
5030           rc = btreeGetPage(pBt, *pPgno, ppPage, noContent);
5031           if( rc==SQLITE_OK ){
5032             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5033             if( rc!=SQLITE_OK ){
5034               releasePage(*ppPage);
5035             }
5036           }
5037           searchList = 0;
5038         }
5039       }
5040       releasePage(pPrevTrunk);
5041       pPrevTrunk = 0;
5042     }while( searchList );
5043   }else{
5044     /* There are no pages on the freelist, so create a new page at the
5045     ** end of the file */
5046     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
5047     if( rc ) return rc;
5048     pBt->nPage++;
5049     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
5050 
5051 #ifndef SQLITE_OMIT_AUTOVACUUM
5052     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
5053       /* If *pPgno refers to a pointer-map page, allocate two new pages
5054       ** at the end of the file instead of one. The first allocated page
5055       ** becomes a new pointer-map page, the second is used by the caller.
5056       */
5057       MemPage *pPg = 0;
5058       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
5059       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
5060       rc = btreeGetPage(pBt, pBt->nPage, &pPg, 1);
5061       if( rc==SQLITE_OK ){
5062         rc = sqlite3PagerWrite(pPg->pDbPage);
5063         releasePage(pPg);
5064       }
5065       if( rc ) return rc;
5066       pBt->nPage++;
5067       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
5068     }
5069 #endif
5070     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
5071     *pPgno = pBt->nPage;
5072 
5073     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5074     rc = btreeGetPage(pBt, *pPgno, ppPage, 1);
5075     if( rc ) return rc;
5076     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5077     if( rc!=SQLITE_OK ){
5078       releasePage(*ppPage);
5079     }
5080     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
5081   }
5082 
5083   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5084 
5085 end_allocate_page:
5086   releasePage(pTrunk);
5087   releasePage(pPrevTrunk);
5088   if( rc==SQLITE_OK ){
5089     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
5090       releasePage(*ppPage);
5091       return SQLITE_CORRUPT_BKPT;
5092     }
5093     (*ppPage)->isInit = 0;
5094   }else{
5095     *ppPage = 0;
5096   }
5097   assert( rc!=SQLITE_OK || sqlite3PagerIswriteable((*ppPage)->pDbPage) );
5098   return rc;
5099 }
5100 
5101 /*
5102 ** This function is used to add page iPage to the database file free-list.
5103 ** It is assumed that the page is not already a part of the free-list.
5104 **
5105 ** The value passed as the second argument to this function is optional.
5106 ** If the caller happens to have a pointer to the MemPage object
5107 ** corresponding to page iPage handy, it may pass it as the second value.
5108 ** Otherwise, it may pass NULL.
5109 **
5110 ** If a pointer to a MemPage object is passed as the second argument,
5111 ** its reference count is not altered by this function.
5112 */
5113 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
5114   MemPage *pTrunk = 0;                /* Free-list trunk page */
5115   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
5116   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
5117   MemPage *pPage;                     /* Page being freed. May be NULL. */
5118   int rc;                             /* Return Code */
5119   int nFree;                          /* Initial number of pages on free-list */
5120 
5121   assert( sqlite3_mutex_held(pBt->mutex) );
5122   assert( iPage>1 );
5123   assert( !pMemPage || pMemPage->pgno==iPage );
5124 
5125   if( pMemPage ){
5126     pPage = pMemPage;
5127     sqlite3PagerRef(pPage->pDbPage);
5128   }else{
5129     pPage = btreePageLookup(pBt, iPage);
5130   }
5131 
5132   /* Increment the free page count on pPage1 */
5133   rc = sqlite3PagerWrite(pPage1->pDbPage);
5134   if( rc ) goto freepage_out;
5135   nFree = get4byte(&pPage1->aData[36]);
5136   put4byte(&pPage1->aData[36], nFree+1);
5137 
5138   if( pBt->btsFlags & BTS_SECURE_DELETE ){
5139     /* If the secure_delete option is enabled, then
5140     ** always fully overwrite deleted information with zeros.
5141     */
5142     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
5143      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
5144     ){
5145       goto freepage_out;
5146     }
5147     memset(pPage->aData, 0, pPage->pBt->pageSize);
5148   }
5149 
5150   /* If the database supports auto-vacuum, write an entry in the pointer-map
5151   ** to indicate that the page is free.
5152   */
5153   if( ISAUTOVACUUM ){
5154     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
5155     if( rc ) goto freepage_out;
5156   }
5157 
5158   /* Now manipulate the actual database free-list structure. There are two
5159   ** possibilities. If the free-list is currently empty, or if the first
5160   ** trunk page in the free-list is full, then this page will become a
5161   ** new free-list trunk page. Otherwise, it will become a leaf of the
5162   ** first trunk page in the current free-list. This block tests if it
5163   ** is possible to add the page as a new free-list leaf.
5164   */
5165   if( nFree!=0 ){
5166     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
5167 
5168     iTrunk = get4byte(&pPage1->aData[32]);
5169     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
5170     if( rc!=SQLITE_OK ){
5171       goto freepage_out;
5172     }
5173 
5174     nLeaf = get4byte(&pTrunk->aData[4]);
5175     assert( pBt->usableSize>32 );
5176     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
5177       rc = SQLITE_CORRUPT_BKPT;
5178       goto freepage_out;
5179     }
5180     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
5181       /* In this case there is room on the trunk page to insert the page
5182       ** being freed as a new leaf.
5183       **
5184       ** Note that the trunk page is not really full until it contains
5185       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
5186       ** coded.  But due to a coding error in versions of SQLite prior to
5187       ** 3.6.0, databases with freelist trunk pages holding more than
5188       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
5189       ** to maintain backwards compatibility with older versions of SQLite,
5190       ** we will continue to restrict the number of entries to usableSize/4 - 8
5191       ** for now.  At some point in the future (once everyone has upgraded
5192       ** to 3.6.0 or later) we should consider fixing the conditional above
5193       ** to read "usableSize/4-2" instead of "usableSize/4-8".
5194       */
5195       rc = sqlite3PagerWrite(pTrunk->pDbPage);
5196       if( rc==SQLITE_OK ){
5197         put4byte(&pTrunk->aData[4], nLeaf+1);
5198         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
5199         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
5200           sqlite3PagerDontWrite(pPage->pDbPage);
5201         }
5202         rc = btreeSetHasContent(pBt, iPage);
5203       }
5204       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
5205       goto freepage_out;
5206     }
5207   }
5208 
5209   /* If control flows to this point, then it was not possible to add the
5210   ** the page being freed as a leaf page of the first trunk in the free-list.
5211   ** Possibly because the free-list is empty, or possibly because the
5212   ** first trunk in the free-list is full. Either way, the page being freed
5213   ** will become the new first trunk page in the free-list.
5214   */
5215   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
5216     goto freepage_out;
5217   }
5218   rc = sqlite3PagerWrite(pPage->pDbPage);
5219   if( rc!=SQLITE_OK ){
5220     goto freepage_out;
5221   }
5222   put4byte(pPage->aData, iTrunk);
5223   put4byte(&pPage->aData[4], 0);
5224   put4byte(&pPage1->aData[32], iPage);
5225   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
5226 
5227 freepage_out:
5228   if( pPage ){
5229     pPage->isInit = 0;
5230   }
5231   releasePage(pPage);
5232   releasePage(pTrunk);
5233   return rc;
5234 }
5235 static void freePage(MemPage *pPage, int *pRC){
5236   if( (*pRC)==SQLITE_OK ){
5237     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
5238   }
5239 }
5240 
5241 /*
5242 ** Free any overflow pages associated with the given Cell.
5243 */
5244 static int clearCell(MemPage *pPage, unsigned char *pCell){
5245   BtShared *pBt = pPage->pBt;
5246   CellInfo info;
5247   Pgno ovflPgno;
5248   int rc;
5249   int nOvfl;
5250   u32 ovflPageSize;
5251 
5252   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5253   btreeParseCellPtr(pPage, pCell, &info);
5254   if( info.iOverflow==0 ){
5255     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
5256   }
5257   if( pCell+info.iOverflow+3 > pPage->aData+pPage->maskPage ){
5258     return SQLITE_CORRUPT;  /* Cell extends past end of page */
5259   }
5260   ovflPgno = get4byte(&pCell[info.iOverflow]);
5261   assert( pBt->usableSize > 4 );
5262   ovflPageSize = pBt->usableSize - 4;
5263   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
5264   assert( ovflPgno==0 || nOvfl>0 );
5265   while( nOvfl-- ){
5266     Pgno iNext = 0;
5267     MemPage *pOvfl = 0;
5268     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
5269       /* 0 is not a legal page number and page 1 cannot be an
5270       ** overflow page. Therefore if ovflPgno<2 or past the end of the
5271       ** file the database must be corrupt. */
5272       return SQLITE_CORRUPT_BKPT;
5273     }
5274     if( nOvfl ){
5275       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
5276       if( rc ) return rc;
5277     }
5278 
5279     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
5280      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
5281     ){
5282       /* There is no reason any cursor should have an outstanding reference
5283       ** to an overflow page belonging to a cell that is being deleted/updated.
5284       ** So if there exists more than one reference to this page, then it
5285       ** must not really be an overflow page and the database must be corrupt.
5286       ** It is helpful to detect this before calling freePage2(), as
5287       ** freePage2() may zero the page contents if secure-delete mode is
5288       ** enabled. If this 'overflow' page happens to be a page that the
5289       ** caller is iterating through or using in some other way, this
5290       ** can be problematic.
5291       */
5292       rc = SQLITE_CORRUPT_BKPT;
5293     }else{
5294       rc = freePage2(pBt, pOvfl, ovflPgno);
5295     }
5296 
5297     if( pOvfl ){
5298       sqlite3PagerUnref(pOvfl->pDbPage);
5299     }
5300     if( rc ) return rc;
5301     ovflPgno = iNext;
5302   }
5303   return SQLITE_OK;
5304 }
5305 
/*
** Create the byte sequence used to represent a cell on page pPage
** and write that byte sequence into pCell[].  Overflow pages are
** allocated and filled in as necessary.  The calling procedure
** is responsible for making sure sufficient space has been allocated
** for pCell[].
**
** Note that pCell does not necessarily need to point to the pPage->aData
** area.  pCell might point to some temporary storage.  The cell will
** be constructed in this temporary area then copied into pPage->aData
** later.
**
** On success SQLITE_OK is returned and *pnSize is set to the size of the
** cell as reported by btreeParseCellPtr().  If an overflow page cannot be
** allocated, or its pointer-map entry cannot be written, that error code
** is returned instead.  SQLITE_CORRUPT_BKPT is returned for an index
** (non-intKey) page if the key is NULL or longer than 0x7fffffff bytes.
*/
static int fillInCell(
  MemPage *pPage,                /* The page that contains the cell */
  unsigned char *pCell,          /* Complete text of the cell */
  const void *pKey, i64 nKey,    /* The key */
  const void *pData,int nData,   /* The data */
  int nZero,                     /* Extra zero bytes to append to pData */
  int *pnSize                    /* Write cell size here */
){
  int nPayload;                  /* Payload bytes still to be written */
  const u8 *pSrc;                /* Next byte of the current source buffer */
  int nSrc, n, rc;               /* Bytes left in pSrc; chunk size; result */
  int spaceLeft;                 /* Room left in the current destination */
  MemPage *pOvfl = 0;            /* Most recently allocated overflow page */
  MemPage *pToRelease = 0;       /* Overflow page currently being filled */
  unsigned char *pPrior;         /* Where the next overflow pgno is written */
  unsigned char *pPayload;       /* Where the next payload byte is written */
  BtShared *pBt = pPage->pBt;
  Pgno pgnoOvfl = 0;             /* Page number of pOvfl; 0 before the first */
  int nHeader;                   /* Bytes of cell header written so far */
  CellInfo info;                 /* Parse of the header just written */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  /* pPage is not necessarily writeable since pCell might be auxiliary
  ** buffer space that is separate from the pPage buffer area */
  assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

  /* Fill in the header.  On a non-leaf page the first 4 bytes of the cell
  ** are skipped here and left for the caller to fill in. */
  nHeader = 0;
  if( !pPage->leaf ){
    nHeader += 4;
  }
  if( pPage->hasData ){
    nHeader += putVarint(&pCell[nHeader], nData+nZero);
  }else{
    /* Pages without data ignore any data that was passed in. */
    nData = nZero = 0;
  }
  nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
  /* Parse what was just written to learn how much payload is stored
  ** locally (info.nLocal) and where the overflow pointer goes. */
  btreeParseCellPtr(pPage, pCell, &info);
  assert( info.nHeader==nHeader );
  assert( info.nKey==nKey );
  assert( info.nData==(u32)(nData+nZero) );

  /* Fill in the payload */
  nPayload = nData + nZero;
  if( pPage->intKey ){
    /* Integer key lives in the header; the payload is the data only.
    ** nData is cleared so that the source switch at the bottom of the
    ** loop leaves only zero-fill once pData has been consumed. */
    pSrc = pData;
    nSrc = nData;
    nData = 0;
  }else{
    /* The key is part of the payload: copy the key first, then the data. */
    if( NEVER(nKey>0x7fffffff || pKey==0) ){
      return SQLITE_CORRUPT_BKPT;
    }
    nPayload += (int)nKey;
    pSrc = pKey;
    nSrc = (int)nKey;
  }
  *pnSize = info.nSize;
  spaceLeft = info.nLocal;
  pPayload = &pCell[nHeader];
  pPrior = &pCell[info.iOverflow];

  while( nPayload>0 ){
    if( spaceLeft==0 ){
      /* The current destination is full.  Allocate another overflow page
      ** and link it in through pPrior. */
#ifndef SQLITE_OMIT_AUTOVACUUM
      Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
      if( pBt->autoVacuum ){
        /* Choose the "nearby" hint for the allocator: the next page number
        ** that is neither a pointer-map page nor the pending-byte page. */
        do{
          pgnoOvfl++;
        } while(
          PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
        );
      }
#endif
      rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the database supports auto-vacuum, and the second or subsequent
      ** overflow page is being allocated, add an entry to the pointer-map
      ** for that page now.
      **
      ** If this is the first overflow page, then write a partial entry
      ** to the pointer-map. If we write nothing to this pointer-map slot,
      ** then the optimistic overflow chain processing in clearCell()
      ** may misinterpret the uninitialised values and delete the
      ** wrong pages from the database.
      */
      if( pBt->autoVacuum && rc==SQLITE_OK ){
        u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
        ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
        if( rc ){
          releasePage(pOvfl);
        }
      }
#endif
      if( rc ){
        releasePage(pToRelease);
        return rc;
      }

      /* If pToRelease is not zero than pPrior points into the data area
      ** of pToRelease.  Make sure pToRelease is still writeable. */
      assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

      /* If pPrior is part of the data area of pPage, then make sure pPage
      ** is still writeable */
      assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

      /* Link the new page into the chain, then make it the current
      ** destination.  Its own next-page pointer starts out as zero. */
      put4byte(pPrior, pgnoOvfl);
      releasePage(pToRelease);
      pToRelease = pOvfl;
      pPrior = pOvfl->aData;
      put4byte(pPrior, 0);
      pPayload = &pOvfl->aData[4];
      spaceLeft = pBt->usableSize - 4;
    }
    /* Write as much as fits in the destination in this iteration. */
    n = nPayload;
    if( n>spaceLeft ) n = spaceLeft;

    /* If pToRelease is not zero than pPayload points into the data area
    ** of pToRelease.  Make sure pToRelease is still writeable. */
    assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

    /* If pPayload is part of the data area of pPage, then make sure pPage
    ** is still writeable */
    assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

    if( nSrc>0 ){
      /* Copy from the current source, but never past its end. */
      if( n>nSrc ) n = nSrc;
      assert( pSrc );
      memcpy(pPayload, pSrc, n);
    }else{
      /* Both sources are exhausted: the remainder is the zero padding. */
      memset(pPayload, 0, n);
    }
    nPayload -= n;
    pPayload += n;
    pSrc += n;
    nSrc -= n;
    spaceLeft -= n;
    if( nSrc==0 ){
      /* Current source is used up: switch to the data buffer.  nData is 0
      ** on intKey pages, which leaves only zero-fill from here on. */
      nSrc = nData;
      pSrc = pData;
    }
  }
  releasePage(pToRelease);
  return SQLITE_OK;
}
5467 
5468 /*
5469 ** Remove the i-th cell from pPage.  This routine effects pPage only.
5470 ** The cell content is not freed or deallocated.  It is assumed that
5471 ** the cell content has been copied someplace else.  This routine just
5472 ** removes the reference to the cell from pPage.
5473 **
5474 ** "sz" must be the number of bytes in the cell.
5475 */
5476 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
5477   u32 pc;         /* Offset to cell content of cell being deleted */
5478   u8 *data;       /* pPage->aData */
5479   u8 *ptr;        /* Used to move bytes around within data[] */
5480   u8 *endPtr;     /* End of loop */
5481   int rc;         /* The return code */
5482   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
5483 
5484   if( *pRC ) return;
5485 
5486   assert( idx>=0 && idx<pPage->nCell );
5487   assert( sz==cellSize(pPage, idx) );
5488   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5489   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5490   data = pPage->aData;
5491   ptr = &pPage->aCellIdx[2*idx];
5492   pc = get2byte(ptr);
5493   hdr = pPage->hdrOffset;
5494   testcase( pc==get2byte(&data[hdr+5]) );
5495   testcase( pc+sz==pPage->pBt->usableSize );
5496   if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
5497     *pRC = SQLITE_CORRUPT_BKPT;
5498     return;
5499   }
5500   rc = freeSpace(pPage, pc, sz);
5501   if( rc ){
5502     *pRC = rc;
5503     return;
5504   }
5505   endPtr = &pPage->aCellIdx[2*pPage->nCell - 2];
5506   assert( (SQLITE_PTR_TO_INT(ptr)&1)==0 );  /* ptr is always 2-byte aligned */
5507   while( ptr<endPtr ){
5508     *(u16*)ptr = *(u16*)&ptr[2];
5509     ptr += 2;
5510   }
5511   pPage->nCell--;
5512   put2byte(&data[hdr+3], pPage->nCell);
5513   pPage->nFree += 2;
5514 }
5515 
5516 /*
5517 ** Insert a new cell on pPage at cell index "i".  pCell points to the
5518 ** content of the cell.
5519 **
5520 ** If the cell content will fit on the page, then put it there.  If it
5521 ** will not fit, then make a copy of the cell content into pTemp if
5522 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
5523 ** in pPage->apOvfl[] and make it point to the cell content (either
5524 ** in pTemp or the original pCell) and also record its index.
5525 ** Allocating a new entry in pPage->aCell[] implies that
5526 ** pPage->nOverflow is incremented.
5527 **
5528 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
5529 ** cell. The caller will overwrite them after this function returns. If
5530 ** nSkip is non-zero, then pCell may not point to an invalid memory location
5531 ** (but pCell+nSkip is always valid).
5532 */
5533 static void insertCell(
5534   MemPage *pPage,   /* Page into which we are copying */
5535   int i,            /* New cell becomes the i-th cell of the page */
5536   u8 *pCell,        /* Content of the new cell */
5537   int sz,           /* Bytes of content in pCell */
5538   u8 *pTemp,        /* Temp storage space for pCell, if needed */
5539   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
5540   int *pRC          /* Read and write return code from here */
5541 ){
5542   int idx = 0;      /* Where to write new cell content in data[] */
5543   int j;            /* Loop counter */
5544   int end;          /* First byte past the last cell pointer in data[] */
5545   int ins;          /* Index in data[] where new cell pointer is inserted */
5546   int cellOffset;   /* Address of first cell pointer in data[] */
5547   u8 *data;         /* The content of the whole page */
5548   u8 *ptr;          /* Used for moving information around in data[] */
5549   u8 *endPtr;       /* End of the loop */
5550 
5551   int nSkip = (iChild ? 4 : 0);
5552 
5553   if( *pRC ) return;
5554 
5555   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
5556   assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 );
5557   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
5558   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
5559   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5560   /* The cell should normally be sized correctly.  However, when moving a
5561   ** malformed cell from a leaf page to an interior page, if the cell size
5562   ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
5563   ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
5564   ** the term after the || in the following assert(). */
5565   assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) );
5566   if( pPage->nOverflow || sz+2>pPage->nFree ){
5567     if( pTemp ){
5568       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
5569       pCell = pTemp;
5570     }
5571     if( iChild ){
5572       put4byte(pCell, iChild);
5573     }
5574     j = pPage->nOverflow++;
5575     assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) );
5576     pPage->apOvfl[j] = pCell;
5577     pPage->aiOvfl[j] = (u16)i;
5578   }else{
5579     int rc = sqlite3PagerWrite(pPage->pDbPage);
5580     if( rc!=SQLITE_OK ){
5581       *pRC = rc;
5582       return;
5583     }
5584     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5585     data = pPage->aData;
5586     cellOffset = pPage->cellOffset;
5587     end = cellOffset + 2*pPage->nCell;
5588     ins = cellOffset + 2*i;
5589     rc = allocateSpace(pPage, sz, &idx);
5590     if( rc ){ *pRC = rc; return; }
5591     /* The allocateSpace() routine guarantees the following two properties
5592     ** if it returns success */
5593     assert( idx >= end+2 );
5594     assert( idx+sz <= (int)pPage->pBt->usableSize );
5595     pPage->nCell++;
5596     pPage->nFree -= (u16)(2 + sz);
5597     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
5598     if( iChild ){
5599       put4byte(&data[idx], iChild);
5600     }
5601     ptr = &data[end];
5602     endPtr = &data[ins];
5603     assert( (SQLITE_PTR_TO_INT(ptr)&1)==0 );  /* ptr is always 2-byte aligned */
5604     while( ptr>endPtr ){
5605       *(u16*)ptr = *(u16*)&ptr[-2];
5606       ptr -= 2;
5607     }
5608     put2byte(&data[ins], idx);
5609     put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
5610 #ifndef SQLITE_OMIT_AUTOVACUUM
5611     if( pPage->pBt->autoVacuum ){
5612       /* The cell may contain a pointer to an overflow page. If so, write
5613       ** the entry for the overflow page into the pointer map.
5614       */
5615       ptrmapPutOvflPtr(pPage, pCell, pRC);
5616     }
5617 #endif
5618   }
5619 }
5620 
5621 /*
5622 ** Add a list of cells to a page.  The page should be initially empty.
5623 ** The cells are guaranteed to fit on the page.
5624 */
5625 static void assemblePage(
5626   MemPage *pPage,   /* The page to be assemblied */
5627   int nCell,        /* The number of cells to add to this page */
5628   u8 **apCell,      /* Pointers to cell bodies */
5629   u16 *aSize        /* Sizes of the cells */
5630 ){
5631   int i;            /* Loop counter */
5632   u8 *pCellptr;     /* Address of next cell pointer */
5633   int cellbody;     /* Address of next cell body */
5634   u8 * const data = pPage->aData;             /* Pointer to data for pPage */
5635   const int hdr = pPage->hdrOffset;           /* Offset of header on pPage */
5636   const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
5637 
5638   assert( pPage->nOverflow==0 );
5639   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5640   assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt)
5641             && (int)MX_CELL(pPage->pBt)<=10921);
5642   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5643 
5644   /* Check that the page has just been zeroed by zeroPage() */
5645   assert( pPage->nCell==0 );
5646   assert( get2byteNotZero(&data[hdr+5])==nUsable );
5647 
5648   pCellptr = &pPage->aCellIdx[nCell*2];
5649   cellbody = nUsable;
5650   for(i=nCell-1; i>=0; i--){
5651     u16 sz = aSize[i];
5652     pCellptr -= 2;
5653     cellbody -= sz;
5654     put2byte(pCellptr, cellbody);
5655     memcpy(&data[cellbody], apCell[i], sz);
5656   }
5657   put2byte(&data[hdr+3], nCell);
5658   put2byte(&data[hdr+5], cellbody);
5659   pPage->nFree -= (nCell*2 + nUsable - cellbody);
5660   pPage->nCell = (u16)nCell;
5661 }
5662 
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The value of NN chosen below (1) appears to give the best results
** overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (NN*2+1)      /* Total pages involved in the balance */
5677 
5678 
5679 #ifndef SQLITE_OMIT_QUICKBALANCE
5680 /*
5681 ** This version of balance() handles the common special case where
5682 ** a new entry is being inserted on the extreme right-end of the
5683 ** tree, in other words, when the new entry will become the largest
5684 ** entry in the tree.
5685 **
5686 ** Instead of trying to balance the 3 right-most leaf pages, just add
5687 ** a new page to the right-hand side and put the one new entry in
5688 ** that page.  This leaves the right side of the tree somewhat
5689 ** unbalanced.  But odds are that we will be inserting new entries
5690 ** at the end soon afterwards so the nearly empty page will quickly
5691 ** fill up.  On average.
5692 **
5693 ** pPage is the leaf page which is the right-most page in the tree.
5694 ** pParent is its parent.  pPage must have a single overflow entry
5695 ** which is also the right-most entry on the page.
5696 **
5697 ** The pSpace buffer is used to store a temporary copy of the divider
5698 ** cell that will be inserted into pParent. Such a cell consists of a 4
5699 ** byte page number followed by a variable length integer. In other
5700 ** words, at most 13 bytes. Hence the pSpace buffer must be at
5701 ** least 13 bytes in size.
5702 */
5703 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
5704   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
5705   MemPage *pNew;                       /* Newly allocated page */
5706   int rc;                              /* Return Code */
5707   Pgno pgnoNew;                        /* Page number of pNew */
5708 
5709   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5710   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5711   assert( pPage->nOverflow==1 );
5712 
5713   /* This error condition is now caught prior to reaching this function */
5714   if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT;
5715 
5716   /* Allocate a new page. This page will become the right-sibling of
5717   ** pPage. Make the parent page writable, so that the new divider cell
5718   ** may be inserted. If both these operations are successful, proceed.
5719   */
5720   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
5721 
5722   if( rc==SQLITE_OK ){
5723 
5724     u8 *pOut = &pSpace[4];
5725     u8 *pCell = pPage->apOvfl[0];
5726     u16 szCell = cellSizePtr(pPage, pCell);
5727     u8 *pStop;
5728 
5729     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
5730     assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
5731     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
5732     assemblePage(pNew, 1, &pCell, &szCell);
5733 
5734     /* If this is an auto-vacuum database, update the pointer map
5735     ** with entries for the new page, and any pointer from the
5736     ** cell on the page to an overflow page. If either of these
5737     ** operations fails, the return code is set, but the contents
5738     ** of the parent page are still manipulated by thh code below.
5739     ** That is Ok, at this point the parent page is guaranteed to
5740     ** be marked as dirty. Returning an error code will cause a
5741     ** rollback, undoing any changes made to the parent page.
5742     */
5743     if( ISAUTOVACUUM ){
5744       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
5745       if( szCell>pNew->minLocal ){
5746         ptrmapPutOvflPtr(pNew, pCell, &rc);
5747       }
5748     }
5749 
5750     /* Create a divider cell to insert into pParent. The divider cell
5751     ** consists of a 4-byte page number (the page number of pPage) and
5752     ** a variable length key value (which must be the same value as the
5753     ** largest key on pPage).
5754     **
5755     ** To find the largest key value on pPage, first find the right-most
5756     ** cell on pPage. The first two fields of this cell are the
5757     ** record-length (a variable length integer at most 32-bits in size)
5758     ** and the key value (a variable length integer, may have any value).
5759     ** The first of the while(...) loops below skips over the record-length
5760     ** field. The second while(...) loop copies the key value from the
5761     ** cell on pPage into the pSpace buffer.
5762     */
5763     pCell = findCell(pPage, pPage->nCell-1);
5764     pStop = &pCell[9];
5765     while( (*(pCell++)&0x80) && pCell<pStop );
5766     pStop = &pCell[9];
5767     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
5768 
5769     /* Insert the new divider cell into pParent. */
5770     insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
5771                0, pPage->pgno, &rc);
5772 
5773     /* Set the right-child pointer of pParent to point to the new page. */
5774     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
5775 
5776     /* Release the reference to the new page. */
5777     releasePage(pNew);
5778   }
5779 
5780   return rc;
5781 }
5782 #endif /* SQLITE_OMIT_QUICKBALANCE */
5783 
5784 #if 0
5785 /*
5786 ** This function does not contribute anything to the operation of SQLite.
5787 ** it is sometimes activated temporarily while debugging code responsible
5788 ** for setting pointer-map entries.
5789 */
5790 static int ptrmapCheckPages(MemPage **apPage, int nPage){
5791   int i, j;
5792   for(i=0; i<nPage; i++){
5793     Pgno n;
5794     u8 e;
5795     MemPage *pPage = apPage[i];
5796     BtShared *pBt = pPage->pBt;
5797     assert( pPage->isInit );
5798 
5799     for(j=0; j<pPage->nCell; j++){
5800       CellInfo info;
5801       u8 *z;
5802 
5803       z = findCell(pPage, j);
5804       btreeParseCellPtr(pPage, z, &info);
5805       if( info.iOverflow ){
5806         Pgno ovfl = get4byte(&z[info.iOverflow]);
5807         ptrmapGet(pBt, ovfl, &e, &n);
5808         assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
5809       }
5810       if( !pPage->leaf ){
5811         Pgno child = get4byte(z);
5812         ptrmapGet(pBt, child, &e, &n);
5813         assert( n==pPage->pgno && e==PTRMAP_BTREE );
5814       }
5815     }
5816     if( !pPage->leaf ){
5817       Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5818       ptrmapGet(pBt, child, &e, &n);
5819       assert( n==pPage->pgno && e==PTRMAP_BTREE );
5820     }
5821   }
5822   return 1;
5823 }
5824 #endif
5825 
5826 /*
5827 ** This function is used to copy the contents of the b-tree node stored
5828 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
5829 ** the pointer-map entries for each child page are updated so that the
5830 ** parent page stored in the pointer map is page pTo. If pFrom contained
5831 ** any cells with overflow page pointers, then the corresponding pointer
5832 ** map entries are also updated so that the parent page is page pTo.
5833 **
5834 ** If pFrom is currently carrying any overflow cells (entries in the
5835 ** MemPage.apOvfl[] array), they are not copied to pTo.
5836 **
5837 ** Before returning, page pTo is reinitialized using btreeInitPage().
5838 **
5839 ** The performance of this function is not critical. It is only used by
5840 ** the balance_shallower() and balance_deeper() procedures, neither of
5841 ** which are called often under normal circumstances.
5842 */
5843 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
5844   if( (*pRC)==SQLITE_OK ){
5845     BtShared * const pBt = pFrom->pBt;
5846     u8 * const aFrom = pFrom->aData;
5847     u8 * const aTo = pTo->aData;
5848     int const iFromHdr = pFrom->hdrOffset;
5849     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
5850     int rc;
5851     int iData;
5852 
5853 
5854     assert( pFrom->isInit );
5855     assert( pFrom->nFree>=iToHdr );
5856     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
5857 
5858     /* Copy the b-tree node content from page pFrom to page pTo. */
5859     iData = get2byte(&aFrom[iFromHdr+5]);
5860     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
5861     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
5862 
5863     /* Reinitialize page pTo so that the contents of the MemPage structure
5864     ** match the new data. The initialization of pTo can actually fail under
5865     ** fairly obscure circumstances, even though it is a copy of initialized
5866     ** page pFrom.
5867     */
5868     pTo->isInit = 0;
5869     rc = btreeInitPage(pTo);
5870     if( rc!=SQLITE_OK ){
5871       *pRC = rc;
5872       return;
5873     }
5874 
5875     /* If this is an auto-vacuum database, update the pointer-map entries
5876     ** for any b-tree or overflow pages that pTo now contains the pointers to.
5877     */
5878     if( ISAUTOVACUUM ){
5879       *pRC = setChildPtrmaps(pTo);
5880     }
5881   }
5882 }
5883 
5884 /*
5885 ** This routine redistributes cells on the iParentIdx'th child of pParent
5886 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
5887 ** same amount of free space. Usually a single sibling on either side of the
5888 ** page are used in the balancing, though both siblings might come from one
5889 ** side if the page is the first or last child of its parent. If the page
5890 ** has fewer than 2 siblings (something which can only happen if the page
5891 ** is a root page or a child of a root page) then all available siblings
5892 ** participate in the balancing.
5893 **
5894 ** The number of siblings of the page might be increased or decreased by
5895 ** one or two in an effort to keep pages nearly full but not over full.
5896 **
5897 ** Note that when this routine is called, some of the cells on the page
5898 ** might not actually be stored in MemPage.aData[]. This can happen
5899 ** if the page is overfull. This routine ensures that all cells allocated
5900 ** to the page and its siblings fit into MemPage.aData[] before returning.
5901 **
5902 ** In the course of balancing the page and its siblings, cells may be
5903 ** inserted into or removed from the parent page (pParent). Doing so
5904 ** may cause the parent page to become overfull or underfull. If this
5905 ** happens, it is the responsibility of the caller to invoke the correct
5906 ** balancing routine to fix this problem (see the balance() routine).
5907 **
5908 ** If this routine fails for any reason, it might leave the database
5909 ** in a corrupted state. So if this routine fails, the database should
5910 ** be rolled back.
5911 **
5912 ** The third argument to this function, aOvflSpace, is a pointer to a
5913 ** buffer big enough to hold one page. If while inserting cells into the parent
5914 ** page (pParent) the parent page becomes overfull, this buffer is
5915 ** used to store the parent's overflow cells. Because this function inserts
5916 ** a maximum of four divider cells into the parent page, and the maximum
5917 ** size of a cell stored within an internal node is always less than 1/4
5918 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
5919 ** enough for all overflow cells.
5920 **
5921 ** If aOvflSpace is set to a null pointer, this function returns
5922 ** SQLITE_NOMEM.
5923 */
5924 static int balance_nonroot(
5925   MemPage *pParent,               /* Parent page of siblings being balanced */
5926   int iParentIdx,                 /* Index of "the page" in pParent */
5927   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
5928   int isRoot                      /* True if pParent is a root-page */
5929 ){
5930   BtShared *pBt;               /* The whole database */
5931   int nCell = 0;               /* Number of cells in apCell[] */
5932   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
5933   int nNew = 0;                /* Number of pages in apNew[] */
5934   int nOld;                    /* Number of pages in apOld[] */
5935   int i, j, k;                 /* Loop counters */
5936   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
5937   int rc = SQLITE_OK;          /* The return code */
5938   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
5939   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
5940   int usableSpace;             /* Bytes in pPage beyond the header */
5941   int pageFlags;               /* Value of pPage->aData[0] */
5942   int subtotal;                /* Subtotal of bytes in cells on one page */
5943   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
5944   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
5945   int szScratch;               /* Size of scratch memory requested */
5946   MemPage *apOld[NB];          /* pPage and up to two siblings */
5947   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
5948   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
5949   u8 *pRight;                  /* Location in parent of right-sibling pointer */
5950   u8 *apDiv[NB-1];             /* Divider cells in pParent */
5951   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
5952   int szNew[NB+2];             /* Combined size of cells place on i-th page */
5953   u8 **apCell = 0;             /* All cells begin balanced */
5954   u16 *szCell;                 /* Local size of all cells in apCell[] */
5955   u8 *aSpace1;                 /* Space for copies of dividers cells */
5956   Pgno pgno;                   /* Temp var to store a page number in */
5957 
5958   pBt = pParent->pBt;
5959   assert( sqlite3_mutex_held(pBt->mutex) );
5960   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5961 
5962 #if 0
5963   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
5964 #endif
5965 
5966   /* At this point pParent may have at most one overflow cell. And if
5967   ** this overflow cell is present, it must be the cell with
5968   ** index iParentIdx. This scenario comes about when this function
5969   ** is called (indirectly) from sqlite3BtreeDelete().
5970   */
5971   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
5972   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
5973 
5974   if( !aOvflSpace ){
5975     return SQLITE_NOMEM;
5976   }
5977 
5978   /* Find the sibling pages to balance. Also locate the cells in pParent
5979   ** that divide the siblings. An attempt is made to find NN siblings on
5980   ** either side of pPage. More siblings are taken from one side, however,
5981   ** if there are fewer than NN siblings on the other side. If pParent
5982   ** has NB or fewer children then all children of pParent are taken.
5983   **
5984   ** This loop also drops the divider cells from the parent page. This
5985   ** way, the remainder of the function does not have to deal with any
5986   ** overflow cells in the parent page, since if any existed they will
5987   ** have already been removed.
5988   */
5989   i = pParent->nOverflow + pParent->nCell;
5990   if( i<2 ){
5991     nxDiv = 0;
5992     nOld = i+1;
5993   }else{
5994     nOld = 3;
5995     if( iParentIdx==0 ){
5996       nxDiv = 0;
5997     }else if( iParentIdx==i ){
5998       nxDiv = i-2;
5999     }else{
6000       nxDiv = iParentIdx-1;
6001     }
6002     i = 2;
6003   }
6004   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
6005     pRight = &pParent->aData[pParent->hdrOffset+8];
6006   }else{
6007     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
6008   }
6009   pgno = get4byte(pRight);
6010   while( 1 ){
6011     rc = getAndInitPage(pBt, pgno, &apOld[i]);
6012     if( rc ){
6013       memset(apOld, 0, (i+1)*sizeof(MemPage*));
6014       goto balance_cleanup;
6015     }
6016     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
6017     if( (i--)==0 ) break;
6018 
6019     if( i+nxDiv==pParent->aiOvfl[0] && pParent->nOverflow ){
6020       apDiv[i] = pParent->apOvfl[0];
6021       pgno = get4byte(apDiv[i]);
6022       szNew[i] = cellSizePtr(pParent, apDiv[i]);
6023       pParent->nOverflow = 0;
6024     }else{
6025       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
6026       pgno = get4byte(apDiv[i]);
6027       szNew[i] = cellSizePtr(pParent, apDiv[i]);
6028 
6029       /* Drop the cell from the parent page. apDiv[i] still points to
6030       ** the cell within the parent, even though it has been dropped.
6031       ** This is safe because dropping a cell only overwrites the first
6032       ** four bytes of it, and this function does not need the first
6033       ** four bytes of the divider cell. So the pointer is safe to use
6034       ** later on.
6035       **
6036       ** But not if we are in secure-delete mode. In secure-delete mode,
6037       ** the dropCell() routine will overwrite the entire cell with zeroes.
6038       ** In this case, temporarily copy the cell into the aOvflSpace[]
6039       ** buffer. It will be copied out again as soon as the aSpace[] buffer
6040       ** is allocated.  */
6041       if( pBt->btsFlags & BTS_SECURE_DELETE ){
6042         int iOff;
6043 
6044         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
6045         if( (iOff+szNew[i])>(int)pBt->usableSize ){
6046           rc = SQLITE_CORRUPT_BKPT;
6047           memset(apOld, 0, (i+1)*sizeof(MemPage*));
6048           goto balance_cleanup;
6049         }else{
6050           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
6051           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
6052         }
6053       }
6054       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
6055     }
6056   }
6057 
6058   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
6059   ** alignment */
6060   nMaxCells = (nMaxCells + 3)&~3;
6061 
6062   /*
6063   ** Allocate space for memory structures
6064   */
6065   k = pBt->pageSize + ROUND8(sizeof(MemPage));
6066   szScratch =
6067        nMaxCells*sizeof(u8*)                       /* apCell */
6068      + nMaxCells*sizeof(u16)                       /* szCell */
6069      + pBt->pageSize                               /* aSpace1 */
6070      + k*nOld;                                     /* Page copies (apCopy) */
6071   apCell = sqlite3ScratchMalloc( szScratch );
6072   if( apCell==0 ){
6073     rc = SQLITE_NOMEM;
6074     goto balance_cleanup;
6075   }
6076   szCell = (u16*)&apCell[nMaxCells];
6077   aSpace1 = (u8*)&szCell[nMaxCells];
6078   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
6079 
6080   /*
6081   ** Load pointers to all cells on sibling pages and the divider cells
6082   ** into the local apCell[] array.  Make copies of the divider cells
6083   ** into space obtained from aSpace1[] and remove the the divider Cells
6084   ** from pParent.
6085   **
6086   ** If the siblings are on leaf pages, then the child pointers of the
6087   ** divider cells are stripped from the cells before they are copied
6088   ** into aSpace1[].  In this way, all cells in apCell[] are without
6089   ** child pointers.  If siblings are not leaves, then all cell in
6090   ** apCell[] include child pointers.  Either way, all cells in apCell[]
6091   ** are alike.
6092   **
6093   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
6094   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
6095   */
6096   leafCorrection = apOld[0]->leaf*4;
6097   leafData = apOld[0]->hasData;
6098   for(i=0; i<nOld; i++){
6099     int limit;
6100 
6101     /* Before doing anything else, take a copy of the i'th original sibling
6102     ** The rest of this function will use data from the copies rather
6103     ** that the original pages since the original pages will be in the
6104     ** process of being overwritten.  */
6105     MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
6106     memcpy(pOld, apOld[i], sizeof(MemPage));
6107     pOld->aData = (void*)&pOld[1];
6108     memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);
6109 
6110     limit = pOld->nCell+pOld->nOverflow;
6111     if( pOld->nOverflow>0 ){
6112       for(j=0; j<limit; j++){
6113         assert( nCell<nMaxCells );
6114         apCell[nCell] = findOverflowCell(pOld, j);
6115         szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
6116         nCell++;
6117       }
6118     }else{
6119       u8 *aData = pOld->aData;
6120       u16 maskPage = pOld->maskPage;
6121       u16 cellOffset = pOld->cellOffset;
6122       for(j=0; j<limit; j++){
6123         assert( nCell<nMaxCells );
6124         apCell[nCell] = findCellv2(aData, maskPage, cellOffset, j);
6125         szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
6126         nCell++;
6127       }
6128     }
6129     if( i<nOld-1 && !leafData){
6130       u16 sz = (u16)szNew[i];
6131       u8 *pTemp;
6132       assert( nCell<nMaxCells );
6133       szCell[nCell] = sz;
6134       pTemp = &aSpace1[iSpace1];
6135       iSpace1 += sz;
6136       assert( sz<=pBt->maxLocal+23 );
6137       assert( iSpace1 <= (int)pBt->pageSize );
6138       memcpy(pTemp, apDiv[i], sz);
6139       apCell[nCell] = pTemp+leafCorrection;
6140       assert( leafCorrection==0 || leafCorrection==4 );
6141       szCell[nCell] = szCell[nCell] - leafCorrection;
6142       if( !pOld->leaf ){
6143         assert( leafCorrection==0 );
6144         assert( pOld->hdrOffset==0 );
6145         /* The right pointer of the child page pOld becomes the left
6146         ** pointer of the divider cell */
6147         memcpy(apCell[nCell], &pOld->aData[8], 4);
6148       }else{
6149         assert( leafCorrection==4 );
6150         if( szCell[nCell]<4 ){
6151           /* Do not allow any cells smaller than 4 bytes. */
6152           szCell[nCell] = 4;
6153         }
6154       }
6155       nCell++;
6156     }
6157   }
6158 
6159   /*
6160   ** Figure out the number of pages needed to hold all nCell cells.
6161   ** Store this number in "k".  Also compute szNew[] which is the total
6162   ** size of all cells on the i-th page and cntNew[] which is the index
6163   ** in apCell[] of the cell that divides page i from page i+1.
6164   ** cntNew[k] should equal nCell.
6165   **
6166   ** Values computed by this block:
6167   **
6168   **           k: The total number of sibling pages
6169   **    szNew[i]: Spaced used on the i-th sibling page.
6170   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
6171   **              the right of the i-th sibling page.
6172   ** usableSpace: Number of bytes of space available on each sibling.
6173   **
6174   */
6175   usableSpace = pBt->usableSize - 12 + leafCorrection;
6176   for(subtotal=k=i=0; i<nCell; i++){
6177     assert( i<nMaxCells );
6178     subtotal += szCell[i] + 2;
6179     if( subtotal > usableSpace ){
6180       szNew[k] = subtotal - szCell[i];
6181       cntNew[k] = i;
6182       if( leafData ){ i--; }
6183       subtotal = 0;
6184       k++;
6185       if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
6186     }
6187   }
6188   szNew[k] = subtotal;
6189   cntNew[k] = nCell;
6190   k++;
6191 
6192   /*
6193   ** The packing computed by the previous block is biased toward the siblings
6194   ** on the left side.  The left siblings are always nearly full, while the
6195   ** right-most sibling might be nearly empty.  This block of code attempts
6196   ** to adjust the packing of siblings to get a better balance.
6197   **
6198   ** This adjustment is more than an optimization.  The packing above might
6199   ** be so out of balance as to be illegal.  For example, the right-most
6200   ** sibling might be completely empty.  This adjustment is not optional.
6201   */
6202   for(i=k-1; i>0; i--){
6203     int szRight = szNew[i];  /* Size of sibling on the right */
6204     int szLeft = szNew[i-1]; /* Size of sibling on the left */
6205     int r;              /* Index of right-most cell in left sibling */
6206     int d;              /* Index of first cell to the left of right sibling */
6207 
6208     r = cntNew[i-1] - 1;
6209     d = r + 1 - leafData;
6210     assert( d<nMaxCells );
6211     assert( r<nMaxCells );
6212     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
6213       szRight += szCell[d] + 2;
6214       szLeft -= szCell[r] + 2;
6215       cntNew[i-1]--;
6216       r = cntNew[i-1] - 1;
6217       d = r + 1 - leafData;
6218     }
6219     szNew[i] = szRight;
6220     szNew[i-1] = szLeft;
6221   }
6222 
6223   /* Either we found one or more cells (cntnew[0])>0) or pPage is
6224   ** a virtual root page.  A virtual root page is when the real root
6225   ** page is page 1 and we are the only child of that page.
6226   **
6227   ** UPDATE:  The assert() below is not necessarily true if the database
6228   ** file is corrupt.  The corruption will be detected and reported later
6229   ** in this procedure so there is no need to act upon it now.
6230   */
6231 #if 0
6232   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
6233 #endif
6234 
6235   TRACE(("BALANCE: old: %d %d %d  ",
6236     apOld[0]->pgno,
6237     nOld>=2 ? apOld[1]->pgno : 0,
6238     nOld>=3 ? apOld[2]->pgno : 0
6239   ));
6240 
6241   /*
6242   ** Allocate k new pages.  Reuse old pages where possible.
6243   */
6244   if( apOld[0]->pgno<=1 ){
6245     rc = SQLITE_CORRUPT_BKPT;
6246     goto balance_cleanup;
6247   }
6248   pageFlags = apOld[0]->aData[0];
6249   for(i=0; i<k; i++){
6250     MemPage *pNew;
6251     if( i<nOld ){
6252       pNew = apNew[i] = apOld[i];
6253       apOld[i] = 0;
6254       rc = sqlite3PagerWrite(pNew->pDbPage);
6255       nNew++;
6256       if( rc ) goto balance_cleanup;
6257     }else{
6258       assert( i>0 );
6259       rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0);
6260       if( rc ) goto balance_cleanup;
6261       apNew[i] = pNew;
6262       nNew++;
6263 
6264       /* Set the pointer-map entry for the new sibling page. */
6265       if( ISAUTOVACUUM ){
6266         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
6267         if( rc!=SQLITE_OK ){
6268           goto balance_cleanup;
6269         }
6270       }
6271     }
6272   }
6273 
6274   /* Free any old pages that were not reused as new pages.
6275   */
6276   while( i<nOld ){
6277     freePage(apOld[i], &rc);
6278     if( rc ) goto balance_cleanup;
6279     releasePage(apOld[i]);
6280     apOld[i] = 0;
6281     i++;
6282   }
6283 
6284   /*
6285   ** Put the new pages in accending order.  This helps to
6286   ** keep entries in the disk file in order so that a scan
6287   ** of the table is a linear scan through the file.  That
6288   ** in turn helps the operating system to deliver pages
6289   ** from the disk more rapidly.
6290   **
6291   ** An O(n^2) insertion sort algorithm is used, but since
6292   ** n is never more than NB (a small constant), that should
6293   ** not be a problem.
6294   **
6295   ** When NB==3, this one optimization makes the database
6296   ** about 25% faster for large insertions and deletions.
6297   */
6298   for(i=0; i<k-1; i++){
6299     int minV = apNew[i]->pgno;
6300     int minI = i;
6301     for(j=i+1; j<k; j++){
6302       if( apNew[j]->pgno<(unsigned)minV ){
6303         minI = j;
6304         minV = apNew[j]->pgno;
6305       }
6306     }
6307     if( minI>i ){
6308       MemPage *pT;
6309       pT = apNew[i];
6310       apNew[i] = apNew[minI];
6311       apNew[minI] = pT;
6312     }
6313   }
6314   TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
6315     apNew[0]->pgno, szNew[0],
6316     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
6317     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
6318     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
6319     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
6320 
6321   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6322   put4byte(pRight, apNew[nNew-1]->pgno);
6323 
6324   /*
6325   ** Evenly distribute the data in apCell[] across the new pages.
6326   ** Insert divider cells into pParent as necessary.
6327   */
6328   j = 0;
6329   for(i=0; i<nNew; i++){
6330     /* Assemble the new sibling page. */
6331     MemPage *pNew = apNew[i];
6332     assert( j<nMaxCells );
6333     zeroPage(pNew, pageFlags);
6334     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
6335     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
6336     assert( pNew->nOverflow==0 );
6337 
6338     j = cntNew[i];
6339 
6340     /* If the sibling page assembled above was not the right-most sibling,
6341     ** insert a divider cell into the parent page.
6342     */
6343     assert( i<nNew-1 || j==nCell );
6344     if( j<nCell ){
6345       u8 *pCell;
6346       u8 *pTemp;
6347       int sz;
6348 
6349       assert( j<nMaxCells );
6350       pCell = apCell[j];
6351       sz = szCell[j] + leafCorrection;
6352       pTemp = &aOvflSpace[iOvflSpace];
6353       if( !pNew->leaf ){
6354         memcpy(&pNew->aData[8], pCell, 4);
6355       }else if( leafData ){
6356         /* If the tree is a leaf-data tree, and the siblings are leaves,
6357         ** then there is no divider cell in apCell[]. Instead, the divider
6358         ** cell consists of the integer key for the right-most cell of
6359         ** the sibling-page assembled above only.
6360         */
6361         CellInfo info;
6362         j--;
6363         btreeParseCellPtr(pNew, apCell[j], &info);
6364         pCell = pTemp;
6365         sz = 4 + putVarint(&pCell[4], info.nKey);
6366         pTemp = 0;
6367       }else{
6368         pCell -= 4;
6369         /* Obscure case for non-leaf-data trees: If the cell at pCell was
6370         ** previously stored on a leaf node, and its reported size was 4
6371         ** bytes, then it may actually be smaller than this
6372         ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
6373         ** any cell). But it is important to pass the correct size to
6374         ** insertCell(), so reparse the cell now.
6375         **
6376         ** Note that this can never happen in an SQLite data file, as all
6377         ** cells are at least 4 bytes. It only happens in b-trees used
6378         ** to evaluate "IN (SELECT ...)" and similar clauses.
6379         */
6380         if( szCell[j]==4 ){
6381           assert(leafCorrection==4);
6382           sz = cellSizePtr(pParent, pCell);
6383         }
6384       }
6385       iOvflSpace += sz;
6386       assert( sz<=pBt->maxLocal+23 );
6387       assert( iOvflSpace <= (int)pBt->pageSize );
6388       insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
6389       if( rc!=SQLITE_OK ) goto balance_cleanup;
6390       assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6391 
6392       j++;
6393       nxDiv++;
6394     }
6395   }
6396   assert( j==nCell );
6397   assert( nOld>0 );
6398   assert( nNew>0 );
6399   if( (pageFlags & PTF_LEAF)==0 ){
6400     u8 *zChild = &apCopy[nOld-1]->aData[8];
6401     memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
6402   }
6403 
6404   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
6405     /* The root page of the b-tree now contains no cells. The only sibling
6406     ** page is the right-child of the parent. Copy the contents of the
6407     ** child page into the parent, decreasing the overall height of the
6408     ** b-tree structure by one. This is described as the "balance-shallower"
6409     ** sub-algorithm in some documentation.
6410     **
6411     ** If this is an auto-vacuum database, the call to copyNodeContent()
6412     ** sets all pointer-map entries corresponding to database image pages
6413     ** for which the pointer is stored within the content being copied.
6414     **
6415     ** The second assert below verifies that the child page is defragmented
6416     ** (it must be, as it was just reconstructed using assemblePage()). This
6417     ** is important if the parent page happens to be page 1 of the database
6418     ** image.  */
6419     assert( nNew==1 );
6420     assert( apNew[0]->nFree ==
6421         (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
6422     );
6423     copyNodeContent(apNew[0], pParent, &rc);
6424     freePage(apNew[0], &rc);
6425   }else if( ISAUTOVACUUM ){
6426     /* Fix the pointer-map entries for all the cells that were shifted around.
6427     ** There are several different types of pointer-map entries that need to
6428     ** be dealt with by this routine. Some of these have been set already, but
6429     ** many have not. The following is a summary:
6430     **
6431     **   1) The entries associated with new sibling pages that were not
6432     **      siblings when this function was called. These have already
6433     **      been set. We don't need to worry about old siblings that were
6434     **      moved to the free-list - the freePage() code has taken care
6435     **      of those.
6436     **
6437     **   2) The pointer-map entries associated with the first overflow
6438     **      page in any overflow chains used by new divider cells. These
6439     **      have also already been taken care of by the insertCell() code.
6440     **
6441     **   3) If the sibling pages are not leaves, then the child pages of
6442     **      cells stored on the sibling pages may need to be updated.
6443     **
6444     **   4) If the sibling pages are not internal intkey nodes, then any
6445     **      overflow pages used by these cells may need to be updated
6446     **      (internal intkey nodes never contain pointers to overflow pages).
6447     **
6448     **   5) If the sibling pages are not leaves, then the pointer-map
6449     **      entries for the right-child pages of each sibling may need
6450     **      to be updated.
6451     **
6452     ** Cases 1 and 2 are dealt with above by other code. The next
6453     ** block deals with cases 3 and 4 and the one after that, case 5. Since
6454     ** setting a pointer map entry is a relatively expensive operation, this
6455     ** code only sets pointer map entries for child or overflow pages that have
6456     ** actually moved between pages.  */
6457     MemPage *pNew = apNew[0];
6458     MemPage *pOld = apCopy[0];
6459     int nOverflow = pOld->nOverflow;
6460     int iNextOld = pOld->nCell + nOverflow;
6461     int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1);
6462     j = 0;                             /* Current 'old' sibling page */
6463     k = 0;                             /* Current 'new' sibling page */
6464     for(i=0; i<nCell; i++){
6465       int isDivider = 0;
6466       while( i==iNextOld ){
6467         /* Cell i is the cell immediately following the last cell on old
6468         ** sibling page j. If the siblings are not leaf pages of an
6469         ** intkey b-tree, then cell i was a divider cell. */
6470         assert( j+1 < ArraySize(apCopy) );
6471         pOld = apCopy[++j];
6472         iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
6473         if( pOld->nOverflow ){
6474           nOverflow = pOld->nOverflow;
6475           iOverflow = i + !leafData + pOld->aiOvfl[0];
6476         }
6477         isDivider = !leafData;
6478       }
6479 
6480       assert(nOverflow>0 || iOverflow<i );
6481       assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1);
6482       assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1);
6483       if( i==iOverflow ){
6484         isDivider = 1;
6485         if( (--nOverflow)>0 ){
6486           iOverflow++;
6487         }
6488       }
6489 
6490       if( i==cntNew[k] ){
6491         /* Cell i is the cell immediately following the last cell on new
6492         ** sibling page k. If the siblings are not leaf pages of an
6493         ** intkey b-tree, then cell i is a divider cell.  */
6494         pNew = apNew[++k];
6495         if( !leafData ) continue;
6496       }
6497       assert( j<nOld );
6498       assert( k<nNew );
6499 
6500       /* If the cell was originally divider cell (and is not now) or
6501       ** an overflow cell, or if the cell was located on a different sibling
6502       ** page before the balancing, then the pointer map entries associated
6503       ** with any child or overflow pages need to be updated.  */
6504       if( isDivider || pOld->pgno!=pNew->pgno ){
6505         if( !leafCorrection ){
6506           ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
6507         }
6508         if( szCell[i]>pNew->minLocal ){
6509           ptrmapPutOvflPtr(pNew, apCell[i], &rc);
6510         }
6511       }
6512     }
6513 
6514     if( !leafCorrection ){
6515       for(i=0; i<nNew; i++){
6516         u32 key = get4byte(&apNew[i]->aData[8]);
6517         ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
6518       }
6519     }
6520 
6521 #if 0
6522     /* The ptrmapCheckPages() contains assert() statements that verify that
6523     ** all pointer map pages are set correctly. This is helpful while
6524     ** debugging. This is usually disabled because a corrupt database may
6525     ** cause an assert() statement to fail.  */
6526     ptrmapCheckPages(apNew, nNew);
6527     ptrmapCheckPages(&pParent, 1);
6528 #endif
6529   }
6530 
6531   assert( pParent->isInit );
6532   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
6533           nOld, nNew, nCell));
6534 
6535   /*
6536   ** Cleanup before returning.
6537   */
6538 balance_cleanup:
6539   sqlite3ScratchFree(apCell);
6540   for(i=0; i<nOld; i++){
6541     releasePage(apOld[i]);
6542   }
6543   for(i=0; i<nNew; i++){
6544     releasePage(apNew[i]);
6545   }
6546 
6547   return rc;
6548 }
6549 
6550 
6551 /*
6552 ** This function is called when the root page of a b-tree structure is
6553 ** overfull (has one or more overflow pages).
6554 **
6555 ** A new child page is allocated and the contents of the current root
6556 ** page, including overflow cells, are copied into the child. The root
6557 ** page is then overwritten to make it an empty page with the right-child
6558 ** pointer pointing to the new page.
6559 **
6560 ** Before returning, all pointer-map entries corresponding to pages
6561 ** that the new child-page now contains pointers to are updated. The
6562 ** entry corresponding to the new right-child pointer of the root
6563 ** page is also updated.
6564 **
6565 ** If successful, *ppChild is set to contain a reference to the child
6566 ** page and SQLITE_OK is returned. In this case the caller is required
6567 ** to call releasePage() on *ppChild exactly once. If an error occurs,
6568 ** an error code is returned and *ppChild is set to 0.
6569 */
6570 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
6571   int rc;                        /* Return value from subprocedures */
6572   MemPage *pChild = 0;           /* Pointer to a new child page */
6573   Pgno pgnoChild = 0;            /* Page number of the new child page */
6574   BtShared *pBt = pRoot->pBt;    /* The BTree */
6575 
6576   assert( pRoot->nOverflow>0 );
6577   assert( sqlite3_mutex_held(pBt->mutex) );
6578 
6579   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
6580   ** page that will become the new right-child of pPage. Copy the contents
6581   ** of the node stored on pRoot into the new child page.
6582   */
6583   rc = sqlite3PagerWrite(pRoot->pDbPage);
6584   if( rc==SQLITE_OK ){
6585     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
6586     copyNodeContent(pRoot, pChild, &rc);
6587     if( ISAUTOVACUUM ){
6588       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
6589     }
6590   }
6591   if( rc ){
6592     *ppChild = 0;
6593     releasePage(pChild);
6594     return rc;
6595   }
6596   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
6597   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6598   assert( pChild->nCell==pRoot->nCell );
6599 
6600   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
6601 
6602   /* Copy the overflow cells from pRoot to pChild */
6603   memcpy(pChild->aiOvfl, pRoot->aiOvfl,
6604          pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
6605   memcpy(pChild->apOvfl, pRoot->apOvfl,
6606          pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
6607   pChild->nOverflow = pRoot->nOverflow;
6608 
6609   /* Zero the contents of pRoot. Then install pChild as the right-child. */
6610   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
6611   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
6612 
6613   *ppChild = pChild;
6614   return SQLITE_OK;
6615 }
6616 
6617 /*
6618 ** The page that pCur currently points to has just been modified in
6619 ** some way. This function figures out if this modification means the
6620 ** tree needs to be balanced, and if so calls the appropriate balancing
6621 ** routine. Balancing routines are:
6622 **
6623 **   balance_quick()
6624 **   balance_deeper()
6625 **   balance_nonroot()
6626 */
6627 static int balance(BtCursor *pCur){
6628   int rc = SQLITE_OK;
6629   const int nMin = pCur->pBt->usableSize * 2 / 3;
6630   u8 aBalanceQuickSpace[13];
6631   u8 *pFree = 0;
6632 
6633   TESTONLY( int balance_quick_called = 0 );
6634   TESTONLY( int balance_deeper_called = 0 );
6635 
6636   do {
6637     int iPage = pCur->iPage;
6638     MemPage *pPage = pCur->apPage[iPage];
6639 
6640     if( iPage==0 ){
6641       if( pPage->nOverflow ){
6642         /* The root page of the b-tree is overfull. In this case call the
6643         ** balance_deeper() function to create a new child for the root-page
6644         ** and copy the current contents of the root-page to it. The
6645         ** next iteration of the do-loop will balance the child page.
6646         */
6647         assert( (balance_deeper_called++)==0 );
6648         rc = balance_deeper(pPage, &pCur->apPage[1]);
6649         if( rc==SQLITE_OK ){
6650           pCur->iPage = 1;
6651           pCur->aiIdx[0] = 0;
6652           pCur->aiIdx[1] = 0;
6653           assert( pCur->apPage[1]->nOverflow );
6654         }
6655       }else{
6656         break;
6657       }
6658     }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
6659       break;
6660     }else{
6661       MemPage * const pParent = pCur->apPage[iPage-1];
6662       int const iIdx = pCur->aiIdx[iPage-1];
6663 
6664       rc = sqlite3PagerWrite(pParent->pDbPage);
6665       if( rc==SQLITE_OK ){
6666 #ifndef SQLITE_OMIT_QUICKBALANCE
6667         if( pPage->hasData
6668          && pPage->nOverflow==1
6669          && pPage->aiOvfl[0]==pPage->nCell
6670          && pParent->pgno!=1
6671          && pParent->nCell==iIdx
6672         ){
6673           /* Call balance_quick() to create a new sibling of pPage on which
6674           ** to store the overflow cell. balance_quick() inserts a new cell
6675           ** into pParent, which may cause pParent overflow. If this
6676           ** happens, the next interation of the do-loop will balance pParent
6677           ** use either balance_nonroot() or balance_deeper(). Until this
6678           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
6679           ** buffer.
6680           **
6681           ** The purpose of the following assert() is to check that only a
6682           ** single call to balance_quick() is made for each call to this
6683           ** function. If this were not verified, a subtle bug involving reuse
6684           ** of the aBalanceQuickSpace[] might sneak in.
6685           */
6686           assert( (balance_quick_called++)==0 );
6687           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
6688         }else
6689 #endif
6690         {
6691           /* In this case, call balance_nonroot() to redistribute cells
6692           ** between pPage and up to 2 of its sibling pages. This involves
6693           ** modifying the contents of pParent, which may cause pParent to
6694           ** become overfull or underfull. The next iteration of the do-loop
6695           ** will balance the parent page to correct this.
6696           **
6697           ** If the parent page becomes overfull, the overflow cell or cells
6698           ** are stored in the pSpace buffer allocated immediately below.
6699           ** A subsequent iteration of the do-loop will deal with this by
6700           ** calling balance_nonroot() (balance_deeper() may be called first,
6701           ** but it doesn't deal with overflow cells - just moves them to a
6702           ** different page). Once this subsequent call to balance_nonroot()
6703           ** has completed, it is safe to release the pSpace buffer used by
6704           ** the previous call, as the overflow cell data will have been
6705           ** copied either into the body of a database page or into the new
6706           ** pSpace buffer passed to the latter call to balance_nonroot().
6707           */
6708           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
6709           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1);
6710           if( pFree ){
6711             /* If pFree is not NULL, it points to the pSpace buffer used
6712             ** by a previous call to balance_nonroot(). Its contents are
6713             ** now stored either on real database pages or within the
6714             ** new pSpace buffer, so it may be safely freed here. */
6715             sqlite3PageFree(pFree);
6716           }
6717 
6718           /* The pSpace buffer will be freed after the next call to
6719           ** balance_nonroot(), or just before this function returns, whichever
6720           ** comes first. */
6721           pFree = pSpace;
6722         }
6723       }
6724 
6725       pPage->nOverflow = 0;
6726 
6727       /* The next iteration of the do-loop balances the parent page. */
6728       releasePage(pPage);
6729       pCur->iPage--;
6730     }
6731   }while( rc==SQLITE_OK );
6732 
6733   if( pFree ){
6734     sqlite3PageFree(pFree);
6735   }
6736   return rc;
6737 }
6738 
6739 
6740 /*
6741 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
6742 ** and the data is given by (pData,nData).  The cursor is used only to
6743 ** define what table the record should be inserted into.  The cursor
6744 ** is left pointing at a random location.
6745 **
6746 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
6747 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
6748 **
6749 ** If the seekResult parameter is non-zero, then a successful call to
6750 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
6751 ** been performed. seekResult is the search result returned (a negative
6752 ** number if pCur points at an entry that is smaller than (pKey, nKey), or
6753 ** a positive value if pCur points at an etry that is larger than
6754 ** (pKey, nKey)).
6755 **
6756 ** If the seekResult parameter is non-zero, then the caller guarantees that
6757 ** cursor pCur is pointing at the existing copy of a row that is to be
6758 ** overwritten.  If the seekResult parameter is 0, then cursor pCur may
6759 ** point to any entry or to no entry at all and so this function has to seek
6760 ** the cursor before the new key can be inserted.
6761 */
6762 int sqlite3BtreeInsert(
6763   BtCursor *pCur,                /* Insert data into the table of this cursor */
6764   const void *pKey, i64 nKey,    /* The key of the new record */
6765   const void *pData, int nData,  /* The data of the new record */
6766   int nZero,                     /* Number of extra 0 bytes to append to data */
6767   int appendBias,                /* True if this is likely an append */
6768   int seekResult                 /* Result of prior MovetoUnpacked() call */
6769 ){
6770   int rc;
6771   int loc = seekResult;          /* -1: before desired location  +1: after */
6772   int szNew = 0;
6773   int idx;
6774   MemPage *pPage;
6775   Btree *p = pCur->pBtree;
6776   BtShared *pBt = p->pBt;
6777   unsigned char *oldCell;
6778   unsigned char *newCell = 0;
6779 
6780   if( pCur->eState==CURSOR_FAULT ){
6781     assert( pCur->skipNext!=SQLITE_OK );
6782     return pCur->skipNext;
6783   }
6784 
6785   assert( cursorHoldsMutex(pCur) );
6786   assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE
6787               && (pBt->btsFlags & BTS_READ_ONLY)==0 );
6788   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6789 
6790   /* Assert that the caller has been consistent. If this cursor was opened
6791   ** expecting an index b-tree, then the caller should be inserting blob
6792   ** keys with no associated data. If the cursor was opened expecting an
6793   ** intkey table, the caller should be inserting integer keys with a
6794   ** blob of associated data.  */
6795   assert( (pKey==0)==(pCur->pKeyInfo==0) );
6796 
6797   /* Save the positions of any other cursors open on this table.
6798   **
6799   ** In some cases, the call to btreeMoveto() below is a no-op. For
6800   ** example, when inserting data into a table with auto-generated integer
6801   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
6802   ** integer key to use. It then calls this function to actually insert the
6803   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
6804   ** that the cursor is already where it needs to be and returns without
6805   ** doing any work. To avoid thwarting these optimizations, it is important
6806   ** not to clear the cursor here.
6807   */
6808   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6809   if( rc ) return rc;
6810 
6811   /* If this is an insert into a table b-tree, invalidate any incrblob
6812   ** cursors open on the row being replaced (assuming this is a replace
6813   ** operation - if it is not, the following is a no-op).  */
6814   if( pCur->pKeyInfo==0 ){
6815     invalidateIncrblobCursors(p, nKey, 0);
6816   }
6817 
6818   if( !loc ){
6819     rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);
6820     if( rc ) return rc;
6821   }
6822   assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
6823 
6824   pPage = pCur->apPage[pCur->iPage];
6825   assert( pPage->intKey || nKey>=0 );
6826   assert( pPage->leaf || !pPage->intKey );
6827 
6828   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
6829           pCur->pgnoRoot, nKey, nData, pPage->pgno,
6830           loc==0 ? "overwrite" : "new entry"));
6831   assert( pPage->isInit );
6832   allocateTempSpace(pBt);
6833   newCell = pBt->pTmpSpace;
6834   if( newCell==0 ) return SQLITE_NOMEM;
6835   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
6836   if( rc ) goto end_insert;
6837   assert( szNew==cellSizePtr(pPage, newCell) );
6838   assert( szNew <= MX_CELL_SIZE(pBt) );
6839   idx = pCur->aiIdx[pCur->iPage];
6840   if( loc==0 ){
6841     u16 szOld;
6842     assert( idx<pPage->nCell );
6843     rc = sqlite3PagerWrite(pPage->pDbPage);
6844     if( rc ){
6845       goto end_insert;
6846     }
6847     oldCell = findCell(pPage, idx);
6848     if( !pPage->leaf ){
6849       memcpy(newCell, oldCell, 4);
6850     }
6851     szOld = cellSizePtr(pPage, oldCell);
6852     rc = clearCell(pPage, oldCell);
6853     dropCell(pPage, idx, szOld, &rc);
6854     if( rc ) goto end_insert;
6855   }else if( loc<0 && pPage->nCell>0 ){
6856     assert( pPage->leaf );
6857     idx = ++pCur->aiIdx[pCur->iPage];
6858   }else{
6859     assert( pPage->leaf );
6860   }
6861   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
6862   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
6863 
6864   /* If no error has occured and pPage has an overflow cell, call balance()
6865   ** to redistribute the cells within the tree. Since balance() may move
6866   ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey
6867   ** variables.
6868   **
6869   ** Previous versions of SQLite called moveToRoot() to move the cursor
6870   ** back to the root page as balance() used to invalidate the contents
6871   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
6872   ** set the cursor state to "invalid". This makes common insert operations
6873   ** slightly faster.
6874   **
6875   ** There is a subtle but important optimization here too. When inserting
6876   ** multiple records into an intkey b-tree using a single cursor (as can
6877   ** happen while processing an "INSERT INTO ... SELECT" statement), it
6878   ** is advantageous to leave the cursor pointing to the last entry in
6879   ** the b-tree if possible. If the cursor is left pointing to the last
6880   ** entry in the table, and the next row inserted has an integer key
6881   ** larger than the largest existing key, it is possible to insert the
6882   ** row without seeking the cursor. This can be a big performance boost.
6883   */
6884   pCur->info.nSize = 0;
6885   pCur->validNKey = 0;
6886   if( rc==SQLITE_OK && pPage->nOverflow ){
6887     rc = balance(pCur);
6888 
6889     /* Must make sure nOverflow is reset to zero even if the balance()
6890     ** fails. Internal data structure corruption will result otherwise.
6891     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
6892     ** from trying to save the current position of the cursor.  */
6893     pCur->apPage[pCur->iPage]->nOverflow = 0;
6894     pCur->eState = CURSOR_INVALID;
6895   }
6896   assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
6897 
6898 end_insert:
6899   return rc;
6900 }
6901 
6902 /*
6903 ** Delete the entry that the cursor is pointing to.  The cursor
6904 ** is left pointing at a arbitrary location.
6905 */
6906 int sqlite3BtreeDelete(BtCursor *pCur){
6907   Btree *p = pCur->pBtree;
6908   BtShared *pBt = p->pBt;
6909   int rc;                              /* Return code */
6910   MemPage *pPage;                      /* Page to delete cell from */
6911   unsigned char *pCell;                /* Pointer to cell to delete */
6912   int iCellIdx;                        /* Index of cell to delete */
6913   int iCellDepth;                      /* Depth of node containing pCell */
6914 
6915   assert( cursorHoldsMutex(pCur) );
6916   assert( pBt->inTransaction==TRANS_WRITE );
6917   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
6918   assert( pCur->wrFlag );
6919   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6920   assert( !hasReadConflicts(p, pCur->pgnoRoot) );
6921 
6922   if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell)
6923    || NEVER(pCur->eState!=CURSOR_VALID)
6924   ){
6925     return SQLITE_ERROR;  /* Something has gone awry. */
6926   }
6927 
6928   iCellDepth = pCur->iPage;
6929   iCellIdx = pCur->aiIdx[iCellDepth];
6930   pPage = pCur->apPage[iCellDepth];
6931   pCell = findCell(pPage, iCellIdx);
6932 
6933   /* If the page containing the entry to delete is not a leaf page, move
6934   ** the cursor to the largest entry in the tree that is smaller than
6935   ** the entry being deleted. This cell will replace the cell being deleted
6936   ** from the internal node. The 'previous' entry is used for this instead
6937   ** of the 'next' entry, as the previous entry is always a part of the
6938   ** sub-tree headed by the child page of the cell being deleted. This makes
6939   ** balancing the tree following the delete operation easier.  */
6940   if( !pPage->leaf ){
6941     int notUsed;
6942     rc = sqlite3BtreePrevious(pCur, &notUsed);
6943     if( rc ) return rc;
6944   }
6945 
6946   /* Save the positions of any other cursors open on this table before
6947   ** making any modifications. Make the page containing the entry to be
6948   ** deleted writable. Then free any overflow pages associated with the
6949   ** entry and finally remove the cell itself from within the page.
6950   */
6951   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6952   if( rc ) return rc;
6953 
6954   /* If this is a delete operation to remove a row from a table b-tree,
6955   ** invalidate any incrblob cursors open on the row being deleted.  */
6956   if( pCur->pKeyInfo==0 ){
6957     invalidateIncrblobCursors(p, pCur->info.nKey, 0);
6958   }
6959 
6960   rc = sqlite3PagerWrite(pPage->pDbPage);
6961   if( rc ) return rc;
6962   rc = clearCell(pPage, pCell);
6963   dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc);
6964   if( rc ) return rc;
6965 
6966   /* If the cell deleted was not located on a leaf page, then the cursor
6967   ** is currently pointing to the largest entry in the sub-tree headed
6968   ** by the child-page of the cell that was just deleted from an internal
6969   ** node. The cell from the leaf node needs to be moved to the internal
6970   ** node to replace the deleted cell.  */
6971   if( !pPage->leaf ){
6972     MemPage *pLeaf = pCur->apPage[pCur->iPage];
6973     int nCell;
6974     Pgno n = pCur->apPage[iCellDepth+1]->pgno;
6975     unsigned char *pTmp;
6976 
6977     pCell = findCell(pLeaf, pLeaf->nCell-1);
6978     nCell = cellSizePtr(pLeaf, pCell);
6979     assert( MX_CELL_SIZE(pBt) >= nCell );
6980 
6981     allocateTempSpace(pBt);
6982     pTmp = pBt->pTmpSpace;
6983 
6984     rc = sqlite3PagerWrite(pLeaf->pDbPage);
6985     insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
6986     dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
6987     if( rc ) return rc;
6988   }
6989 
6990   /* Balance the tree. If the entry deleted was located on a leaf page,
6991   ** then the cursor still points to that page. In this case the first
6992   ** call to balance() repairs the tree, and the if(...) condition is
6993   ** never true.
6994   **
6995   ** Otherwise, if the entry deleted was on an internal node page, then
6996   ** pCur is pointing to the leaf page from which a cell was removed to
6997   ** replace the cell deleted from the internal node. This is slightly
6998   ** tricky as the leaf node may be underfull, and the internal node may
6999   ** be either under or overfull. In this case run the balancing algorithm
7000   ** on the leaf node first. If the balance proceeds far enough up the
7001   ** tree that we can be sure that any problem in the internal node has
7002   ** been corrected, so be it. Otherwise, after balancing the leaf node,
7003   ** walk the cursor up the tree to the internal node and balance it as
7004   ** well.  */
7005   rc = balance(pCur);
7006   if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
7007     while( pCur->iPage>iCellDepth ){
7008       releasePage(pCur->apPage[pCur->iPage--]);
7009     }
7010     rc = balance(pCur);
7011   }
7012 
7013   if( rc==SQLITE_OK ){
7014     moveToRoot(pCur);
7015   }
7016   return rc;
7017 }
7018 
7019 /*
7020 ** Create a new BTree table.  Write into *piTable the page
7021 ** number for the root page of the new table.
7022 **
7023 ** The type of type is determined by the flags parameter.  Only the
7024 ** following values of flags are currently in use.  Other values for
7025 ** flags might not work:
7026 **
7027 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
7028 **     BTREE_ZERODATA                  Used for SQL indices
7029 */
7030 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
7031   BtShared *pBt = p->pBt;
7032   MemPage *pRoot;
7033   Pgno pgnoRoot;
7034   int rc;
7035   int ptfFlags;          /* Page-type flage for the root page of new table */
7036 
7037   assert( sqlite3BtreeHoldsMutex(p) );
7038   assert( pBt->inTransaction==TRANS_WRITE );
7039   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
7040 
7041 #ifdef SQLITE_OMIT_AUTOVACUUM
7042   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
7043   if( rc ){
7044     return rc;
7045   }
7046 #else
7047   if( pBt->autoVacuum ){
7048     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
7049     MemPage *pPageMove; /* The page to move to. */
7050 
7051     /* Creating a new table may probably require moving an existing database
7052     ** to make room for the new tables root page. In case this page turns
7053     ** out to be an overflow page, delete all overflow page-map caches
7054     ** held by open cursors.
7055     */
7056     invalidateAllOverflowCache(pBt);
7057 
7058     /* Read the value of meta[3] from the database to determine where the
7059     ** root page of the new table should go. meta[3] is the largest root-page
7060     ** created so far, so the new root-page is (meta[3]+1).
7061     */
7062     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
7063     pgnoRoot++;
7064 
7065     /* The new root-page may not be allocated on a pointer-map page, or the
7066     ** PENDING_BYTE page.
7067     */
7068     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
7069         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
7070       pgnoRoot++;
7071     }
7072     assert( pgnoRoot>=3 );
7073 
7074     /* Allocate a page. The page that currently resides at pgnoRoot will
7075     ** be moved to the allocated page (unless the allocated page happens
7076     ** to reside at pgnoRoot).
7077     */
7078     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
7079     if( rc!=SQLITE_OK ){
7080       return rc;
7081     }
7082 
7083     if( pgnoMove!=pgnoRoot ){
7084       /* pgnoRoot is the page that will be used for the root-page of
7085       ** the new table (assuming an error did not occur). But we were
7086       ** allocated pgnoMove. If required (i.e. if it was not allocated
7087       ** by extending the file), the current page at position pgnoMove
7088       ** is already journaled.
7089       */
7090       u8 eType = 0;
7091       Pgno iPtrPage = 0;
7092 
7093       releasePage(pPageMove);
7094 
7095       /* Move the page currently at pgnoRoot to pgnoMove. */
7096       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
7097       if( rc!=SQLITE_OK ){
7098         return rc;
7099       }
7100       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
7101       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
7102         rc = SQLITE_CORRUPT_BKPT;
7103       }
7104       if( rc!=SQLITE_OK ){
7105         releasePage(pRoot);
7106         return rc;
7107       }
7108       assert( eType!=PTRMAP_ROOTPAGE );
7109       assert( eType!=PTRMAP_FREEPAGE );
7110       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
7111       releasePage(pRoot);
7112 
7113       /* Obtain the page at pgnoRoot */
7114       if( rc!=SQLITE_OK ){
7115         return rc;
7116       }
7117       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
7118       if( rc!=SQLITE_OK ){
7119         return rc;
7120       }
7121       rc = sqlite3PagerWrite(pRoot->pDbPage);
7122       if( rc!=SQLITE_OK ){
7123         releasePage(pRoot);
7124         return rc;
7125       }
7126     }else{
7127       pRoot = pPageMove;
7128     }
7129 
7130     /* Update the pointer-map and meta-data with the new root-page number. */
7131     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
7132     if( rc ){
7133       releasePage(pRoot);
7134       return rc;
7135     }
7136 
7137     /* When the new root page was allocated, page 1 was made writable in
7138     ** order either to increase the database filesize, or to decrement the
7139     ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
7140     */
7141     assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
7142     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
7143     if( NEVER(rc) ){
7144       releasePage(pRoot);
7145       return rc;
7146     }
7147 
7148   }else{
7149     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
7150     if( rc ) return rc;
7151   }
7152 #endif
7153   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
7154   if( createTabFlags & BTREE_INTKEY ){
7155     ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
7156   }else{
7157     ptfFlags = PTF_ZERODATA | PTF_LEAF;
7158   }
7159   zeroPage(pRoot, ptfFlags);
7160   sqlite3PagerUnref(pRoot->pDbPage);
7161   assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
7162   *piTable = (int)pgnoRoot;
7163   return SQLITE_OK;
7164 }
7165 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
7166   int rc;
7167   sqlite3BtreeEnter(p);
7168   rc = btreeCreateTable(p, piTable, flags);
7169   sqlite3BtreeLeave(p);
7170   return rc;
7171 }
7172 
7173 /*
7174 ** Erase the given database page and all its children.  Return
7175 ** the page to the freelist.
7176 */
7177 static int clearDatabasePage(
7178   BtShared *pBt,           /* The BTree that contains the table */
7179   Pgno pgno,               /* Page number to clear */
7180   int freePageFlag,        /* Deallocate page if true */
7181   int *pnChange            /* Add number of Cells freed to this counter */
7182 ){
7183   MemPage *pPage;
7184   int rc;
7185   unsigned char *pCell;
7186   int i;
7187 
7188   assert( sqlite3_mutex_held(pBt->mutex) );
7189   if( pgno>btreePagecount(pBt) ){
7190     return SQLITE_CORRUPT_BKPT;
7191   }
7192 
7193   rc = getAndInitPage(pBt, pgno, &pPage);
7194   if( rc ) return rc;
7195   for(i=0; i<pPage->nCell; i++){
7196     pCell = findCell(pPage, i);
7197     if( !pPage->leaf ){
7198       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
7199       if( rc ) goto cleardatabasepage_out;
7200     }
7201     rc = clearCell(pPage, pCell);
7202     if( rc ) goto cleardatabasepage_out;
7203   }
7204   if( !pPage->leaf ){
7205     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange);
7206     if( rc ) goto cleardatabasepage_out;
7207   }else if( pnChange ){
7208     assert( pPage->intKey );
7209     *pnChange += pPage->nCell;
7210   }
7211   if( freePageFlag ){
7212     freePage(pPage, &rc);
7213   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
7214     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
7215   }
7216 
7217 cleardatabasepage_out:
7218   releasePage(pPage);
7219   return rc;
7220 }
7221 
7222 /*
7223 ** Delete all information from a single table in the database.  iTable is
7224 ** the page number of the root of the table.  After this routine returns,
7225 ** the root page is empty, but still exists.
7226 **
7227 ** This routine will fail with SQLITE_LOCKED if there are any open
7228 ** read cursors on the table.  Open write cursors are moved to the
7229 ** root of the table.
7230 **
7231 ** If pnChange is not NULL, then table iTable must be an intkey table. The
7232 ** integer value pointed to by pnChange is incremented by the number of
7233 ** entries in the table.
7234 */
7235 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
7236   int rc;
7237   BtShared *pBt = p->pBt;
7238   sqlite3BtreeEnter(p);
7239   assert( p->inTrans==TRANS_WRITE );
7240 
7241   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
7242 
7243   if( SQLITE_OK==rc ){
7244     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
7245     ** is the root of a table b-tree - if it is not, the following call is
7246     ** a no-op).  */
7247     invalidateIncrblobCursors(p, 0, 1);
7248     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
7249   }
7250   sqlite3BtreeLeave(p);
7251   return rc;
7252 }
7253 
7254 /*
7255 ** Erase all information in a table and add the root of the table to
7256 ** the freelist.  Except, the root of the principle table (the one on
7257 ** page 1) is never added to the freelist.
7258 **
7259 ** This routine will fail with SQLITE_LOCKED if there are any open
7260 ** cursors on the table.
7261 **
7262 ** If AUTOVACUUM is enabled and the page at iTable is not the last
7263 ** root page in the database file, then the last root page
7264 ** in the database file is moved into the slot formerly occupied by
7265 ** iTable and that last slot formerly occupied by the last root page
7266 ** is added to the freelist instead of iTable.  In this say, all
7267 ** root pages are kept at the beginning of the database file, which
7268 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
7269 ** page number that used to be the last root page in the file before
7270 ** the move.  If no page gets moved, *piMoved is set to 0.
7271 ** The last root page is recorded in meta[3] and the value of
7272 ** meta[3] is updated by this procedure.
7273 */
7274 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
7275   int rc;
7276   MemPage *pPage = 0;
7277   BtShared *pBt = p->pBt;
7278 
7279   assert( sqlite3BtreeHoldsMutex(p) );
7280   assert( p->inTrans==TRANS_WRITE );
7281 
7282   /* It is illegal to drop a table if any cursors are open on the
7283   ** database. This is because in auto-vacuum mode the backend may
7284   ** need to move another root-page to fill a gap left by the deleted
7285   ** root page. If an open cursor was using this page a problem would
7286   ** occur.
7287   **
7288   ** This error is caught long before control reaches this point.
7289   */
7290   if( NEVER(pBt->pCursor) ){
7291     sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
7292     return SQLITE_LOCKED_SHAREDCACHE;
7293   }
7294 
7295   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
7296   if( rc ) return rc;
7297   rc = sqlite3BtreeClearTable(p, iTable, 0);
7298   if( rc ){
7299     releasePage(pPage);
7300     return rc;
7301   }
7302 
7303   *piMoved = 0;
7304 
7305   if( iTable>1 ){
7306 #ifdef SQLITE_OMIT_AUTOVACUUM
7307     freePage(pPage, &rc);
7308     releasePage(pPage);
7309 #else
7310     if( pBt->autoVacuum ){
7311       Pgno maxRootPgno;
7312       sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
7313 
7314       if( iTable==maxRootPgno ){
7315         /* If the table being dropped is the table with the largest root-page
7316         ** number in the database, put the root page on the free list.
7317         */
7318         freePage(pPage, &rc);
7319         releasePage(pPage);
7320         if( rc!=SQLITE_OK ){
7321           return rc;
7322         }
7323       }else{
7324         /* The table being dropped does not have the largest root-page
7325         ** number in the database. So move the page that does into the
7326         ** gap left by the deleted root-page.
7327         */
7328         MemPage *pMove;
7329         releasePage(pPage);
7330         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
7331         if( rc!=SQLITE_OK ){
7332           return rc;
7333         }
7334         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
7335         releasePage(pMove);
7336         if( rc!=SQLITE_OK ){
7337           return rc;
7338         }
7339         pMove = 0;
7340         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
7341         freePage(pMove, &rc);
7342         releasePage(pMove);
7343         if( rc!=SQLITE_OK ){
7344           return rc;
7345         }
7346         *piMoved = maxRootPgno;
7347       }
7348 
7349       /* Set the new 'max-root-page' value in the database header. This
7350       ** is the old value less one, less one more if that happens to
7351       ** be a root-page number, less one again if that is the
7352       ** PENDING_BYTE_PAGE.
7353       */
7354       maxRootPgno--;
7355       while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
7356              || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
7357         maxRootPgno--;
7358       }
7359       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
7360 
7361       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
7362     }else{
7363       freePage(pPage, &rc);
7364       releasePage(pPage);
7365     }
7366 #endif
7367   }else{
7368     /* If sqlite3BtreeDropTable was called on page 1.
7369     ** This really never should happen except in a corrupt
7370     ** database.
7371     */
7372     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
7373     releasePage(pPage);
7374   }
7375   return rc;
7376 }
7377 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
7378   int rc;
7379   sqlite3BtreeEnter(p);
7380   rc = btreeDropTable(p, iTable, piMoved);
7381   sqlite3BtreeLeave(p);
7382   return rc;
7383 }
7384 
7385 
7386 /*
7387 ** This function may only be called if the b-tree connection already
7388 ** has a read or write transaction open on the database.
7389 **
7390 ** Read the meta-information out of a database file.  Meta[0]
7391 ** is the number of free pages currently in the database.  Meta[1]
7392 ** through meta[15] are available for use by higher layers.  Meta[0]
7393 ** is read-only, the others are read/write.
7394 **
7395 ** The schema layer numbers meta values differently.  At the schema
7396 ** layer (and the SetCookie and ReadCookie opcodes) the number of
7397 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
7398 */
7399 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
7400   BtShared *pBt = p->pBt;
7401 
7402   sqlite3BtreeEnter(p);
7403   assert( p->inTrans>TRANS_NONE );
7404   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
7405   assert( pBt->pPage1 );
7406   assert( idx>=0 && idx<=15 );
7407 
7408   *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
7409 
7410   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
7411   ** database, mark the database as read-only.  */
7412 #ifdef SQLITE_OMIT_AUTOVACUUM
7413   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
7414     pBt->btsFlags |= BTS_READ_ONLY;
7415   }
7416 #endif
7417 
7418   sqlite3BtreeLeave(p);
7419 }
7420 
7421 /*
7422 ** Write meta-information back into the database.  Meta[0] is
7423 ** read-only and may not be written.
7424 */
7425 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
7426   BtShared *pBt = p->pBt;
7427   unsigned char *pP1;
7428   int rc;
7429   assert( idx>=1 && idx<=15 );
7430   sqlite3BtreeEnter(p);
7431   assert( p->inTrans==TRANS_WRITE );
7432   assert( pBt->pPage1!=0 );
7433   pP1 = pBt->pPage1->aData;
7434   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
7435   if( rc==SQLITE_OK ){
7436     put4byte(&pP1[36 + idx*4], iMeta);
7437 #ifndef SQLITE_OMIT_AUTOVACUUM
7438     if( idx==BTREE_INCR_VACUUM ){
7439       assert( pBt->autoVacuum || iMeta==0 );
7440       assert( iMeta==0 || iMeta==1 );
7441       pBt->incrVacuum = (u8)iMeta;
7442     }
7443 #endif
7444   }
7445   sqlite3BtreeLeave(p);
7446   return rc;
7447 }
7448 
7449 #ifndef SQLITE_OMIT_BTREECOUNT
7450 /*
7451 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
7452 ** number of entries in the b-tree and write the result to *pnEntry.
7453 **
7454 ** SQLITE_OK is returned if the operation is successfully executed.
7455 ** Otherwise, if an error is encountered (i.e. an IO error or database
7456 ** corruption) an SQLite error code is returned.
7457 */
7458 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
7459   i64 nEntry = 0;                      /* Value to return in *pnEntry */
7460   int rc;                              /* Return code */
7461 
7462   if( pCur->pgnoRoot==0 ){
7463     *pnEntry = 0;
7464     return SQLITE_OK;
7465   }
7466   rc = moveToRoot(pCur);
7467 
7468   /* Unless an error occurs, the following loop runs one iteration for each
7469   ** page in the B-Tree structure (not including overflow pages).
7470   */
7471   while( rc==SQLITE_OK ){
7472     int iIdx;                          /* Index of child node in parent */
7473     MemPage *pPage;                    /* Current page of the b-tree */
7474 
7475     /* If this is a leaf page or the tree is not an int-key tree, then
7476     ** this page contains countable entries. Increment the entry counter
7477     ** accordingly.
7478     */
7479     pPage = pCur->apPage[pCur->iPage];
7480     if( pPage->leaf || !pPage->intKey ){
7481       nEntry += pPage->nCell;
7482     }
7483 
7484     /* pPage is a leaf node. This loop navigates the cursor so that it
7485     ** points to the first interior cell that it points to the parent of
7486     ** the next page in the tree that has not yet been visited. The
7487     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
7488     ** of the page, or to the number of cells in the page if the next page
7489     ** to visit is the right-child of its parent.
7490     **
7491     ** If all pages in the tree have been visited, return SQLITE_OK to the
7492     ** caller.
7493     */
7494     if( pPage->leaf ){
7495       do {
7496         if( pCur->iPage==0 ){
7497           /* All pages of the b-tree have been visited. Return successfully. */
7498           *pnEntry = nEntry;
7499           return SQLITE_OK;
7500         }
7501         moveToParent(pCur);
7502       }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
7503 
7504       pCur->aiIdx[pCur->iPage]++;
7505       pPage = pCur->apPage[pCur->iPage];
7506     }
7507 
7508     /* Descend to the child node of the cell that the cursor currently
7509     ** points at. This is the right-child if (iIdx==pPage->nCell).
7510     */
7511     iIdx = pCur->aiIdx[pCur->iPage];
7512     if( iIdx==pPage->nCell ){
7513       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
7514     }else{
7515       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
7516     }
7517   }
7518 
7519   /* An error has occurred. Return an error code. */
7520   return rc;
7521 }
7522 #endif
7523 
7524 /*
7525 ** Return the pager associated with a BTree.  This routine is used for
7526 ** testing and debugging only.
7527 */
7528 Pager *sqlite3BtreePager(Btree *p){
7529   return p->pBt->pPager;
7530 }
7531 
7532 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7533 /*
7534 ** Append a message to the error message string.
7535 */
7536 static void checkAppendMsg(
7537   IntegrityCk *pCheck,
7538   char *zMsg1,
7539   const char *zFormat,
7540   ...
7541 ){
7542   va_list ap;
7543   if( !pCheck->mxErr ) return;
7544   pCheck->mxErr--;
7545   pCheck->nErr++;
7546   va_start(ap, zFormat);
7547   if( pCheck->errMsg.nChar ){
7548     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
7549   }
7550   if( zMsg1 ){
7551     sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
7552   }
7553   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
7554   va_end(ap);
7555   if( pCheck->errMsg.mallocFailed ){
7556     pCheck->mallocFailed = 1;
7557   }
7558 }
7559 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7560 
7561 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7562 
7563 /*
7564 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
7565 ** corresponds to page iPg is already set.
7566 */
7567 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
7568   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
7569   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
7570 }
7571 
7572 /*
7573 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
7574 */
7575 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
7576   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
7577   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
7578 }
7579 
7580 
7581 /*
7582 ** Add 1 to the reference count for page iPage.  If this is the second
7583 ** reference to the page, add an error message to pCheck->zErrMsg.
7584 ** Return 1 if there are 2 ore more references to the page and 0 if
7585 ** if this is the first reference to the page.
7586 **
7587 ** Also check that the page number is in bounds.
7588 */
7589 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
7590   if( iPage==0 ) return 1;
7591   if( iPage>pCheck->nPage ){
7592     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
7593     return 1;
7594   }
7595   if( getPageReferenced(pCheck, iPage) ){
7596     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
7597     return 1;
7598   }
7599   setPageReferenced(pCheck, iPage);
7600   return 0;
7601 }
7602 
7603 #ifndef SQLITE_OMIT_AUTOVACUUM
7604 /*
7605 ** Check that the entry in the pointer-map for page iChild maps to
7606 ** page iParent, pointer type ptrType. If not, append an error message
7607 ** to pCheck.
7608 */
7609 static void checkPtrmap(
7610   IntegrityCk *pCheck,   /* Integrity check context */
7611   Pgno iChild,           /* Child page number */
7612   u8 eType,              /* Expected pointer map type */
7613   Pgno iParent,          /* Expected pointer map parent page number */
7614   char *zContext         /* Context description (used for error msg) */
7615 ){
7616   int rc;
7617   u8 ePtrmapType;
7618   Pgno iPtrmapParent;
7619 
7620   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
7621   if( rc!=SQLITE_OK ){
7622     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
7623     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
7624     return;
7625   }
7626 
7627   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
7628     checkAppendMsg(pCheck, zContext,
7629       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
7630       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
7631   }
7632 }
7633 #endif
7634 
7635 /*
7636 ** Check the integrity of the freelist or of an overflow page list.
7637 ** Verify that the number of pages on the list is N.
7638 */
7639 static void checkList(
7640   IntegrityCk *pCheck,  /* Integrity checking context */
7641   int isFreeList,       /* True for a freelist.  False for overflow page list */
7642   int iPage,            /* Page number for first page in the list */
7643   int N,                /* Expected number of pages in the list */
7644   char *zContext        /* Context for error messages */
7645 ){
7646   int i;
7647   int expected = N;
7648   int iFirst = iPage;
7649   while( N-- > 0 && pCheck->mxErr ){
7650     DbPage *pOvflPage;
7651     unsigned char *pOvflData;
7652     if( iPage<1 ){
7653       checkAppendMsg(pCheck, zContext,
7654          "%d of %d pages missing from overflow list starting at %d",
7655           N+1, expected, iFirst);
7656       break;
7657     }
7658     if( checkRef(pCheck, iPage, zContext) ) break;
7659     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
7660       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
7661       break;
7662     }
7663     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
7664     if( isFreeList ){
7665       int n = get4byte(&pOvflData[4]);
7666 #ifndef SQLITE_OMIT_AUTOVACUUM
7667       if( pCheck->pBt->autoVacuum ){
7668         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
7669       }
7670 #endif
7671       if( n>(int)pCheck->pBt->usableSize/4-2 ){
7672         checkAppendMsg(pCheck, zContext,
7673            "freelist leaf count too big on page %d", iPage);
7674         N--;
7675       }else{
7676         for(i=0; i<n; i++){
7677           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
7678 #ifndef SQLITE_OMIT_AUTOVACUUM
7679           if( pCheck->pBt->autoVacuum ){
7680             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
7681           }
7682 #endif
7683           checkRef(pCheck, iFreePage, zContext);
7684         }
7685         N -= n;
7686       }
7687     }
7688 #ifndef SQLITE_OMIT_AUTOVACUUM
7689     else{
7690       /* If this database supports auto-vacuum and iPage is not the last
7691       ** page in this overflow list, check that the pointer-map entry for
7692       ** the following page matches iPage.
7693       */
7694       if( pCheck->pBt->autoVacuum && N>0 ){
7695         i = get4byte(pOvflData);
7696         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
7697       }
7698     }
7699 #endif
7700     iPage = get4byte(pOvflData);
7701     sqlite3PagerUnref(pOvflPage);
7702   }
7703 }
7704 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7705 
7706 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** Do various sanity checks on a single page of a tree and, recursively,
** on every page beneath it.
**
** Return value: 0 if iPage is zero, has already been referenced, cannot
** be fetched, or fails btreeInitPage().  Otherwise the return value is
** one more than the value returned for the last left-child checked, so a
** leaf page returns 1.
**
** Checks performed by the code below:
**
**   *  The page number is in range and not referenced before (checkRef).
**   *  btreeInitPage() accepts the page.
**   *  For int-key pages, rowids within the page are strictly increasing,
**      and the min/max rowids of int-key leaf pages are consistent with
**      the bounds passed in through pnParentMinKey/pnParentMaxKey.
**   *  Overflow page chains of cells have the expected length (checkList).
**   *  In auto-vacuum databases, pointer-map entries of child and overflow
**      pages are correct (checkPtrmap).
**   *  All children are checked recursively, and the value returned for
**      each left-child after the first matches that of the previous
**      left-child.  The value returned for the right-child is not compared.
**   *  Cells and freeblocks do not overlap, and the number of bytes in
**      the cell content area covered by neither matches the fragment
**      count stored at offset hdr+7 of the page header.
**
** Errors are appended to pCheck via checkAppendMsg(); zParentContext is
** the message prefix used when the page reference itself is rejected.
*/
static int checkTreePage(
  IntegrityCk *pCheck,  /* Context for the sanity check */
  int iPage,            /* Page number of the page to check */
  char *zParentContext, /* Parent context */
  i64 *pnParentMinKey,  /* Lower rowid bound from parent, or NULL */
  i64 *pnParentMaxKey   /* Upper rowid bound from parent, or NULL */
){
  MemPage *pPage;
  int i, rc, depth, d2, pgno, cnt;
  int hdr, cellStart;
  int nCell;
  u8 *data;
  BtShared *pBt;
  int usableSize;
  char zContext[100];
  char *hit = 0;
  i64 nMinKey = 0;
  i64 nMaxKey = 0;

  sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);

  /* Check that the page exists
  */
  pBt = pCheck->pBt;
  usableSize = pBt->usableSize;
  if( iPage==0 ) return 0;
  if( checkRef(pCheck, iPage, zParentContext) ) return 0;
  if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
    checkAppendMsg(pCheck, zContext,
       "unable to get the page. error code=%d", rc);
    return 0;
  }

  /* Clear MemPage.isInit to make sure the corruption detection code in
  ** btreeInitPage() is executed.  */
  pPage->isInit = 0;
  if( (rc = btreeInitPage(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
    checkAppendMsg(pCheck, zContext,
                   "btreeInitPage() returns error code %d", rc);
    releasePage(pPage);
    return 0;
  }

  /* Check out all the cells.  The loop stops early once the error budget
  ** (pCheck->mxErr) has been used up.
  */
  depth = 0;
  for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
    u8 *pCell;
    u32 sz;
    CellInfo info;

    /* Check payload overflow pages
    */
    sqlite3_snprintf(sizeof(zContext), zContext,
             "On tree page %d cell %d: ", iPage, i);
    pCell = findCell(pPage,i);
    btreeParseCellPtr(pPage, pCell, &info);
    sz = info.nData;
    if( !pPage->intKey ) sz += (int)info.nKey;
    /* For intKey pages, check that the keys are in order.
    ** nMinKey is the rowid of the first cell; nMaxKey tracks the rowid of
    ** the most recent cell.
    */
    else if( i==0 ) nMinKey = nMaxKey = info.nKey;
    else{
      if( info.nKey <= nMaxKey ){
        checkAppendMsg(pCheck, zContext,
            "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey);
      }
      nMaxKey = info.nKey;
    }
    assert( sz==info.nPayload );
    /* The cell spills onto overflow pages, and the overflow page number
    ** lies within the usable part of the page: verify the chain. */
    if( (sz>info.nLocal)
     && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
    ){
      int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
      Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
      }
#endif
      checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
    }

    /* Check sanity of left child page.
    */
    if( !pPage->leaf ){
      pgno = get4byte(pCell);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
      }
#endif
      /* The first child gets no upper bound (NULL max pointer). */
      d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey);
      if( i>0 && d2!=depth ){
        checkAppendMsg(pCheck, zContext, "Child page depth differs");
      }
      depth = d2;
    }
  }

  /* Interior pages also have a right-child pointer in the page header.
  ** Its return value is not compared against the other children. */
  if( !pPage->leaf ){
    pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
    sqlite3_snprintf(sizeof(zContext), zContext,
                     "On page %d at right child: ", iPage);
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum ){
      checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
    }
#endif
    checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey);
  }

  /* For intKey leaf pages, check that the min/max keys are in order
  ** with any left/parent/right pages.
  */
  if( pPage->leaf && pPage->intKey ){
    /* if we are a left child page */
    if( pnParentMinKey ){
      /* if we are the left most child page */
      if( !pnParentMaxKey ){
        if( nMaxKey > *pnParentMinKey ){
          checkAppendMsg(pCheck, zContext,
              "Rowid %lld out of order (max larger than parent min of %lld)",
              nMaxKey, *pnParentMinKey);
        }
      }else{
        if( nMinKey <= *pnParentMinKey ){
          checkAppendMsg(pCheck, zContext,
              "Rowid %lld out of order (min less than parent min of %lld)",
              nMinKey, *pnParentMinKey);
        }
        if( nMaxKey > *pnParentMaxKey ){
          checkAppendMsg(pCheck, zContext,
              "Rowid %lld out of order (max larger than parent max of %lld)",
              nMaxKey, *pnParentMaxKey);
        }
        /* Hand this page's largest rowid back to the caller through the
        ** min-key pointer. */
        *pnParentMinKey = nMaxKey;
      }
    /* else if we're a right child page */
    } else if( pnParentMaxKey ){
      if( nMinKey <= *pnParentMaxKey ){
        checkAppendMsg(pCheck, zContext,
            "Rowid %lld out of order (min less than parent max of %lld)",
            nMinKey, *pnParentMaxKey);
      }
    }
  }

  /* Check for complete coverage of the page.
  **
  ** hit[] holds one counter per byte of the page.  Bytes before the start
  ** of the cell content area are preset to 1 and the rest to 0.  Every
  ** cell and every freeblock then increments the counters of the bytes it
  ** occupies.  Afterwards a counter of 0 marks a byte covered by nothing
  ** and a counter above 1 marks a byte claimed more than once.
  */
  data = pPage->aData;
  hdr = pPage->hdrOffset;
  hit = sqlite3PageMalloc( pBt->pageSize );
  if( hit==0 ){
    pCheck->mallocFailed = 1;
  }else{
    int contentOffset = get2byteNotZero(&data[hdr+5]);
    assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
    memset(hit+contentOffset, 0, usableSize-contentOffset);
    memset(hit, 1, contentOffset);
    nCell = get2byte(&data[hdr+3]);
    cellStart = hdr + 12 - 4*pPage->leaf;
    /* Mark the bytes used by each cell. */
    for(i=0; i<nCell; i++){
      int pc = get2byte(&data[cellStart+i*2]);
      u32 size = 65536;   /* Deliberately too large unless pc is plausible */
      int j;
      if( pc<=usableSize-4 ){
        size = cellSizePtr(pPage, &data[pc]);
      }
      if( (int)(pc+size-1)>=usableSize ){
        checkAppendMsg(pCheck, 0,
            "Corruption detected in cell %d on page %d",i,iPage);
      }else{
        for(j=pc+size-1; j>=pc; j--) hit[j]++;
      }
    }
    /* Walk the freeblock chain, marking the bytes of each freeblock.  The
    ** chain starts at the offset stored at hdr+1; each freeblock stores
    ** the offset of the next one in its first two bytes and its own size
    ** in the following two. */
    i = get2byte(&data[hdr+1]);
    while( i>0 ){
      int size, j;
      assert( i<=usableSize-4 );     /* Enforced by btreeInitPage() */
      size = get2byte(&data[i+2]);
      assert( i+size<=usableSize );  /* Enforced by btreeInitPage() */
      for(j=i+size-1; j>=i; j--) hit[j]++;
      j = get2byte(&data[i]);
      assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
      assert( j<=usableSize-4 );   /* Enforced by btreeInitPage() */
      i = j;
    }
    /* Count uncovered bytes; stop at the first byte used more than once. */
    for(i=cnt=0; i<usableSize; i++){
      if( hit[i]==0 ){
        cnt++;
      }else if( hit[i]>1 ){
        checkAppendMsg(pCheck, 0,
          "Multiple uses for byte %d of page %d", i, iPage);
        break;
      }
    }
    /* The uncovered byte count must equal the fragment count recorded in
    ** the page header at offset hdr+7. */
    if( cnt!=data[hdr+7] ){
      checkAppendMsg(pCheck, 0,
          "Fragmentation of %d bytes reported as %d on page %d",
          cnt, data[hdr+7], iPage);
    }
  }
  sqlite3PageFree(hit);
  releasePage(pPage);
  return depth+1;
}
7933 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7934 
7935 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7936 /*
7937 ** This routine does a complete check of the given BTree file.  aRoot[] is
7938 ** an array of pages numbers were each page number is the root page of
7939 ** a table.  nRoot is the number of entries in aRoot.
7940 **
7941 ** A read-only or read-write transaction must be opened before calling
7942 ** this function.
7943 **
7944 ** Write the number of error seen in *pnErr.  Except for some memory
7945 ** allocation errors,  an error message held in memory obtained from
7946 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
7947 ** returned.  If a memory allocation error occurs, NULL is returned.
7948 */
7949 char *sqlite3BtreeIntegrityCheck(
7950   Btree *p,     /* The btree to be checked */
7951   int *aRoot,   /* An array of root pages numbers for individual trees */
7952   int nRoot,    /* Number of entries in aRoot[] */
7953   int mxErr,    /* Stop reporting errors after this many */
7954   int *pnErr    /* Write number of errors seen to this variable */
7955 ){
7956   Pgno i;
7957   int nRef;
7958   IntegrityCk sCheck;
7959   BtShared *pBt = p->pBt;
7960   char zErr[100];
7961 
7962   sqlite3BtreeEnter(p);
7963   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
7964   nRef = sqlite3PagerRefcount(pBt->pPager);
7965   sCheck.pBt = pBt;
7966   sCheck.pPager = pBt->pPager;
7967   sCheck.nPage = btreePagecount(sCheck.pBt);
7968   sCheck.mxErr = mxErr;
7969   sCheck.nErr = 0;
7970   sCheck.mallocFailed = 0;
7971   *pnErr = 0;
7972   if( sCheck.nPage==0 ){
7973     sqlite3BtreeLeave(p);
7974     return 0;
7975   }
7976 
7977   sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
7978   if( !sCheck.aPgRef ){
7979     *pnErr = 1;
7980     sqlite3BtreeLeave(p);
7981     return 0;
7982   }
7983   i = PENDING_BYTE_PAGE(pBt);
7984   if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
7985   sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
7986   sCheck.errMsg.useMalloc = 2;
7987 
7988   /* Check the integrity of the freelist
7989   */
7990   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
7991             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
7992 
7993   /* Check all the tables.
7994   */
7995   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
7996     if( aRoot[i]==0 ) continue;
7997 #ifndef SQLITE_OMIT_AUTOVACUUM
7998     if( pBt->autoVacuum && aRoot[i]>1 ){
7999       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
8000     }
8001 #endif
8002     checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL);
8003   }
8004 
8005   /* Make sure every page in the file is referenced
8006   */
8007   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
8008 #ifdef SQLITE_OMIT_AUTOVACUUM
8009     if( getPageReferenced(&sCheck, i)==0 ){
8010       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
8011     }
8012 #else
8013     /* If the database supports auto-vacuum, make sure no tables contain
8014     ** references to pointer-map pages.
8015     */
8016     if( getPageReferenced(&sCheck, i)==0 &&
8017        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
8018       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
8019     }
8020     if( getPageReferenced(&sCheck, i)!=0 &&
8021        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
8022       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
8023     }
8024 #endif
8025   }
8026 
8027   /* Make sure this analysis did not leave any unref() pages.
8028   ** This is an internal consistency check; an integrity check
8029   ** of the integrity check.
8030   */
8031   if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){
8032     checkAppendMsg(&sCheck, 0,
8033       "Outstanding page count goes from %d to %d during this analysis",
8034       nRef, sqlite3PagerRefcount(pBt->pPager)
8035     );
8036   }
8037 
8038   /* Clean  up and report errors.
8039   */
8040   sqlite3BtreeLeave(p);
8041   sqlite3_free(sCheck.aPgRef);
8042   if( sCheck.mallocFailed ){
8043     sqlite3StrAccumReset(&sCheck.errMsg);
8044     *pnErr = sCheck.nErr+1;
8045     return 0;
8046   }
8047   *pnErr = sCheck.nErr;
8048   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
8049   return sqlite3StrAccumFinish(&sCheck.errMsg);
8050 }
8051 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
8052 
8053 /*
8054 ** Return the full pathname of the underlying database file.  Return
8055 ** an empty string if the database is in-memory or a TEMP database.
8056 **
8057 ** The pager filename is invariant as long as the pager is
8058 ** open so it is safe to access without the BtShared mutex.
8059 */
8060 const char *sqlite3BtreeGetFilename(Btree *p){
8061   assert( p->pBt->pPager!=0 );
8062   return sqlite3PagerFilename(p->pBt->pPager, 1);
8063 }
8064 
8065 /*
8066 ** Return the pathname of the journal file for this database. The return
8067 ** value of this routine is the same regardless of whether the journal file
8068 ** has been created or not.
8069 **
8070 ** The pager journal filename is invariant as long as the pager is
8071 ** open so it is safe to access without the BtShared mutex.
8072 */
8073 const char *sqlite3BtreeGetJournalname(Btree *p){
8074   assert( p->pBt->pPager!=0 );
8075   return sqlite3PagerJournalname(p->pBt->pPager);
8076 }
8077 
8078 /*
8079 ** Return non-zero if a transaction is active.
8080 */
8081 int sqlite3BtreeIsInTrans(Btree *p){
8082   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
8083   return (p && (p->inTrans==TRANS_WRITE));
8084 }
8085 
8086 #ifndef SQLITE_OMIT_WAL
8087 /*
8088 ** Run a checkpoint on the Btree passed as the first argument.
8089 **
8090 ** Return SQLITE_LOCKED if this or any other connection has an open
8091 ** transaction on the shared-cache the argument Btree is connected to.
8092 **
8093 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
8094 */
8095 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
8096   int rc = SQLITE_OK;
8097   if( p ){
8098     BtShared *pBt = p->pBt;
8099     sqlite3BtreeEnter(p);
8100     if( pBt->inTransaction!=TRANS_NONE ){
8101       rc = SQLITE_LOCKED;
8102     }else{
8103       rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt);
8104     }
8105     sqlite3BtreeLeave(p);
8106   }
8107   return rc;
8108 }
8109 #endif
8110 
8111 /*
8112 ** Return non-zero if a read (or write) transaction is active.
8113 */
8114 int sqlite3BtreeIsInReadTrans(Btree *p){
8115   assert( p );
8116   assert( sqlite3_mutex_held(p->db->mutex) );
8117   return p->inTrans!=TRANS_NONE;
8118 }
8119 
8120 int sqlite3BtreeIsInBackup(Btree *p){
8121   assert( p );
8122   assert( sqlite3_mutex_held(p->db->mutex) );
8123   return p->nBackup!=0;
8124 }
8125 
8126 /*
8127 ** This function returns a pointer to a blob of memory associated with
8128 ** a single shared-btree. The memory is used by client code for its own
8129 ** purposes (for example, to store a high-level schema associated with
8130 ** the shared-btree). The btree layer manages reference counting issues.
8131 **
8132 ** The first time this is called on a shared-btree, nBytes bytes of memory
8133 ** are allocated, zeroed, and returned to the caller. For each subsequent
8134 ** call the nBytes parameter is ignored and a pointer to the same blob
8135 ** of memory returned.
8136 **
8137 ** If the nBytes parameter is 0 and the blob of memory has not yet been
8138 ** allocated, a null pointer is returned. If the blob has already been
8139 ** allocated, it is returned as normal.
8140 **
8141 ** Just before the shared-btree is closed, the function passed as the
8142 ** xFree argument when the memory allocation was made is invoked on the
8143 ** blob of allocated memory. The xFree function should not call sqlite3_free()
8144 ** on the memory, the btree layer does that.
8145 */
8146 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
8147   BtShared *pBt = p->pBt;
8148   sqlite3BtreeEnter(p);
8149   if( !pBt->pSchema && nBytes ){
8150     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
8151     pBt->xFreeSchema = xFree;
8152   }
8153   sqlite3BtreeLeave(p);
8154   return pBt->pSchema;
8155 }
8156 
8157 /*
8158 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
8159 ** btree as the argument handle holds an exclusive lock on the
8160 ** sqlite_master table. Otherwise SQLITE_OK.
8161 */
8162 int sqlite3BtreeSchemaLocked(Btree *p){
8163   int rc;
8164   assert( sqlite3_mutex_held(p->db->mutex) );
8165   sqlite3BtreeEnter(p);
8166   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
8167   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
8168   sqlite3BtreeLeave(p);
8169   return rc;
8170 }
8171 
8172 
8173 #ifndef SQLITE_OMIT_SHARED_CACHE
8174 /*
8175 ** Obtain a lock on the table whose root page is iTab.  The
8176 ** lock is a write lock if isWritelock is true or a read lock
8177 ** if it is false.
8178 */
8179 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
8180   int rc = SQLITE_OK;
8181   assert( p->inTrans!=TRANS_NONE );
8182   if( p->sharable ){
8183     u8 lockType = READ_LOCK + isWriteLock;
8184     assert( READ_LOCK+1==WRITE_LOCK );
8185     assert( isWriteLock==0 || isWriteLock==1 );
8186 
8187     sqlite3BtreeEnter(p);
8188     rc = querySharedCacheTableLock(p, iTab, lockType);
8189     if( rc==SQLITE_OK ){
8190       rc = setSharedCacheTableLock(p, iTab, lockType);
8191     }
8192     sqlite3BtreeLeave(p);
8193   }
8194   return rc;
8195 }
8196 #endif
8197 
8198 #ifndef SQLITE_OMIT_INCRBLOB
8199 /*
8200 ** Argument pCsr must be a cursor opened for writing on an
8201 ** INTKEY table currently pointing at a valid table entry.
8202 ** This function modifies the data stored as part of that entry.
8203 **
8204 ** Only the data content may only be modified, it is not possible to
8205 ** change the length of the data stored. If this function is called with
8206 ** parameters that attempt to write past the end of the existing data,
8207 ** no modifications are made and SQLITE_CORRUPT is returned.
8208 */
8209 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
8210   int rc;
8211   assert( cursorHoldsMutex(pCsr) );
8212   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
8213   assert( pCsr->isIncrblobHandle );
8214 
8215   rc = restoreCursorPosition(pCsr);
8216   if( rc!=SQLITE_OK ){
8217     return rc;
8218   }
8219   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
8220   if( pCsr->eState!=CURSOR_VALID ){
8221     return SQLITE_ABORT;
8222   }
8223 
8224   /* Check some assumptions:
8225   **   (a) the cursor is open for writing,
8226   **   (b) there is a read/write transaction open,
8227   **   (c) the connection holds a write-lock on the table (if required),
8228   **   (d) there are no conflicting read-locks, and
8229   **   (e) the cursor points at a valid row of an intKey table.
8230   */
8231   if( !pCsr->wrFlag ){
8232     return SQLITE_READONLY;
8233   }
8234   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
8235               && pCsr->pBt->inTransaction==TRANS_WRITE );
8236   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
8237   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
8238   assert( pCsr->apPage[pCsr->iPage]->intKey );
8239 
8240   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
8241 }
8242 
8243 /*
8244 ** Set a flag on this cursor to cache the locations of pages from the
8245 ** overflow list for the current row. This is used by cursors opened
8246 ** for incremental blob IO only.
8247 **
8248 ** This function sets a flag only. The actual page location cache
8249 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
8250 ** accessPayload() (the worker function for sqlite3BtreeData() and
8251 ** sqlite3BtreePutData()).
8252 */
8253 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
8254   assert( cursorHoldsMutex(pCur) );
8255   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
8256   invalidateOverflowCache(pCur);
8257   pCur->isIncrblobHandle = 1;
8258 }
8259 #endif
8260 
8261 /*
8262 ** Set both the "read version" (single byte at byte offset 18) and
8263 ** "write version" (single byte at byte offset 19) fields in the database
8264 ** header to iVersion.
8265 */
8266 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
8267   BtShared *pBt = pBtree->pBt;
8268   int rc;                         /* Return code */
8269 
8270   assert( iVersion==1 || iVersion==2 );
8271 
8272   /* If setting the version fields to 1, do not automatically open the
8273   ** WAL connection, even if the version fields are currently set to 2.
8274   */
8275   pBt->btsFlags &= ~BTS_NO_WAL;
8276   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
8277 
8278   rc = sqlite3BtreeBeginTrans(pBtree, 0);
8279   if( rc==SQLITE_OK ){
8280     u8 *aData = pBt->pPage1->aData;
8281     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
8282       rc = sqlite3BtreeBeginTrans(pBtree, 2);
8283       if( rc==SQLITE_OK ){
8284         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
8285         if( rc==SQLITE_OK ){
8286           aData[18] = (u8)iVersion;
8287           aData[19] = (u8)iVersion;
8288         }
8289       }
8290     }
8291   }
8292 
8293   pBt->btsFlags &= ~BTS_NO_WAL;
8294   return rc;
8295 }
8296