xref: /sqlite-3.40.0/src/btree.c (revision a153bbc4)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements an external (disk-based) database using BTrees.
13 ** See the header comment on "btreeInt.h" for additional information.
14 ** Including a description of file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
/*
** The header string that appears at the beginning of every
** SQLite database.  The array is initialized from the
** SQLITE_FILE_HEADER macro and is private to this file.
*/
static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
/*
** Set this global variable to 1 to enable tracing using the TRACE
** macro.
**
** The tracing implementation is compiled out by the "#if 0" below, so
** in a normal build TRACE(X) expands to nothing.  When the first branch
** is enabled instead, TRACE(X) calls printf with X and then flushes
** stdout, but only while sqlite3BtreeTrace is non-zero.  X must be a
** complete, parenthesized printf() argument list, for example:
**
**     TRACE(("page %d\n", pgno));
**
** NOTE(review): the enabled form expands to a bare if-statement, so it
** should not be used as the un-braced body of an if/else.
*/
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
#endif
34 
/*
** Read a 2-byte big-endian integer out of the unsigned byte array X, but
** report a stored value of zero as 65536.
**
** The "offset to cell content area" field in a btree page header is only
** two bytes wide.  On an empty page of a 65536-byte-page database the
** true offset is 65536, which does not fit and is therefore stored as
** zero.  Subtracting one, masking to 16 bits and adding one again maps
** 0 to 65536 while leaving every value in 1..65535 unchanged.
*/
#define get2byteNotZero(X)  (1+((((int)get2byte(X))-1)&0xffff))
45 
/*
** Values passed as the 5th argument to allocateBtreePage().  Each value
** selects one allocation mode; the trailing comment on each line
** summarizes that mode.
*/
#define BTALLOC_ANY   0           /* Allocate any page */
#define BTALLOC_EXACT 1           /* Allocate exact page if possible */
#define BTALLOC_LE    2           /* Allocate any page <= the parameter */
52 
/*
** IfNotOmitAV(x) evaluates to (x) in builds that include auto-vacuum
** support, and to the constant 0 in builds compiled with
** SQLITE_OMIT_AUTOVACUUM (in which case x is not evaluated at all).
** Typical use:
**
**   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
*/
#ifdef SQLITE_OMIT_AUTOVACUUM
#define IfNotOmitAV(expr) 0
#else
#define IfNotOmitAV(expr) (expr)
#endif
64 
65 #ifndef SQLITE_OMIT_SHARED_CACHE
66 /*
67 ** A list of BtShared objects that are eligible for participation
68 ** in shared cache.  This variable has file scope during normal builds,
69 ** but the test harness needs to access it so we make it global for
70 ** test builds.
71 **
72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
73 */
74 #ifdef SQLITE_TEST
75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
76 #else
77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
78 #endif
79 #endif /* SQLITE_OMIT_SHARED_CACHE */
80 
81 #ifndef SQLITE_OMIT_SHARED_CACHE
/*
** Enable or disable the shared pager and schema features.
**
** The "enable" argument is stored as-is in
** sqlite3GlobalConfig.sharedCacheEnabled.  The routine always returns
** SQLITE_OK.
**
** This routine has no effect on existing database connections.
** The shared cache setting affects only future calls to
** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
*/
int sqlite3_enable_shared_cache(int enable){
  sqlite3GlobalConfig.sharedCacheEnabled = enable;
  return SQLITE_OK;
}
93 #endif
94 
95 
96 
97 #ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
  ** clearAllSharedCacheTableLocks() and downgradeAllSharedCacheTableLocks()
  ** manipulate entries in the BtShared.pLock linked list used to store
  ** shared-cache table level locks. If the library is compiled with the
  ** shared-cache feature disabled, then there is only ever one user
  ** of each BtShared structure and so this locking is not necessary.
  ** So define the lock related functions as no-ops.
  **
  ** The two query/set macros evaluate to SQLITE_OK, the clear/downgrade
  ** macros expand to nothing, and the two assert()-only predicates
  ** hasSharedCacheTableLock() and hasReadConflicts() evaluate to 1 (lock
  ** held) and 0 (no conflict) respectively.  None of the macros evaluate
  ** their arguments.
  */
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
113 #endif
114 
115 #ifndef SQLITE_OMIT_SHARED_CACHE
116 
117 #ifdef SQLITE_DEBUG
118 /*
119 **** This function is only used as part of an assert() statement. ***
120 **
121 ** Check to see if pBtree holds the required locks to read or write to the
122 ** table with root page iRoot.   Return 1 if it does and 0 if not.
123 **
124 ** For example, when writing to a table with root-page iRoot via
125 ** Btree connection pBtree:
126 **
127 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
128 **
129 ** When writing to an index that resides in a sharable database, the
130 ** caller should have first obtained a lock specifying the root page of
131 ** the corresponding table. This makes things a bit more complicated,
132 ** as this module treats each table as a separate structure. To determine
133 ** the table corresponding to the index being written, this
134 ** function has to search through the database schema.
135 **
136 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
137 ** hold a write-lock on the schema table (root page 1). This is also
138 ** acceptable.
139 */
140 static int hasSharedCacheTableLock(
141   Btree *pBtree,         /* Handle that must hold lock */
142   Pgno iRoot,            /* Root page of b-tree */
143   int isIndex,           /* True if iRoot is the root of an index b-tree */
144   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
145 ){
146   Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
147   Pgno iTab = 0;
148   BtLock *pLock;
149 
150   /* If this database is not shareable, or if the client is reading
151   ** and has the read-uncommitted flag set, then no lock is required.
152   ** Return true immediately.
153   */
154   if( (pBtree->sharable==0)
155    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
156   ){
157     return 1;
158   }
159 
160   /* If the client is reading  or writing an index and the schema is
161   ** not loaded, then it is too difficult to actually check to see if
162   ** the correct locks are held.  So do not bother - just return true.
163   ** This case does not come up very often anyhow.
164   */
165   if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
166     return 1;
167   }
168 
169   /* Figure out the root-page that the lock should be held on. For table
170   ** b-trees, this is just the root page of the b-tree being read or
171   ** written. For index b-trees, it is the root page of the associated
172   ** table.  */
173   if( isIndex ){
174     HashElem *p;
175     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
176       Index *pIdx = (Index *)sqliteHashData(p);
177       if( pIdx->tnum==(int)iRoot ){
178         if( iTab ){
179           /* Two or more indexes share the same root page.  There must
180           ** be imposter tables.  So just return true.  The assert is not
181           ** useful in that case. */
182           return 1;
183         }
184         iTab = pIdx->pTable->tnum;
185       }
186     }
187   }else{
188     iTab = iRoot;
189   }
190 
191   /* Search for the required lock. Either a write-lock on root-page iTab, a
192   ** write-lock on the schema table, or (if the client is reading) a
193   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
194   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
195     if( pLock->pBtree==pBtree
196      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
197      && pLock->eLock>=eLockType
198     ){
199       return 1;
200     }
201   }
202 
203   /* Failed to find the required lock. */
204   return 0;
205 }
206 #endif /* SQLITE_DEBUG */
207 
208 #ifdef SQLITE_DEBUG
209 /*
210 **** This function may be used as part of assert() statements only. ****
211 **
212 ** Return true if it would be illegal for pBtree to write into the
213 ** table or index rooted at iRoot because other shared connections are
214 ** simultaneously reading that same table or index.
215 **
216 ** It is illegal for pBtree to write if some other Btree object that
217 ** shares the same BtShared object is currently reading or writing
218 ** the iRoot table.  Except, if the other Btree object has the
219 ** read-uncommitted flag set, then it is OK for the other object to
220 ** have a read cursor.
221 **
222 ** For example, before writing to any part of the table or index
223 ** rooted at page iRoot, one should call:
224 **
225 **    assert( !hasReadConflicts(pBtree, iRoot) );
226 */
227 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
228   BtCursor *p;
229   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
230     if( p->pgnoRoot==iRoot
231      && p->pBtree!=pBtree
232      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
233     ){
234       return 1;
235     }
236   }
237   return 0;
238 }
239 #endif    /* #ifdef SQLITE_DEBUG */
240 
241 /*
242 ** Query to see if Btree handle p may obtain a lock of type eLock
243 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
244 ** SQLITE_OK if the lock may be obtained (by calling
245 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
246 */
247 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
248   BtShared *pBt = p->pBt;
249   BtLock *pIter;
250 
251   assert( sqlite3BtreeHoldsMutex(p) );
252   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
253   assert( p->db!=0 );
254   assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
255 
256   /* If requesting a write-lock, then the Btree must have an open write
257   ** transaction on this file. And, obviously, for this to be so there
258   ** must be an open write transaction on the file itself.
259   */
260   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
261   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
262 
263   /* This routine is a no-op if the shared-cache is not enabled */
264   if( !p->sharable ){
265     return SQLITE_OK;
266   }
267 
268   /* If some other connection is holding an exclusive lock, the
269   ** requested lock may not be obtained.
270   */
271   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
272     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
273     return SQLITE_LOCKED_SHAREDCACHE;
274   }
275 
276   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
277     /* The condition (pIter->eLock!=eLock) in the following if(...)
278     ** statement is a simplification of:
279     **
280     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
281     **
282     ** since we know that if eLock==WRITE_LOCK, then no other connection
283     ** may hold a WRITE_LOCK on any table in this file (since there can
284     ** only be a single writer).
285     */
286     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
287     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
288     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
289       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
290       if( eLock==WRITE_LOCK ){
291         assert( p==pBt->pWriter );
292         pBt->btsFlags |= BTS_PENDING;
293       }
294       return SQLITE_LOCKED_SHAREDCACHE;
295     }
296   }
297   return SQLITE_OK;
298 }
299 #endif /* !SQLITE_OMIT_SHARED_CACHE */
300 
301 #ifndef SQLITE_OMIT_SHARED_CACHE
302 /*
303 ** Add a lock on the table with root-page iTable to the shared-btree used
304 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
305 ** WRITE_LOCK.
306 **
307 ** This function assumes the following:
308 **
309 **   (a) The specified Btree object p is connected to a sharable
310 **       database (one with the BtShared.sharable flag set), and
311 **
312 **   (b) No other Btree objects hold a lock that conflicts
313 **       with the requested lock (i.e. querySharedCacheTableLock() has
314 **       already been called and returned SQLITE_OK).
315 **
316 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
317 ** is returned if a malloc attempt fails.
318 */
319 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
320   BtShared *pBt = p->pBt;
321   BtLock *pLock = 0;
322   BtLock *pIter;
323 
324   assert( sqlite3BtreeHoldsMutex(p) );
325   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
326   assert( p->db!=0 );
327 
328   /* A connection with the read-uncommitted flag set will never try to
329   ** obtain a read-lock using this function. The only read-lock obtained
330   ** by a connection in read-uncommitted mode is on the sqlite_master
331   ** table, and that lock is obtained in BtreeBeginTrans().  */
332   assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
333 
334   /* This function should only be called on a sharable b-tree after it
335   ** has been determined that no other b-tree holds a conflicting lock.  */
336   assert( p->sharable );
337   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
338 
339   /* First search the list for an existing lock on this table. */
340   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
341     if( pIter->iTable==iTable && pIter->pBtree==p ){
342       pLock = pIter;
343       break;
344     }
345   }
346 
347   /* If the above search did not find a BtLock struct associating Btree p
348   ** with table iTable, allocate one and link it into the list.
349   */
350   if( !pLock ){
351     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
352     if( !pLock ){
353       return SQLITE_NOMEM;
354     }
355     pLock->iTable = iTable;
356     pLock->pBtree = p;
357     pLock->pNext = pBt->pLock;
358     pBt->pLock = pLock;
359   }
360 
361   /* Set the BtLock.eLock variable to the maximum of the current lock
362   ** and the requested lock. This means if a write-lock was already held
363   ** and a read-lock requested, we don't incorrectly downgrade the lock.
364   */
365   assert( WRITE_LOCK>READ_LOCK );
366   if( eLock>pLock->eLock ){
367     pLock->eLock = eLock;
368   }
369 
370   return SQLITE_OK;
371 }
372 #endif /* !SQLITE_OMIT_SHARED_CACHE */
373 
374 #ifndef SQLITE_OMIT_SHARED_CACHE
375 /*
376 ** Release all the table locks (locks obtained via calls to
377 ** the setSharedCacheTableLock() procedure) held by Btree object p.
378 **
379 ** This function assumes that Btree p has an open read or write
380 ** transaction. If it does not, then the BTS_PENDING flag
381 ** may be incorrectly cleared.
382 */
383 static void clearAllSharedCacheTableLocks(Btree *p){
384   BtShared *pBt = p->pBt;
385   BtLock **ppIter = &pBt->pLock;
386 
387   assert( sqlite3BtreeHoldsMutex(p) );
388   assert( p->sharable || 0==*ppIter );
389   assert( p->inTrans>0 );
390 
391   while( *ppIter ){
392     BtLock *pLock = *ppIter;
393     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
394     assert( pLock->pBtree->inTrans>=pLock->eLock );
395     if( pLock->pBtree==p ){
396       *ppIter = pLock->pNext;
397       assert( pLock->iTable!=1 || pLock==&p->lock );
398       if( pLock->iTable!=1 ){
399         sqlite3_free(pLock);
400       }
401     }else{
402       ppIter = &pLock->pNext;
403     }
404   }
405 
406   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
407   if( pBt->pWriter==p ){
408     pBt->pWriter = 0;
409     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
410   }else if( pBt->nTransaction==2 ){
411     /* This function is called when Btree p is concluding its
412     ** transaction. If there currently exists a writer, and p is not
413     ** that writer, then the number of locks held by connections other
414     ** than the writer must be about to drop to zero. In this case
415     ** set the BTS_PENDING flag to 0.
416     **
417     ** If there is not currently a writer, then BTS_PENDING must
418     ** be zero already. So this next line is harmless in that case.
419     */
420     pBt->btsFlags &= ~BTS_PENDING;
421   }
422 }
423 
424 /*
425 ** This function changes all write-locks held by Btree p into read-locks.
426 */
427 static void downgradeAllSharedCacheTableLocks(Btree *p){
428   BtShared *pBt = p->pBt;
429   if( pBt->pWriter==p ){
430     BtLock *pLock;
431     pBt->pWriter = 0;
432     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
433     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
434       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
435       pLock->eLock = READ_LOCK;
436     }
437   }
438 }
439 
440 #endif /* SQLITE_OMIT_SHARED_CACHE */
441 
442 static void releasePage(MemPage *pPage);  /* Forward reference */
443 
/*
***** This routine is used inside of assert() only ****
**
** Verify that the cursor holds the mutex on its BtShared.  The result is
** whatever sqlite3_mutex_held() reports for p->pBt->mutex.  The routine
** is compiled only when SQLITE_DEBUG is defined.
*/
#ifdef SQLITE_DEBUG
static int cursorHoldsMutex(BtCursor *p){
  return sqlite3_mutex_held(p->pBt->mutex);
}
#endif
454 
/*
** Invalidate the overflow page-list cache of the single cursor pCur by
** clearing the BTCF_ValidOvfl bit in pCur->curFlags.  The macro is an
** expression whose value is the updated curFlags.
**
** The parameter is parenthesized in the expansion so that any expression
** of pointer-to-cursor type (for example "&cursor") can be passed
** safely.  It is evaluated exactly once.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
460 
461 /*
462 ** Invalidate the overflow page-list cache for all cursors opened
463 ** on the shared btree structure pBt.
464 */
465 static void invalidateAllOverflowCache(BtShared *pBt){
466   BtCursor *p;
467   assert( sqlite3_mutex_held(pBt->mutex) );
468   for(p=pBt->pCursor; p; p=p->pNext){
469     invalidateOverflowCache(p);
470   }
471 }
472 
473 #ifndef SQLITE_OMIT_INCRBLOB
474 /*
475 ** This function is called before modifying the contents of a table
476 ** to invalidate any incrblob cursors that are open on the
477 ** row or one of the rows being modified.
478 **
479 ** If argument isClearTable is true, then the entire contents of the
480 ** table is about to be deleted. In this case invalidate all incrblob
481 ** cursors open on any row within the table with root-page pgnoRoot.
482 **
483 ** Otherwise, if argument isClearTable is false, then the row with
484 ** rowid iRow is being replaced or deleted. In this case invalidate
485 ** only those incrblob cursors open on that specific row.
486 */
487 static void invalidateIncrblobCursors(
488   Btree *pBtree,          /* The database file to check */
489   i64 iRow,               /* The rowid that might be changing */
490   int isClearTable        /* True if all rows are being deleted */
491 ){
492   BtCursor *p;
493   if( pBtree->hasIncrblobCur==0 ) return;
494   assert( sqlite3BtreeHoldsMutex(pBtree) );
495   pBtree->hasIncrblobCur = 0;
496   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
497     if( (p->curFlags & BTCF_Incrblob)!=0 ){
498       pBtree->hasIncrblobCur = 1;
499       if( isClearTable || p->info.nKey==iRow ){
500         p->eState = CURSOR_INVALID;
501       }
502     }
503   }
504 }
505 
506 #else
507   /* Stub function when INCRBLOB is omitted */
508   #define invalidateIncrblobCursors(x,y,z)
509 #endif /* SQLITE_OMIT_INCRBLOB */
510 
511 /*
512 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
513 ** when a page that previously contained data becomes a free-list leaf
514 ** page.
515 **
516 ** The BtShared.pHasContent bitvec exists to work around an obscure
517 ** bug caused by the interaction of two useful IO optimizations surrounding
518 ** free-list leaf pages:
519 **
520 **   1) When all data is deleted from a page and the page becomes
521 **      a free-list leaf page, the page is not written to the database
522 **      (as free-list leaf pages contain no meaningful data). Sometimes
523 **      such a page is not even journalled (as it will not be modified,
524 **      why bother journalling it?).
525 **
526 **   2) When a free-list leaf page is reused, its content is not read
527 **      from the database or written to the journal file (why should it
528 **      be, if it is not at all meaningful?).
529 **
530 ** By themselves, these optimizations work fine and provide a handy
531 ** performance boost to bulk delete or insert operations. However, if
532 ** a page is moved to the free-list and then reused within the same
533 ** transaction, a problem comes up. If the page is not journalled when
534 ** it is moved to the free-list and it is also not journalled when it
535 ** is extracted from the free-list and reused, then the original data
536 ** may be lost. In the event of a rollback, it may not be possible
537 ** to restore the database to its original configuration.
538 **
539 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
540 ** moved to become a free-list leaf page, the corresponding bit is
541 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
542 ** optimization 2 above is omitted if the corresponding bit is already
543 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
544 ** at the end of every transaction.
545 */
546 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
547   int rc = SQLITE_OK;
548   if( !pBt->pHasContent ){
549     assert( pgno<=pBt->nPage );
550     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
551     if( !pBt->pHasContent ){
552       rc = SQLITE_NOMEM;
553     }
554   }
555   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
556     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
557   }
558   return rc;
559 }
560 
561 /*
562 ** Query the BtShared.pHasContent vector.
563 **
564 ** This function is called when a free-list leaf page is removed from the
565 ** free-list for reuse. It returns false if it is safe to retrieve the
566 ** page from the pager layer with the 'no-content' flag set. True otherwise.
567 */
568 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
569   Bitvec *p = pBt->pHasContent;
570   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
571 }
572 
573 /*
574 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
575 ** invoked at the conclusion of each write-transaction.
576 */
577 static void btreeClearHasContent(BtShared *pBt){
578   sqlite3BitvecDestroy(pBt->pHasContent);
579   pBt->pHasContent = 0;
580 }
581 
582 /*
583 ** Release all of the apPage[] pages for a cursor.
584 */
585 static void btreeReleaseAllCursorPages(BtCursor *pCur){
586   int i;
587   for(i=0; i<=pCur->iPage; i++){
588     releasePage(pCur->apPage[i]);
589     pCur->apPage[i] = 0;
590   }
591   pCur->iPage = -1;
592 }
593 
594 
595 /*
596 ** Save the current cursor position in the variables BtCursor.nKey
597 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
598 **
599 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
600 ** prior to calling this routine.
601 */
602 static int saveCursorPosition(BtCursor *pCur){
603   int rc;
604 
605   assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
606   assert( 0==pCur->pKey );
607   assert( cursorHoldsMutex(pCur) );
608 
609   if( pCur->eState==CURSOR_SKIPNEXT ){
610     pCur->eState = CURSOR_VALID;
611   }else{
612     pCur->skipNext = 0;
613   }
614   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
615   assert( rc==SQLITE_OK );  /* KeySize() cannot fail */
616 
617   /* If this is an intKey table, then the above call to BtreeKeySize()
618   ** stores the integer key in pCur->nKey. In this case this value is
619   ** all that is required. Otherwise, if pCur is not open on an intKey
620   ** table, then malloc space for and store the pCur->nKey bytes of key
621   ** data.
622   */
623   if( 0==pCur->curIntKey ){
624     void *pKey = sqlite3Malloc( pCur->nKey );
625     if( pKey ){
626       rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
627       if( rc==SQLITE_OK ){
628         pCur->pKey = pKey;
629       }else{
630         sqlite3_free(pKey);
631       }
632     }else{
633       rc = SQLITE_NOMEM;
634     }
635   }
636   assert( !pCur->curIntKey || !pCur->pKey );
637 
638   if( rc==SQLITE_OK ){
639     btreeReleaseAllCursorPages(pCur);
640     pCur->eState = CURSOR_REQUIRESEEK;
641   }
642 
643   invalidateOverflowCache(pCur);
644   return rc;
645 }
646 
647 /* Forward reference */
648 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
649 
650 /*
651 ** Save the positions of all cursors (except pExcept) that are open on
652 ** the table with root-page iRoot.  "Saving the cursor position" means that
653 ** the location in the btree is remembered in such a way that it can be
654 ** moved back to the same spot after the btree has been modified.  This
655 ** routine is called just before cursor pExcept is used to modify the
656 ** table, for example in BtreeDelete() or BtreeInsert().
657 **
658 ** If there are two or more cursors on the same btree, then all such
659 ** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
660 ** routine enforces that rule.  This routine only needs to be called in
661 ** the uncommon case when pExpect has the BTCF_Multiple flag set.
662 **
663 ** If pExpect!=NULL and if no other cursors are found on the same root-page,
664 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another
665 ** pointless call to this routine.
666 **
667 ** Implementation note:  This routine merely checks to see if any cursors
668 ** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
669 ** event that cursors are in need to being saved.
670 */
671 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
672   BtCursor *p;
673   assert( sqlite3_mutex_held(pBt->mutex) );
674   assert( pExcept==0 || pExcept->pBt==pBt );
675   for(p=pBt->pCursor; p; p=p->pNext){
676     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
677   }
678   if( p ) return saveCursorsOnList(p, iRoot, pExcept);
679   if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
680   return SQLITE_OK;
681 }
682 
683 /* This helper routine to saveAllCursors does the actual work of saving
684 ** the cursors if and when a cursor is found that actually requires saving.
685 ** The common case is that no cursors need to be saved, so this routine is
686 ** broken out from its caller to avoid unnecessary stack pointer movement.
687 */
688 static int SQLITE_NOINLINE saveCursorsOnList(
689   BtCursor *p,         /* The first cursor that needs saving */
690   Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
691   BtCursor *pExcept    /* Do not save this cursor */
692 ){
693   do{
694     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
695       if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
696         int rc = saveCursorPosition(p);
697         if( SQLITE_OK!=rc ){
698           return rc;
699         }
700       }else{
701         testcase( p->iPage>0 );
702         btreeReleaseAllCursorPages(p);
703       }
704     }
705     p = p->pNext;
706   }while( p );
707   return SQLITE_OK;
708 }
709 
710 /*
711 ** Clear the current cursor position.
712 */
713 void sqlite3BtreeClearCursor(BtCursor *pCur){
714   assert( cursorHoldsMutex(pCur) );
715   sqlite3_free(pCur->pKey);
716   pCur->pKey = 0;
717   pCur->eState = CURSOR_INVALID;
718 }
719 
720 /*
721 ** In this version of BtreeMoveto, pKey is a packed index record
722 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
723 ** record and then call BtreeMovetoUnpacked() to do the work.
724 */
725 static int btreeMoveto(
726   BtCursor *pCur,     /* Cursor open on the btree to be searched */
727   const void *pKey,   /* Packed key if the btree is an index */
728   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
729   int bias,           /* Bias search to the high end */
730   int *pRes           /* Write search results here */
731 ){
732   int rc;                    /* Status code */
733   UnpackedRecord *pIdxKey;   /* Unpacked index key */
734   char aSpace[200];          /* Temp space for pIdxKey - to avoid a malloc */
735   char *pFree = 0;
736 
737   if( pKey ){
738     assert( nKey==(i64)(int)nKey );
739     pIdxKey = sqlite3VdbeAllocUnpackedRecord(
740         pCur->pKeyInfo, aSpace, sizeof(aSpace), &pFree
741     );
742     if( pIdxKey==0 ) return SQLITE_NOMEM;
743     sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
744     if( pIdxKey->nField==0 ){
745       sqlite3DbFree(pCur->pKeyInfo->db, pFree);
746       return SQLITE_CORRUPT_BKPT;
747     }
748   }else{
749     pIdxKey = 0;
750   }
751   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
752   if( pFree ){
753     sqlite3DbFree(pCur->pKeyInfo->db, pFree);
754   }
755   return rc;
756 }
757 
758 /*
759 ** Restore the cursor to the position it was in (or as close to as possible)
760 ** when saveCursorPosition() was called. Note that this call deletes the
761 ** saved position info stored by saveCursorPosition(), so there can be
762 ** at most one effective restoreCursorPosition() call after each
763 ** saveCursorPosition().
764 */
765 static int btreeRestoreCursorPosition(BtCursor *pCur){
766   int rc;
767   int skipNext;
768   assert( cursorHoldsMutex(pCur) );
769   assert( pCur->eState>=CURSOR_REQUIRESEEK );
770   if( pCur->eState==CURSOR_FAULT ){
771     return pCur->skipNext;
772   }
773   pCur->eState = CURSOR_INVALID;
774   rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
775   if( rc==SQLITE_OK ){
776     sqlite3_free(pCur->pKey);
777     pCur->pKey = 0;
778     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
779     pCur->skipNext |= skipNext;
780     if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
781       pCur->eState = CURSOR_SKIPNEXT;
782     }
783   }
784   return rc;
785 }
786 
/*
** Restore cursor p only if it needs it: when p->eState is
** CURSOR_REQUIRESEEK or greater, call btreeRestoreCursorPosition(p) and
** yield its return code; otherwise yield SQLITE_OK without a function
** call.
**
** The parameter is parenthesized wherever it is dereferenced so that any
** expression of pointer-to-cursor type may be passed.  Note that p can
** be evaluated twice, so the argument must not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
791 
792 /*
793 ** Determine whether or not a cursor has moved from the position where
794 ** it was last placed, or has been invalidated for any other reason.
795 ** Cursors can move when the row they are pointing at is deleted out
796 ** from under them, for example.  Cursor might also move if a btree
797 ** is rebalanced.
798 **
799 ** Calling this routine with a NULL cursor pointer returns false.
800 **
801 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
802 ** back to where it ought to be if this routine returns true.
803 */
804 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
805   return pCur->eState!=CURSOR_VALID;
806 }
807 
808 /*
809 ** This routine restores a cursor back to its original position after it
810 ** has been moved by some outside activity (such as a btree rebalance or
811 ** a row having been deleted out from under the cursor).
812 **
813 ** On success, the *pDifferentRow parameter is false if the cursor is left
814 ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
815 ** was pointing to has been deleted, forcing the cursor to point to some
816 ** nearby row.
817 **
818 ** This routine should only be called for a cursor that just returned
819 ** TRUE from sqlite3BtreeCursorHasMoved().
820 */
821 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
822   int rc;
823 
824   assert( pCur!=0 );
825   assert( pCur->eState!=CURSOR_VALID );
826   rc = restoreCursorPosition(pCur);
827   if( rc ){
828     *pDifferentRow = 1;
829     return rc;
830   }
831   if( pCur->eState!=CURSOR_VALID ){
832     *pDifferentRow = 1;
833   }else{
834     assert( pCur->skipNext==0 );
835     *pDifferentRow = 0;
836   }
837   return SQLITE_OK;
838 }
839 
840 #ifndef SQLITE_OMIT_AUTOVACUUM
841 /*
842 ** Given a page number of a regular database page, return the page
843 ** number for the pointer-map page that contains the entry for the
844 ** input page number.
845 **
846 ** Return 0 (not a valid page) for pgno==1 since there is
847 ** no pointer map associated with page 1.  The integrity_check logic
848 ** requires that ptrmapPageno(*,1)!=1.
849 */
850 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
851   int nPagesPerMapPage;
852   Pgno iPtrMap, ret;
853   assert( sqlite3_mutex_held(pBt->mutex) );
854   if( pgno<2 ) return 0;
855   nPagesPerMapPage = (pBt->usableSize/5)+1;
856   iPtrMap = (pgno-2)/nPagesPerMapPage;
857   ret = (iPtrMap*nPagesPerMapPage) + 2;
858   if( ret==PENDING_BYTE_PAGE(pBt) ){
859     ret++;
860   }
861   return ret;
862 }
863 
864 /*
865 ** Write an entry into the pointer map.
866 **
867 ** This routine updates the pointer map entry for page number 'key'
868 ** so that it maps to type 'eType' and parent page number 'pgno'.
869 **
870 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
871 ** a no-op.  If an error occurs, the appropriate error code is written
872 ** into *pRC.
873 */
874 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
875   DbPage *pDbPage;  /* The pointer map page */
876   u8 *pPtrmap;      /* The pointer map data */
877   Pgno iPtrmap;     /* The pointer map page number */
878   int offset;       /* Offset in pointer map page */
879   int rc;           /* Return code from subfunctions */
880 
881   if( *pRC ) return;
882 
883   assert( sqlite3_mutex_held(pBt->mutex) );
884   /* The master-journal page number must never be used as a pointer map page */
885   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
886 
887   assert( pBt->autoVacuum );
888   if( key==0 ){
889     *pRC = SQLITE_CORRUPT_BKPT;
890     return;
891   }
892   iPtrmap = PTRMAP_PAGENO(pBt, key);
893   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
894   if( rc!=SQLITE_OK ){
895     *pRC = rc;
896     return;
897   }
898   offset = PTRMAP_PTROFFSET(iPtrmap, key);
899   if( offset<0 ){
900     *pRC = SQLITE_CORRUPT_BKPT;
901     goto ptrmap_exit;
902   }
903   assert( offset <= (int)pBt->usableSize-5 );
904   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
905 
906   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
907     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
908     *pRC= rc = sqlite3PagerWrite(pDbPage);
909     if( rc==SQLITE_OK ){
910       pPtrmap[offset] = eType;
911       put4byte(&pPtrmap[offset+1], parent);
912     }
913   }
914 
915 ptrmap_exit:
916   sqlite3PagerUnref(pDbPage);
917 }
918 
919 /*
920 ** Read an entry from the pointer map.
921 **
922 ** This routine retrieves the pointer map entry for page 'key', writing
923 ** the type and parent page number to *pEType and *pPgno respectively.
924 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
925 */
926 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
927   DbPage *pDbPage;   /* The pointer map page */
928   int iPtrmap;       /* Pointer map page index */
929   u8 *pPtrmap;       /* Pointer map page data */
930   int offset;        /* Offset of entry in pointer map */
931   int rc;
932 
933   assert( sqlite3_mutex_held(pBt->mutex) );
934 
935   iPtrmap = PTRMAP_PAGENO(pBt, key);
936   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
937   if( rc!=0 ){
938     return rc;
939   }
940   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
941 
942   offset = PTRMAP_PTROFFSET(iPtrmap, key);
943   if( offset<0 ){
944     sqlite3PagerUnref(pDbPage);
945     return SQLITE_CORRUPT_BKPT;
946   }
947   assert( offset <= (int)pBt->usableSize-5 );
948   assert( pEType!=0 );
949   *pEType = pPtrmap[offset];
950   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
951 
952   sqlite3PagerUnref(pDbPage);
953   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
954   return SQLITE_OK;
955 }
956 
957 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* Stubs used when SQLITE_OMIT_AUTOVACUUM is defined: ptrmapPut() and
  ** ptrmapPutOvflPtr() expand to nothing and ptrmapGet() expands to the
  ** constant SQLITE_OK.  None of the macro arguments are evaluated. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, rc)
961 #endif
962 
/*
** Given a btree page and a cell index (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** findCellPastPtr() does the same except it skips past the initial
** 4-byte child pointer found on interior pages, if there is one.
**
** This routine works only for pages that do not contain overflow cells.
**
** Both macros read the 2-byte entry at (P)->aCellIdx[2*(I)], mask it
** with (P)->maskPage and add the result to a base pointer: (P)->aData
** for findCell() and (P)->aDataOfst for findCellPastPtr().  The P
** argument is expanded more than once, so it must not have side effects.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
#define findCellPastPtr(P,I) \
  ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
977 
978 
979 /*
980 ** This is common tail processing for btreeParseCellPtr() and
981 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
982 ** on a single B-tree page.  Make necessary adjustments to the CellInfo
983 ** structure.
984 */
985 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
986   MemPage *pPage,         /* Page containing the cell */
987   u8 *pCell,              /* Pointer to the cell text. */
988   CellInfo *pInfo         /* Fill in this structure */
989 ){
990   /* If the payload will not fit completely on the local page, we have
991   ** to decide how much to store locally and how much to spill onto
992   ** overflow pages.  The strategy is to minimize the amount of unused
993   ** space on overflow pages while keeping the amount of local storage
994   ** in between minLocal and maxLocal.
995   **
996   ** Warning:  changing the way overflow payload is distributed in any
997   ** way will result in an incompatible file format.
998   */
999   int minLocal;  /* Minimum amount of payload held locally */
1000   int maxLocal;  /* Maximum amount of payload held locally */
1001   int surplus;   /* Overflow payload available for local storage */
1002 
1003   minLocal = pPage->minLocal;
1004   maxLocal = pPage->maxLocal;
1005   surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
1006   testcase( surplus==maxLocal );
1007   testcase( surplus==maxLocal+1 );
1008   if( surplus <= maxLocal ){
1009     pInfo->nLocal = (u16)surplus;
1010   }else{
1011     pInfo->nLocal = (u16)minLocal;
1012   }
1013   pInfo->iOverflow = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell);
1014   pInfo->nSize = pInfo->iOverflow + 4;
1015 }
1016 
1017 /*
1018 ** The following routines are implementations of the MemPage.xParseCell()
1019 ** method.
1020 **
1021 ** Parse a cell content block and fill in the CellInfo structure.
1022 **
** btreeParseCellPtr()           =>   table btree leaf nodes
** btreeParseCellPtrNoPayload()  =>   table btree internal nodes
** btreeParseCellPtrIndex()      =>   index btree nodes
1026 **
1027 ** There is also a wrapper function btreeParseCell() that works for
1028 ** all MemPage types and that references the cell by index rather than
1029 ** by pointer.
1030 */
1031 static void btreeParseCellPtrNoPayload(
1032   MemPage *pPage,         /* Page containing the cell */
1033   u8 *pCell,              /* Pointer to the cell text. */
1034   CellInfo *pInfo         /* Fill in this structure */
1035 ){
1036   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1037   assert( pPage->leaf==0 );
1038   assert( pPage->noPayload );
1039   assert( pPage->childPtrSize==4 );
1040 #ifndef SQLITE_DEBUG
1041   UNUSED_PARAMETER(pPage);
1042 #endif
1043   pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
1044   pInfo->nPayload = 0;
1045   pInfo->nLocal = 0;
1046   pInfo->iOverflow = 0;
1047   pInfo->pPayload = 0;
1048   return;
1049 }
/*
** xParseCell() implementation for pages with intKeyLeaf set and no child
** pointer (see the assert() statements below).  The cell is read as a
** payload-size varint, followed by an integer-key varint, followed by
** the payload itself.
**
** On return pInfo->nKey, nPayload, pPayload, nLocal, iOverflow and nSize
** are all filled in.  If the payload is larger than pPage->maxLocal the
** last three are computed by btreeParseCellAdjustSizeForOverflow().
*/
static void btreeParseCellPtr(
  MemPage *pPage,         /* Page containing the cell */
  u8 *pCell,              /* Pointer to the cell text. */
  CellInfo *pInfo         /* Fill in this structure */
){
  u8 *pIter;              /* For scanning through pCell */
  u32 nPayload;           /* Number of bytes of cell payload */
  u64 iKey;               /* Extracted Key value */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->leaf==0 || pPage->leaf==1 );
  assert( pPage->intKeyLeaf || pPage->noPayload );
  assert( pPage->noPayload==0 );
  assert( pPage->intKeyLeaf );
  assert( pPage->childPtrSize==0 );
  pIter = pCell;

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint32(pIter, nPayload);
  **
  ** The code is inlined to avoid a function call.
  */
  nPayload = *pIter;
  if( nPayload>=0x80 ){
    /* Multi-byte varint: accumulate 7 bits per byte until a byte without
    ** the high bit is seen or pEnd is reached. */
    u8 *pEnd = &pIter[8];
    nPayload &= 0x7f;
    do{
      nPayload = (nPayload<<7) | (*++pIter & 0x7f);
    }while( (*pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;   /* Step past the final byte of the payload-size varint */

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
  **
  ** The code is inlined to avoid a function call.
  */
  iKey = *pIter;
  if( iKey>=0x80 ){
    u8 *pEnd = &pIter[7];
    iKey &= 0x7f;
    while(1){
      iKey = (iKey<<7) | (*++pIter & 0x7f);
      if( (*pIter)<0x80 ) break;
      if( pIter>=pEnd ){
        /* Ninth byte of the varint: all 8 bits are part of the value */
        iKey = (iKey<<8) | *++pIter;
        break;
      }
    }
  }
  pIter++;   /* pIter now points at the first byte of payload */

  pInfo->nKey = *(i64*)&iKey;   /* Reinterpret the unsigned bits as signed */
  pInfo->nPayload = nPayload;
  pInfo->pPayload = pIter;
  testcase( nPayload==pPage->maxLocal );
  testcase( nPayload==pPage->maxLocal+1 );
  if( nPayload<=pPage->maxLocal ){
    /* This is the (easy) common case where the entire payload fits
    ** on the local page.  No overflow is required.
    */
    pInfo->nSize = nPayload + (u16)(pIter - pCell);
    if( pInfo->nSize<4 ) pInfo->nSize = 4;   /* Cells are at least 4 bytes */
    pInfo->nLocal = (u16)nPayload;
    pInfo->iOverflow = 0;
  }else{
    btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
  }
}
1121 static void btreeParseCellPtrIndex(
1122   MemPage *pPage,         /* Page containing the cell */
1123   u8 *pCell,              /* Pointer to the cell text. */
1124   CellInfo *pInfo         /* Fill in this structure */
1125 ){
1126   u8 *pIter;              /* For scanning through pCell */
1127   u32 nPayload;           /* Number of bytes of cell payload */
1128 
1129   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1130   assert( pPage->leaf==0 || pPage->leaf==1 );
1131   assert( pPage->intKeyLeaf==0 );
1132   assert( pPage->noPayload==0 );
1133   pIter = pCell + pPage->childPtrSize;
1134   nPayload = *pIter;
1135   if( nPayload>=0x80 ){
1136     u8 *pEnd = &pIter[8];
1137     nPayload &= 0x7f;
1138     do{
1139       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1140     }while( *(pIter)>=0x80 && pIter<pEnd );
1141   }
1142   pIter++;
1143   pInfo->nKey = nPayload;
1144   pInfo->nPayload = nPayload;
1145   pInfo->pPayload = pIter;
1146   testcase( nPayload==pPage->maxLocal );
1147   testcase( nPayload==pPage->maxLocal+1 );
1148   if( nPayload<=pPage->maxLocal ){
1149     /* This is the (easy) common case where the entire payload fits
1150     ** on the local page.  No overflow is required.
1151     */
1152     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1153     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1154     pInfo->nLocal = (u16)nPayload;
1155     pInfo->iOverflow = 0;
1156   }else{
1157     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1158   }
1159 }
1160 static void btreeParseCell(
1161   MemPage *pPage,         /* Page containing the cell */
1162   int iCell,              /* The cell index.  First cell is 0 */
1163   CellInfo *pInfo         /* Fill in this structure */
1164 ){
1165   pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
1166 }
1167 
/*
** The following routines are implementations of the MemPage.xCellSize
** method.
**
** Compute the total number of bytes that a Cell needs in the cell
** data area of the btree-page.  The return number includes the cell
** data header and the local payload, but not any overflow page or
** the space used by the cell pointer.
**
** cellSizePtrNoPayload()    =>   table internal nodes
** cellSizePtr()             =>   all index nodes & table leaf nodes
**
** cellSizePtr() skips the child pointer (pPage->childPtrSize bytes),
** decodes the payload-size varint, skips the integer-key varint when
** pPage->intKey is set, and then applies the same local/overflow split
** arithmetic as btreeParseCellAdjustSizeForOverflow().
*/
static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
  u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
  u8 *pEnd;                                /* End mark for a varint */
  u32 nSize;                               /* Size value to return */

#ifdef SQLITE_DEBUG
  /* The value returned by this function should always be the same as
  ** the (CellInfo.nSize) value found by doing a full parse of the
  ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
  ** this function verifies that this invariant is not violated. */
  CellInfo debuginfo;
  pPage->xParseCell(pPage, pCell, &debuginfo);
#endif

  assert( pPage->noPayload==0 );
  /* Decode the payload-size varint into nSize: 7 bits per byte until a
  ** byte without the high bit is seen or pEnd is reached. */
  nSize = *pIter;
  if( nSize>=0x80 ){
    pEnd = &pIter[8];
    nSize &= 0x7f;
    do{
      nSize = (nSize<<7) | (*++pIter & 0x7f);
    }while( *(pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;
  if( pPage->intKey ){
    /* pIter now points at the 64-bit integer key value, a variable length
    ** integer. The following block moves pIter to point at the first byte
    ** past the end of the key value. */
    pEnd = &pIter[9];
    while( (*pIter++)&0x80 && pIter<pEnd );
  }
  testcase( nSize==pPage->maxLocal );
  testcase( nSize==pPage->maxLocal+1 );
  if( nSize<=pPage->maxLocal ){
    /* Entire payload is local: size is header bytes plus payload, with
    ** a floor of 4 bytes. */
    nSize += (u32)(pIter - pCell);
    if( nSize<4 ) nSize = 4;
  }else{
    /* Payload overflows: work out how many bytes stay local, then add
    ** the header bytes and the 4-byte overflow page number. */
    int minLocal = pPage->minLocal;
    nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
    testcase( nSize==pPage->maxLocal );
    testcase( nSize==pPage->maxLocal+1 );
    if( nSize>pPage->maxLocal ){
      nSize = minLocal;
    }
    nSize += 4 + (u16)(pIter - pCell);
  }
  assert( nSize==debuginfo.nSize || CORRUPT_DB );
  return (u16)nSize;
}
1229 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
1230   u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
1231   u8 *pEnd;              /* End mark for a varint */
1232 
1233 #ifdef SQLITE_DEBUG
1234   /* The value returned by this function should always be the same as
1235   ** the (CellInfo.nSize) value found by doing a full parse of the
1236   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1237   ** this function verifies that this invariant is not violated. */
1238   CellInfo debuginfo;
1239   pPage->xParseCell(pPage, pCell, &debuginfo);
1240 #else
1241   UNUSED_PARAMETER(pPage);
1242 #endif
1243 
1244   assert( pPage->childPtrSize==4 );
1245   pEnd = pIter + 9;
1246   while( (*pIter++)&0x80 && pIter<pEnd );
1247   assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
1248   return (u16)(pIter - pCell);
1249 }
1250 
1251 
#ifdef SQLITE_DEBUG
/* Index-based wrapper around the xCellSize method of a page.  This
** routine is used inside of assert() statements only. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return pPage->xCellSize(pPage, pCell);
}
#endif
1259 
1260 #ifndef SQLITE_OMIT_AUTOVACUUM
1261 /*
1262 ** If the cell pCell, part of page pPage contains a pointer
1263 ** to an overflow page, insert an entry into the pointer-map
1264 ** for the overflow page.
1265 */
1266 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1267   CellInfo info;
1268   if( *pRC ) return;
1269   assert( pCell!=0 );
1270   pPage->xParseCell(pPage, pCell, &info);
1271   if( info.iOverflow ){
1272     Pgno ovfl = get4byte(&pCell[info.iOverflow]);
1273     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1274   }
1275 }
1276 #endif
1277 
1278 
/*
** Defragment the page given.  All Cells are moved to the
** end of the page and all free space is collected into one
** big FreeBlk that occurs in between the header and cell
** pointer array and the cell content area.
**
** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
** b-tree page so that there are no freeblocks or fragment bytes, all
** unused bytes are contained in the unallocated space region, and all
** cells are packed tightly at the end of the page.
**
** Returns SQLITE_OK on success.  SQLITE_CORRUPT_BKPT is returned if a
** cell pointer lies outside the allowed range, if a cell extends past
** the usable area, if the packed cells would collide with the cell
** pointer array, or if the resulting free space does not match
** pPage->nFree.
*/
static int defragmentPage(MemPage *pPage){
  int i;                     /* Loop counter */
  int pc;                    /* Address of the i-th cell */
  int hdr;                   /* Offset to the page header */
  int size;                  /* Size of a cell */
  int usableSize;            /* Number of usable bytes on a page */
  int cellOffset;            /* Offset to the cell pointer array */
  int cbrk;                  /* Offset to the cell content area */
  int nCell;                 /* Number of cells on the page */
  unsigned char *data;       /* The page data */
  unsigned char *temp;       /* Temp area for cell content */
  unsigned char *src;        /* Source of content */
  int iCellFirst;            /* First allowable cell index */
  int iCellLast;             /* Last possible cell index */


  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  temp = 0;
  src = data = pPage->aData;
  hdr = pPage->hdrOffset;
  cellOffset = pPage->cellOffset;
  nCell = pPage->nCell;
  assert( nCell==get2byte(&data[hdr+3]) );
  usableSize = pPage->pBt->usableSize;
  cbrk = usableSize;         /* Cells are packed downward from the end */
  iCellFirst = cellOffset + 2*nCell;
  iCellLast = usableSize - 4;
  for(i=0; i<nCell; i++){
    u8 *pAddr;     /* The i-th cell pointer */
    pAddr = &data[cellOffset + i*2];
    pc = get2byte(pAddr);
    testcase( pc==iCellFirst );
    testcase( pc==iCellLast );
    /* These conditions have already been verified in btreeInitPage()
    ** if PRAGMA cell_size_check=ON.
    */
    if( pc<iCellFirst || pc>iCellLast ){
      return SQLITE_CORRUPT_BKPT;
    }
    assert( pc>=iCellFirst && pc<=iCellLast );
    size = pPage->xCellSize(pPage, &src[pc]);
    cbrk -= size;
    if( cbrk<iCellFirst || pc+size>usableSize ){
      return SQLITE_CORRUPT_BKPT;
    }
    assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
    testcase( cbrk+size==usableSize );
    testcase( pc+size==usableSize );
    put2byte(pAddr, cbrk);
    if( temp==0 ){
      /* No copy of the page has been made yet.  As long as each cell is
      ** already at its final position nothing needs to move.  On the
      ** first cell that must move, save the content from the current
      ** start of the cell content area through the end of the slot just
      ** assigned into the pager temp space, and read all remaining cells
      ** from that saved copy so that writes into data[] cannot clobber
      ** cells that have not been moved yet. */
      int x;
      if( cbrk==pc ) continue;
      temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
      x = get2byte(&data[hdr+5]);
      memcpy(&temp[x], &data[x], (cbrk+size) - x);
      src = temp;
    }
    memcpy(&data[cbrk], &src[pc], size);
  }
  assert( cbrk>=iCellFirst );
  put2byte(&data[hdr+5], cbrk);   /* New start of the cell content area */
  data[hdr+1] = 0;                /* The freeblock list is now empty... */
  data[hdr+2] = 0;
  data[hdr+7] = 0;                /* ...and there are no fragment bytes */
  memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  /* All free space is now in the single gap, so its size must agree
  ** with the free-byte count recorded for the page. */
  if( cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_BKPT;
  }
  return SQLITE_OK;
}
1365 
/*
** Search the free-list on page pPg for space to store a cell nByte bytes in
** size. If one can be found, return a pointer to the space and remove it
** from the free-list.
**
** If no suitable space can be found on the free-list, return NULL.
**
** This function may detect corruption within pPg.  If corruption is
** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
** *pRc is not written in any other case.
**
** Slots on the free list that are between 1 and 3 bytes larger than nByte
** will be ignored if adding the extra space to the fragmentation count
** causes the fragmentation count to exceed 60.
**
** The first freeblock whose size is at least nByte is the one used.  If
** it is less than 4 bytes larger than nByte the whole freeblock is
** unlinked and the excess is added to the fragment count.  Otherwise the
** freeblock is shrunk and the allocation is taken from its tail.
*/
static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
  const int hdr = pPg->hdrOffset;
  u8 * const aData = pPg->aData;
  int iAddr = hdr + 1;                 /* Address of the pointer to pc */
  int pc = get2byte(&aData[iAddr]);    /* Address of the current freeblock */
  int x;                               /* Bytes left over in the chosen slot */
  int usableSize = pPg->pBt->usableSize;

  assert( pc>0 );
  do{
    int size;            /* Size of the free slot */
    /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
    ** increasing offset. */
    if( pc>usableSize-4 || pc<iAddr+4 ){
      *pRc = SQLITE_CORRUPT_BKPT;
      return 0;
    }
    /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
    ** freeblock form a big-endian integer which is the size of the freeblock
    ** in bytes, including the 4-byte header. */
    size = get2byte(&aData[pc+2]);
    if( (x = size - nByte)>=0 ){
      testcase( x==4 );
      testcase( x==3 );
      if( pc < pPg->cellOffset+2*pPg->nCell || size+pc > usableSize ){
        /* Freeblock overlaps the cell pointer array or runs off the page */
        *pRc = SQLITE_CORRUPT_BKPT;
        return 0;
      }else if( x<4 ){
        /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
        ** number of bytes in fragments may not exceed 60. */
        if( aData[hdr+7]>57 ) return 0;

        /* Remove the slot from the free-list. Update the number of
        ** fragmented bytes within the page. */
        memcpy(&aData[iAddr], &aData[pc], 2);
        aData[hdr+7] += (u8)x;
      }else{
        /* The slot remains on the free-list. Reduce its size to account
        ** for the portion used by the new allocation. */
        put2byte(&aData[pc+2], x);
      }
      /* The allocation occupies the last nByte bytes of the slot */
      return &aData[pc + x];
    }
    /* Slot too small: advance to the next freeblock on the list */
    iAddr = pc;
    pc = get2byte(&aData[pc]);
  }while( pc );

  return 0;
}
1429 
1430 /*
1431 ** Allocate nByte bytes of space from within the B-Tree page passed
1432 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1433 ** of the first byte of allocated space. Return either SQLITE_OK or
1434 ** an error code (usually SQLITE_CORRUPT).
1435 **
1436 ** The caller guarantees that there is sufficient space to make the
1437 ** allocation.  This routine might need to defragment in order to bring
1438 ** all the space together, however.  This routine will avoid using
1439 ** the first two bytes past the cell pointer area since presumably this
1440 ** allocation is being made in order to insert a new cell, so we will
1441 ** also end up needing a new cell pointer.
1442 */
1443 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1444   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
1445   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
1446   int top;                             /* First byte of cell content area */
1447   int rc = SQLITE_OK;                  /* Integer return code */
1448   int gap;        /* First byte of gap between cell pointers and cell content */
1449 
1450   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1451   assert( pPage->pBt );
1452   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1453   assert( nByte>=0 );  /* Minimum cell size is 4 */
1454   assert( pPage->nFree>=nByte );
1455   assert( pPage->nOverflow==0 );
1456   assert( nByte < (int)(pPage->pBt->usableSize-8) );
1457 
1458   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1459   gap = pPage->cellOffset + 2*pPage->nCell;
1460   assert( gap<=65536 );
1461   /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
1462   ** and the reserved space is zero (the usual value for reserved space)
1463   ** then the cell content offset of an empty page wants to be 65536.
1464   ** However, that integer is too large to be stored in a 2-byte unsigned
1465   ** integer, so a value of 0 is used in its place. */
1466   top = get2byte(&data[hdr+5]);
1467   assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */
1468   if( gap>top ){
1469     if( top==0 && pPage->pBt->usableSize==65536 ){
1470       top = 65536;
1471     }else{
1472       return SQLITE_CORRUPT_BKPT;
1473     }
1474   }
1475 
1476   /* If there is enough space between gap and top for one more cell pointer
1477   ** array entry offset, and if the freelist is not empty, then search the
1478   ** freelist looking for a free slot big enough to satisfy the request.
1479   */
1480   testcase( gap+2==top );
1481   testcase( gap+1==top );
1482   testcase( gap==top );
1483   if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
1484     u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
1485     if( pSpace ){
1486       assert( pSpace>=data && (pSpace - data)<65536 );
1487       *pIdx = (int)(pSpace - data);
1488       return SQLITE_OK;
1489     }else if( rc ){
1490       return rc;
1491     }
1492   }
1493 
1494   /* The request could not be fulfilled using a freelist slot.  Check
1495   ** to see if defragmentation is necessary.
1496   */
1497   testcase( gap+2+nByte==top );
1498   if( gap+2+nByte>top ){
1499     assert( pPage->nCell>0 || CORRUPT_DB );
1500     rc = defragmentPage(pPage);
1501     if( rc ) return rc;
1502     top = get2byteNotZero(&data[hdr+5]);
1503     assert( gap+nByte<=top );
1504   }
1505 
1506 
1507   /* Allocate memory from the gap in between the cell pointer array
1508   ** and the cell content area.  The btreeInitPage() call has already
1509   ** validated the freelist.  Given that the freelist is valid, there
1510   ** is no way that the allocation can extend off the end of the page.
1511   ** The assert() below verifies the previous sentence.
1512   */
1513   top -= nByte;
1514   put2byte(&data[hdr+5], top);
1515   assert( top+nByte <= (int)pPage->pBt->usableSize );
1516   *pIdx = top;
1517   return SQLITE_OK;
1518 }
1519 
/*
** Return a section of the pPage->aData to the freelist.
** The first byte of the new free block is pPage->aData[iStart]
** and the size of the block is iSize bytes.
**
** Adjacent freeblocks are coalesced.
**
** Note that even though the freeblock list was checked by btreeInitPage(),
** that routine will not detect overlap between cells or freeblocks.  Nor
** does it detect cells or freeblocks that encroach into the reserved bytes
** at the end of the page.  So do additional corruption checks inside this
** routine and return SQLITE_CORRUPT if any problems are found.
**
** On success SQLITE_OK is returned and pPage->nFree is increased by the
** original value of iSize.
*/
static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
  u16 iPtr;                             /* Address of ptr to next freeblock */
  u16 iFreeBlk;                         /* Address of the next freeblock */
  u8 hdr;                               /* Page header size.  0 or 100 */
  u8 nFrag = 0;                         /* Reduction in fragmentation */
  u16 iOrigSize = iSize;                /* Original value of iSize */
  u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */
  u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
  unsigned char *data = pPage->aData;   /* Page content */

  assert( pPage->pBt!=0 );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
  assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( iSize>=4 );   /* Minimum cell size is 4 */
  assert( iStart<=iLast );

  /* Overwrite deleted information with zeros when the secure_delete
  ** option is enabled */
  if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
    memset(&data[iStart], 0, iSize);
  }

  /* The list of freeblocks must be in ascending order.  Find the
  ** spot on the list where iStart should be inserted.
  */
  hdr = pPage->hdrOffset;
  iPtr = hdr + 1;
  if( data[iPtr+1]==0 && data[iPtr]==0 ){
    iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
  }else{
    while( (iFreeBlk = get2byte(&data[iPtr]))>0 && iFreeBlk<iStart ){
      /* Each freeblock must begin at least 4 bytes past its predecessor */
      if( iFreeBlk<iPtr+4 ) return SQLITE_CORRUPT_BKPT;
      iPtr = iFreeBlk;
    }
    if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT;
    assert( iFreeBlk>iPtr || iFreeBlk==0 );

    /* At this point:
    **    iFreeBlk:   First freeblock after iStart, or zero if none
    **    iPtr:       The address of a pointer to iFreeBlk
    **
    ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
    */
    if( iFreeBlk && iEnd+3>=iFreeBlk ){
      /* nFrag is computed before the overlap check on the next line.  If
      ** the blocks overlap, this routine returns and nFrag is not used. */
      nFrag = iFreeBlk - iEnd;
      if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT;
      iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
      if( iEnd > pPage->pBt->usableSize ) return SQLITE_CORRUPT_BKPT;
      iSize = iEnd - iStart;
      iFreeBlk = get2byte(&data[iFreeBlk]);
    }

    /* If iPtr is another freeblock (that is, if iPtr is not the freelist
    ** pointer in the page header) then check to see if iStart should be
    ** coalesced onto the end of iPtr.
    */
    if( iPtr>hdr+1 ){
      int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
      if( iPtrEnd+3>=iStart ){
        if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT;
        nFrag += iStart - iPtrEnd;
        iSize = iEnd - iPtr;
        iStart = iPtr;
      }
    }
    /* Fragment bytes absorbed by coalescing are removed from the count
    ** kept in the page header. */
    if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT;
    data[hdr+7] -= nFrag;
  }
  if( iStart==get2byte(&data[hdr+5]) ){
    /* The new freeblock is at the beginning of the cell content area,
    ** so just extend the cell content area rather than create another
    ** freelist entry */
    if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT;
    put2byte(&data[hdr+1], iFreeBlk);
    put2byte(&data[hdr+5], iEnd);
  }else{
    /* Insert the new freeblock into the freelist */
    put2byte(&data[iPtr], iStart);
    put2byte(&data[iStart], iFreeBlk);
    put2byte(&data[iStart+2], iSize);
  }
  /* Only the bytes handed in by the caller are newly free */
  pPage->nFree += iOrigSize;
  return SQLITE_OK;
}
1619 
1620 /*
1621 ** Decode the flags byte (the first byte of the header) for a page
1622 ** and initialize fields of the MemPage structure accordingly.
1623 **
1624 ** Only the following combinations are supported.  Anything different
1625 ** indicates a corrupt database files:
1626 **
1627 **         PTF_ZERODATA
1628 **         PTF_ZERODATA | PTF_LEAF
1629 **         PTF_LEAFDATA | PTF_INTKEY
1630 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1631 */
1632 static int decodeFlags(MemPage *pPage, int flagByte){
1633   BtShared *pBt;     /* A copy of pPage->pBt */
1634 
1635   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1636   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1637   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1638   flagByte &= ~PTF_LEAF;
1639   pPage->childPtrSize = 4-4*pPage->leaf;
1640   pPage->xCellSize = cellSizePtr;
1641   pBt = pPage->pBt;
1642   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1643     /* EVIDENCE-OF: R-03640-13415 A value of 5 means the page is an interior
1644     ** table b-tree page. */
1645     assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
1646     /* EVIDENCE-OF: R-20501-61796 A value of 13 means the page is a leaf
1647     ** table b-tree page. */
1648     assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
1649     pPage->intKey = 1;
1650     if( pPage->leaf ){
1651       pPage->intKeyLeaf = 1;
1652       pPage->noPayload = 0;
1653       pPage->xParseCell = btreeParseCellPtr;
1654     }else{
1655       pPage->intKeyLeaf = 0;
1656       pPage->noPayload = 1;
1657       pPage->xCellSize = cellSizePtrNoPayload;
1658       pPage->xParseCell = btreeParseCellPtrNoPayload;
1659     }
1660     pPage->maxLocal = pBt->maxLeaf;
1661     pPage->minLocal = pBt->minLeaf;
1662   }else if( flagByte==PTF_ZERODATA ){
1663     /* EVIDENCE-OF: R-27225-53936 A value of 2 means the page is an interior
1664     ** index b-tree page. */
1665     assert( (PTF_ZERODATA)==2 );
1666     /* EVIDENCE-OF: R-16571-11615 A value of 10 means the page is a leaf
1667     ** index b-tree page. */
1668     assert( (PTF_ZERODATA|PTF_LEAF)==10 );
1669     pPage->intKey = 0;
1670     pPage->intKeyLeaf = 0;
1671     pPage->noPayload = 0;
1672     pPage->xParseCell = btreeParseCellPtrIndex;
1673     pPage->maxLocal = pBt->maxLocal;
1674     pPage->minLocal = pBt->minLocal;
1675   }else{
1676     /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
1677     ** an error. */
1678     return SQLITE_CORRUPT_BKPT;
1679   }
1680   pPage->max1bytePayload = pBt->max1bytePayload;
1681   return SQLITE_OK;
1682 }
1683 
/*
** Initialize the auxiliary information for a disk block.
**
** If pPage->isInit is already set, nothing is done and SQLITE_OK is
** returned.  Otherwise the page header located at
** pPage->aData[pPage->hdrOffset] is decoded and the derived MemPage
** fields (maskPage, nOverflow, cellOffset, aDataEnd, aCellIdx, aDataOfst,
** nCell, nFree, plus everything set by decodeFlags()) are filled in, and
** finally isInit is set to 1.
**
** Return SQLITE_OK on success.  If we see that the page does
** not contain a well-formed database page, then return
** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
** guarantee that the page is well-formed.  It only shows that
** we failed to detect any corruption.
**
** When SQLITE_CORRUPT is returned, isInit is left at zero but some of
** the MemPage fields listed above may already have been overwritten.
*/
static int btreeInitPage(MemPage *pPage){

  assert( pPage->pBt!=0 );
  assert( pPage->pBt->db!=0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
  assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
  assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );

  if( !pPage->isInit ){
    u16 pc;            /* Address of a freeblock within pPage->aData[] */
    u8 hdr;            /* Offset to beginning of page header */
    u8 *data;          /* Equal to pPage->aData */
    BtShared *pBt;        /* The main btree structure */
    int usableSize;    /* Amount of usable space on each page */
    u16 cellOffset;    /* Offset from start of page to first cell pointer */
    int nFree;         /* Number of unused bytes on the page */
    int top;           /* First byte of the cell content area */
    int iCellFirst;    /* First allowable cell or freeblock offset */
    int iCellLast;     /* Last possible cell or freeblock offset */

    pBt = pPage->pBt;

    hdr = pPage->hdrOffset;
    data = pPage->aData;
    /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
    ** the b-tree page type. */
    if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
    assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
    pPage->maskPage = (u16)(pBt->pageSize - 1);
    pPage->nOverflow = 0;
    usableSize = pBt->usableSize;
    /* The cell pointer array follows the 8-byte header, plus the 4-byte
    ** right-child pointer when childPtrSize is 4 (non-leaf pages). */
    pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
    pPage->aDataEnd = &data[usableSize];
    pPage->aCellIdx = &data[cellOffset];
    pPage->aDataOfst = &data[pPage->childPtrSize];
    /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
    ** the start of the cell content area. A zero value for this integer is
    ** interpreted as 65536. */
    top = get2byteNotZero(&data[hdr+5]);
    /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
    ** number of cells on the page. */
    pPage->nCell = get2byte(&data[hdr+3]);
    if( pPage->nCell>MX_CELL(pBt) ){
      /* Too many cells for a single page.  The page must be corrupt */
      return SQLITE_CORRUPT_BKPT;
    }
    testcase( pPage->nCell==MX_CELL(pBt) );
    /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
    ** possible for a root page of a table that contains no rows) then the
    ** offset to the cell content area will equal the page size minus the
    ** bytes of reserved space. */
    assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );

    /* A malformed database page might cause us to read past the end
    ** of page when parsing a cell.
    **
    ** The following block of code checks early to see if a cell extends
    ** past the end of a page boundary and causes SQLITE_CORRUPT to be
    ** returned if it does.
    **
    ** The per-cell scan only runs when the SQLITE_CellSizeCk flag is set
    ** on the database connection.
    */
    iCellFirst = cellOffset + 2*pPage->nCell;
    iCellLast = usableSize - 4;
    if( pBt->db->flags & SQLITE_CellSizeCk ){
      int i;            /* Index into the cell pointer array */
      int sz;           /* Size of a cell */

      /* On non-leaf pages the last permitted cell offset is one byte
      ** lower for the duration of the cell scan; it is restored after
      ** the loop so the freeblock scan below uses the original limit. */
      if( !pPage->leaf ) iCellLast--;
      for(i=0; i<pPage->nCell; i++){
        pc = get2byteAligned(&data[cellOffset+i*2]);
        testcase( pc==iCellFirst );
        testcase( pc==iCellLast );
        if( pc<iCellFirst || pc>iCellLast ){
          return SQLITE_CORRUPT_BKPT;
        }
        sz = pPage->xCellSize(pPage, &data[pc]);
        testcase( pc+sz==usableSize );
        if( pc+sz>usableSize ){
          return SQLITE_CORRUPT_BKPT;
        }
      }
      if( !pPage->leaf ) iCellLast++;
    }

    /* Compute the total free space on the page
    ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
    ** start of the first freeblock on the page, or is zero if there are no
    ** freeblocks. */
    pc = get2byte(&data[hdr+1]);
    nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
    /* Walk the freeblock chain.  Each freeblock begins with a 2-byte
    ** offset of the next freeblock followed by a 2-byte size. */
    while( pc>0 ){
      u16 next, size;
      if( pc<iCellFirst || pc>iCellLast ){
        /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
        ** always be at least one cell before the first freeblock.
        **
        ** Or, the freeblock is off the end of the page
        */
        return SQLITE_CORRUPT_BKPT;
      }
      next = get2byte(&data[pc]);
      size = get2byte(&data[pc+2]);
      if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
        /* Free blocks must be in ascending order. And the last byte of
        ** the free-block must lie on the database page.  */
        return SQLITE_CORRUPT_BKPT;
      }
      nFree = nFree + size;
      pc = next;
    }

    /* At this point, nFree contains the sum of the offset to the start
    ** of the cell-content area plus the number of free bytes within
    ** the cell-content area. If this is greater than the usable-size
    ** of the page, then the page must be corrupted. This check also
    ** serves to verify that the offset to the start of the cell-content
    ** area, according to the page header, lies within the page.
    */
    if( nFree>usableSize ){
      return SQLITE_CORRUPT_BKPT;
    }
    /* Subtract the header and cell pointer array (everything before
    ** iCellFirst) to obtain the number of free bytes. */
    pPage->nFree = (u16)(nFree - iCellFirst);
    pPage->isInit = 1;
  }
  return SQLITE_OK;
}
1819 
1820 /*
1821 ** Set up a raw page so that it looks like a database page holding
1822 ** no entries.
1823 */
1824 static void zeroPage(MemPage *pPage, int flags){
1825   unsigned char *data = pPage->aData;
1826   BtShared *pBt = pPage->pBt;
1827   u8 hdr = pPage->hdrOffset;
1828   u16 first;
1829 
1830   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1831   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1832   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1833   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1834   assert( sqlite3_mutex_held(pBt->mutex) );
1835   if( pBt->btsFlags & BTS_SECURE_DELETE ){
1836     memset(&data[hdr], 0, pBt->usableSize - hdr);
1837   }
1838   data[hdr] = (char)flags;
1839   first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
1840   memset(&data[hdr+1], 0, 4);
1841   data[hdr+7] = 0;
1842   put2byte(&data[hdr+5], pBt->usableSize);
1843   pPage->nFree = (u16)(pBt->usableSize - first);
1844   decodeFlags(pPage, flags);
1845   pPage->cellOffset = first;
1846   pPage->aDataEnd = &data[pBt->usableSize];
1847   pPage->aCellIdx = &data[first];
1848   pPage->aDataOfst = &data[pPage->childPtrSize];
1849   pPage->nOverflow = 0;
1850   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1851   pPage->maskPage = (u16)(pBt->pageSize - 1);
1852   pPage->nCell = 0;
1853   pPage->isInit = 1;
1854 }
1855 
1856 
1857 /*
1858 ** Convert a DbPage obtained from the pager into a MemPage used by
1859 ** the btree layer.
1860 */
1861 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1862   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1863   pPage->aData = sqlite3PagerGetData(pDbPage);
1864   pPage->pDbPage = pDbPage;
1865   pPage->pBt = pBt;
1866   pPage->pgno = pgno;
1867   pPage->hdrOffset = pgno==1 ? 100 : 0;
1868   return pPage;
1869 }
1870 
1871 /*
1872 ** Get a page from the pager.  Initialize the MemPage.pBt and
1873 ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
1874 **
1875 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
1876 ** about the content of the page at this time.  So do not go to the disk
1877 ** to fetch the content.  Just fill in the content with zeros for now.
1878 ** If in the future we call sqlite3PagerWrite() on this page, that
1879 ** means we have started to be concerned about content and the disk
1880 ** read should occur at that point.
1881 */
1882 static int btreeGetPage(
1883   BtShared *pBt,       /* The btree */
1884   Pgno pgno,           /* Number of the page to fetch */
1885   MemPage **ppPage,    /* Return the page in this parameter */
1886   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
1887 ){
1888   int rc;
1889   DbPage *pDbPage;
1890 
1891   assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
1892   assert( sqlite3_mutex_held(pBt->mutex) );
1893   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
1894   if( rc ) return rc;
1895   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1896   return SQLITE_OK;
1897 }
1898 
1899 /*
1900 ** Retrieve a page from the pager cache. If the requested page is not
1901 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1902 ** MemPage.aData elements if needed.
1903 */
1904 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1905   DbPage *pDbPage;
1906   assert( sqlite3_mutex_held(pBt->mutex) );
1907   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1908   if( pDbPage ){
1909     return btreePageFromDbPage(pDbPage, pgno, pBt);
1910   }
1911   return 0;
1912 }
1913 
1914 /*
1915 ** Return the size of the database file in pages. If there is any kind of
1916 ** error, return ((unsigned int)-1).
1917 */
1918 static Pgno btreePagecount(BtShared *pBt){
1919   return pBt->nPage;
1920 }
1921 u32 sqlite3BtreeLastPage(Btree *p){
1922   assert( sqlite3BtreeHoldsMutex(p) );
1923   assert( ((p->pBt->nPage)&0x8000000)==0 );
1924   return btreePagecount(p->pBt);
1925 }
1926 
1927 /*
1928 ** Get a page from the pager and initialize it.
1929 **
1930 ** If pCur!=0 then the page is being fetched as part of a moveToChild()
1931 ** call.  Do additional sanity checking on the page in this case.
1932 ** And if the fetch fails, this routine must decrement pCur->iPage.
1933 **
1934 ** The page is fetched as read-write unless pCur is not NULL and is
1935 ** a read-only cursor.
1936 **
1937 ** If an error occurs, then *ppPage is undefined. It
1938 ** may remain unchanged, or it may be set to an invalid value.
1939 */
1940 static int getAndInitPage(
1941   BtShared *pBt,                  /* The database file */
1942   Pgno pgno,                      /* Number of the page to get */
1943   MemPage **ppPage,               /* Write the page pointer here */
1944   BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
1945   int bReadOnly                   /* True for a read-only page */
1946 ){
1947   int rc;
1948   DbPage *pDbPage;
1949   assert( sqlite3_mutex_held(pBt->mutex) );
1950   assert( pCur==0 || ppPage==&pCur->apPage[pCur->iPage] );
1951   assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
1952   assert( pCur==0 || pCur->iPage>0 );
1953 
1954   if( pgno>btreePagecount(pBt) ){
1955     rc = SQLITE_CORRUPT_BKPT;
1956     goto getAndInitPage_error;
1957   }
1958   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
1959   if( rc ){
1960     goto getAndInitPage_error;
1961   }
1962   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1963   if( (*ppPage)->isInit==0 ){
1964     rc = btreeInitPage(*ppPage);
1965     if( rc!=SQLITE_OK ){
1966       releasePage(*ppPage);
1967       goto getAndInitPage_error;
1968     }
1969   }
1970 
1971   /* If obtaining a child page for a cursor, we must verify that the page is
1972   ** compatible with the root page. */
1973   if( pCur
1974    && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey)
1975   ){
1976     rc = SQLITE_CORRUPT_BKPT;
1977     releasePage(*ppPage);
1978     goto getAndInitPage_error;
1979   }
1980   return SQLITE_OK;
1981 
1982 getAndInitPage_error:
1983   if( pCur ) pCur->iPage--;
1984   testcase( pgno==0 );
1985   assert( pgno!=0 || rc==SQLITE_CORRUPT );
1986   return rc;
1987 }
1988 
1989 /*
1990 ** Release a MemPage.  This should be called once for each prior
1991 ** call to btreeGetPage.
1992 */
1993 static void releasePageNotNull(MemPage *pPage){
1994   assert( pPage->aData );
1995   assert( pPage->pBt );
1996   assert( pPage->pDbPage!=0 );
1997   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1998   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1999   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2000   sqlite3PagerUnrefNotNull(pPage->pDbPage);
2001 }
2002 static void releasePage(MemPage *pPage){
2003   if( pPage ) releasePageNotNull(pPage);
2004 }
2005 
2006 /*
2007 ** Get an unused page.
2008 **
2009 ** This works just like btreeGetPage() with the addition:
2010 **
2011 **   *  If the page is already in use for some other purpose, immediately
2012 **      release it and return an SQLITE_CURRUPT error.
2013 **   *  Make sure the isInit flag is clear
2014 */
2015 static int btreeGetUnusedPage(
2016   BtShared *pBt,       /* The btree */
2017   Pgno pgno,           /* Number of the page to fetch */
2018   MemPage **ppPage,    /* Return the page in this parameter */
2019   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2020 ){
2021   int rc = btreeGetPage(pBt, pgno, ppPage, flags);
2022   if( rc==SQLITE_OK ){
2023     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
2024       releasePage(*ppPage);
2025       *ppPage = 0;
2026       return SQLITE_CORRUPT_BKPT;
2027     }
2028     (*ppPage)->isInit = 0;
2029   }else{
2030     *ppPage = 0;
2031   }
2032   return rc;
2033 }
2034 
2035 
2036 /*
2037 ** During a rollback, when the pager reloads information into the cache
2038 ** so that the cache is restored to its original state at the start of
2039 ** the transaction, for each page restored this routine is called.
2040 **
2041 ** This routine needs to reset the extra data section at the end of the
2042 ** page to agree with the restored data.
2043 */
2044 static void pageReinit(DbPage *pData){
2045   MemPage *pPage;
2046   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
2047   assert( sqlite3PagerPageRefcount(pData)>0 );
2048   if( pPage->isInit ){
2049     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2050     pPage->isInit = 0;
2051     if( sqlite3PagerPageRefcount(pData)>1 ){
2052       /* pPage might not be a btree page;  it might be an overflow page
2053       ** or ptrmap page or a free page.  In those cases, the following
2054       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
2055       ** But no harm is done by this.  And it is very important that
2056       ** btreeInitPage() be called on every btree page so we make
2057       ** the call for every page that comes in for re-initing. */
2058       btreeInitPage(pPage);
2059     }
2060   }
2061 }
2062 
2063 /*
2064 ** Invoke the busy handler for a btree.
2065 */
2066 static int btreeInvokeBusyHandler(void *pArg){
2067   BtShared *pBt = (BtShared*)pArg;
2068   assert( pBt->db );
2069   assert( sqlite3_mutex_held(pBt->db->mutex) );
2070   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
2071 }
2072 
/*
** Open a database file.
**
** zFilename is the name of the database file.  If zFilename is NULL
** then an ephemeral database is created.  The ephemeral database might
** be exclusively in memory, or it might use a disk-based memory cache.
** Either way, the ephemeral database will be automatically deleted
** when sqlite3BtreeClose() is called.
**
** If zFilename is ":memory:" then an in-memory database is created
** that is automatically destroyed when it is closed.
**
** The "flags" parameter is a bitmask that might contain bits like
** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
**
** If the database is already opened in the same database connection
** and we are in shared cache mode, then the open will fail with an
** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
** objects in the same database connection since doing so will lead
** to problems with locking.
**
** On success SQLITE_OK is returned and the new handle is written to
** *ppBtree.  On failures that pass through the btree_open_out label,
** *ppBtree is set to NULL and everything allocated here is released.
** The early "return" statements in the shared-cache lookup (out of
** memory, full-pathname failure, SQLITE_CONSTRAINT) free their
** allocations but leave *ppBtree untouched.
*/
int sqlite3BtreeOpen(
  sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
  const char *zFilename,  /* Name of the file containing the BTree database */
  sqlite3 *db,            /* Associated database handle */
  Btree **ppBtree,        /* Pointer to new Btree object written here */
  int flags,              /* Options */
  int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
){
  BtShared *pBt = 0;             /* Shared part of btree structure */
  Btree *p;                      /* Handle to return */
  sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
  int rc = SQLITE_OK;            /* Result code from this function */
  u8 nReserve;                   /* Byte of unused space on each page */
  unsigned char zDbHeader[100];  /* Database header content */

  /* True if opening an ephemeral, temporary database */
  const int isTempDb = zFilename==0 || zFilename[0]==0;

  /* Set the variable isMemdb to true for an in-memory database, or
  ** false for a file-based database.
  */
#ifdef SQLITE_OMIT_MEMORYDB
  const int isMemdb = 0;
#else
  const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
                       || (isTempDb && sqlite3TempInMemory(db))
                       || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
#endif

  assert( db!=0 );
  assert( pVfs!=0 );
  assert( sqlite3_mutex_held(db->mutex) );
  assert( (flags&0xff)==flags );   /* flags fit in 8 bits */

  /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
  assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );

  /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
  assert( (flags & BTREE_SINGLE)==0 || isTempDb );

  if( isMemdb ){
    flags |= BTREE_MEMORY;
  }
  /* An in-memory or temporary database is opened through the VFS as a
  ** temp database rather than as a main database. */
  if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
    vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
  }
  p = sqlite3MallocZero(sizeof(Btree));
  if( !p ){
    return SQLITE_NOMEM;
  }
  p->inTrans = TRANS_NONE;
  p->db = db;
#ifndef SQLITE_OMIT_SHARED_CACHE
  p->lock.pBtree = p;
  p->lock.iTable = 1;
#endif

#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  /*
  ** If this Btree is a candidate for shared cache, try to find an
  ** existing BtShared object that we can share with
  */
  if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
    if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
      int nFilename = sqlite3Strlen30(zFilename)+1;
      int nFullPathname = pVfs->mxPathname+1;
      char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
      MUTEX_LOGIC( sqlite3_mutex *mutexShared; )

      p->sharable = 1;
      /* No mutex has been taken yet, so the two error returns below can
      ** simply free what was allocated and return. */
      if( !zFullPathname ){
        sqlite3_free(p);
        return SQLITE_NOMEM;
      }
      if( isMemdb ){
        /* In-memory databases are matched by their name as given */
        memcpy(zFullPathname, zFilename, nFilename);
      }else{
        rc = sqlite3OsFullPathname(pVfs, zFilename,
                                   nFullPathname, zFullPathname);
        if( rc ){
          sqlite3_free(zFullPathname);
          sqlite3_free(p);
          return rc;
        }
      }
#if SQLITE_THREADSAFE
      /* mutexOpen stays held until the end of this function; mutexShared
      ** is held only while the shared-cache list is being searched. */
      mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
      sqlite3_mutex_enter(mutexOpen);
      mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
      sqlite3_mutex_enter(mutexShared);
#endif
      /* Search the global list for a BtShared with the same full
      ** pathname and the same VFS. */
      for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
        assert( pBt->nRef>0 );
        if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
                 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
          int iDb;
          /* Refuse to attach the same BtShared twice to one connection.
          ** This return bypasses btree_open_out, so both mutexes are
          ** released here. */
          for(iDb=db->nDb-1; iDb>=0; iDb--){
            Btree *pExisting = db->aDb[iDb].pBt;
            if( pExisting && pExisting->pBt==pBt ){
              sqlite3_mutex_leave(mutexShared);
              sqlite3_mutex_leave(mutexOpen);
              sqlite3_free(zFullPathname);
              sqlite3_free(p);
              return SQLITE_CONSTRAINT;
            }
          }
          p->pBt = pBt;
          pBt->nRef++;
          break;
        }
      }
      sqlite3_mutex_leave(mutexShared);
      sqlite3_free(zFullPathname);
    }
#ifdef SQLITE_DEBUG
    else{
      /* In debug mode, we mark all persistent databases as sharable
      ** even when they are not.  This exercises the locking code and
      ** gives more opportunity for asserts(sqlite3_mutex_held())
      ** statements to find locking problems.
      */
      p->sharable = 1;
    }
#endif
  }
#endif
  /* pBt is still NULL if no existing BtShared was found (or shared
  ** cache does not apply): create a fresh one. */
  if( pBt==0 ){
    /*
    ** The following asserts make sure that structures used by the btree are
    ** the right size.  This is to guard against size changes that result
    ** when compiling on a different architecture.
    */
    assert( sizeof(i64)==8 );
    assert( sizeof(u64)==8 );
    assert( sizeof(u32)==4 );
    assert( sizeof(u16)==2 );
    assert( sizeof(Pgno)==4 );

    pBt = sqlite3MallocZero( sizeof(*pBt) );
    if( pBt==0 ){
      rc = SQLITE_NOMEM;
      goto btree_open_out;
    }
    rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
                          EXTRA_SIZE, flags, vfsFlags, pageReinit);
    if( rc==SQLITE_OK ){
      sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
      rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
    }
    if( rc!=SQLITE_OK ){
      goto btree_open_out;
    }
    pBt->openFlags = (u8)flags;
    pBt->db = db;
    sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
    p->pBt = pBt;

    pBt->pCursor = 0;
    pBt->pPage1 = 0;
    if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
#ifdef SQLITE_SECURE_DELETE
    pBt->btsFlags |= BTS_SECURE_DELETE;
#endif
    /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
    ** determined by the 2-byte integer located at an offset of 16 bytes from
    ** the beginning of the database file.
    **
    ** Header byte 16 supplies bits 8..15 and byte 17 supplies bits 16..23
    ** of the page size, so the byte pair 0x00,0x01 decodes to 65536. */
    pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
    if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
         || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
      /* The header does not hold a usable page size (out of range or
      ** not a power of two).  Pass 0 on to sqlite3PagerSetPagesize()
      ** below and reserve no bytes. */
      pBt->pageSize = 0;
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the magic name ":memory:" will create an in-memory database, then
      ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
      ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
      ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
      ** regular file-name. In this case the auto-vacuum applies as per normal.
      */
      if( zFilename && !isMemdb ){
        pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
        pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
      }
#endif
      nReserve = 0;
    }else{
      /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
      ** determined by the one-byte unsigned integer found at an offset of 20
      ** into the database file header. */
      nReserve = zDbHeader[20];
      pBt->btsFlags |= BTS_PAGESIZE_FIXED;
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* Auto-vacuum settings come from the 4-byte header fields at byte
      ** offsets 36+4*4 and 36+7*4; any non-zero value enables the mode. */
      pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
      pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
#endif
    }
    rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
    if( rc ) goto btree_open_out;
    pBt->usableSize = pBt->pageSize - nReserve;
    assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */

#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
    /* Add the new BtShared object to the linked list sharable BtShareds.
    */
    if( p->sharable ){
      MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
      pBt->nRef = 1;
      MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
      if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
        pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
        if( pBt->mutex==0 ){
          rc = SQLITE_NOMEM;
          /* NOTE(review): the connection's mallocFailed flag is cleared
          ** here even though SQLITE_NOMEM is being returned - confirm
          ** the intent against the callers. */
          db->mallocFailed = 0;
          goto btree_open_out;
        }
      }
      /* Push the new BtShared onto the head of the global list */
      sqlite3_mutex_enter(mutexShared);
      pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
      GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
      sqlite3_mutex_leave(mutexShared);
    }
#endif
  }

#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  /* If the new Btree uses a sharable pBtShared, then link the new
  ** Btree into the list of all sharable Btrees for the same connection.
  ** The list is kept in ascending order by pBt address.
  */
  if( p->sharable ){
    int i;
    Btree *pSib;
    /* Only the first sharable Btree found on the connection is needed:
    ** from it the list head is reached via pPrev and the insertion
    ** point is then located by walking pNext. */
    for(i=0; i<db->nDb; i++){
      if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
        while( pSib->pPrev ){ pSib = pSib->pPrev; }
        if( p->pBt<pSib->pBt ){
          p->pNext = pSib;
          p->pPrev = 0;
          pSib->pPrev = p;
        }else{
          while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
            pSib = pSib->pNext;
          }
          p->pNext = pSib->pNext;
          p->pPrev = pSib;
          if( p->pNext ){
            p->pNext->pPrev = p;
          }
          pSib->pNext = p;
        }
        break;
      }
    }
  }
#endif
  *ppBtree = p;

  /* The success path falls through into this label with rc==SQLITE_OK */
btree_open_out:
  if( rc!=SQLITE_OK ){
    if( pBt && pBt->pPager ){
      sqlite3PagerClose(pBt->pPager);
    }
    sqlite3_free(pBt);
    sqlite3_free(p);
    *ppBtree = 0;
  }else{
    /* If the B-Tree was successfully opened, set the pager-cache size to the
    ** default value. Except, when opening on an existing shared pager-cache,
    ** do not change the pager-cache size.
    */
    if( sqlite3BtreeSchema(p, 0, 0)==0 ){
      sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
    }
  }
  /* mutexOpen is non-NULL only if it was entered in the shared-cache
  ** lookup above */
  if( mutexOpen ){
    assert( sqlite3_mutex_held(mutexOpen) );
    sqlite3_mutex_leave(mutexOpen);
  }
  return rc;
}
2372 
2373 /*
2374 ** Decrement the BtShared.nRef counter.  When it reaches zero,
2375 ** remove the BtShared structure from the sharing list.  Return
2376 ** true if the BtShared.nRef counter reaches zero and return
2377 ** false if it is still positive.
2378 */
2379 static int removeFromSharingList(BtShared *pBt){
2380 #ifndef SQLITE_OMIT_SHARED_CACHE
2381   MUTEX_LOGIC( sqlite3_mutex *pMaster; )
2382   BtShared *pList;
2383   int removed = 0;
2384 
2385   assert( sqlite3_mutex_notheld(pBt->mutex) );
2386   MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
2387   sqlite3_mutex_enter(pMaster);
2388   pBt->nRef--;
2389   if( pBt->nRef<=0 ){
2390     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
2391       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
2392     }else{
2393       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
2394       while( ALWAYS(pList) && pList->pNext!=pBt ){
2395         pList=pList->pNext;
2396       }
2397       if( ALWAYS(pList) ){
2398         pList->pNext = pBt->pNext;
2399       }
2400     }
2401     if( SQLITE_THREADSAFE ){
2402       sqlite3_mutex_free(pBt->mutex);
2403     }
2404     removed = 1;
2405   }
2406   sqlite3_mutex_leave(pMaster);
2407   return removed;
2408 #else
2409   return 1;
2410 #endif
2411 }
2412 
2413 /*
2414 ** Make sure pBt->pTmpSpace points to an allocation of
2415 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
2416 ** pointer.
2417 */
2418 static void allocateTempSpace(BtShared *pBt){
2419   if( !pBt->pTmpSpace ){
2420     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2421 
2422     /* One of the uses of pBt->pTmpSpace is to format cells before
2423     ** inserting them into a leaf page (function fillInCell()). If
2424     ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
2425     ** by the various routines that manipulate binary cells. Which
2426     ** can mean that fillInCell() only initializes the first 2 or 3
2427     ** bytes of pTmpSpace, but that the first 4 bytes are copied from
2428     ** it into a database page. This is not actually a problem, but it
2429     ** does cause a valgrind error when the 1 or 2 bytes of unitialized
2430     ** data is passed to system call write(). So to avoid this error,
2431     ** zero the first 4 bytes of temp space here.
2432     **
2433     ** Also:  Provide four bytes of initialized space before the
2434     ** beginning of pTmpSpace as an area available to prepend the
2435     ** left-child pointer to the beginning of a cell.
2436     */
2437     if( pBt->pTmpSpace ){
2438       memset(pBt->pTmpSpace, 0, 8);
2439       pBt->pTmpSpace += 4;
2440     }
2441   }
2442 }
2443 
2444 /*
2445 ** Free the pBt->pTmpSpace allocation
2446 */
2447 static void freeTempSpace(BtShared *pBt){
2448   if( pBt->pTmpSpace ){
2449     pBt->pTmpSpace -= 4;
2450     sqlite3PageFree(pBt->pTmpSpace);
2451     pBt->pTmpSpace = 0;
2452   }
2453 }
2454 
2455 /*
2456 ** Close an open database and invalidate all cursors.
2457 */
2458 int sqlite3BtreeClose(Btree *p){
2459   BtShared *pBt = p->pBt;
2460   BtCursor *pCur;
2461 
2462   /* Close all cursors opened via this handle.  */
2463   assert( sqlite3_mutex_held(p->db->mutex) );
2464   sqlite3BtreeEnter(p);
2465   pCur = pBt->pCursor;
2466   while( pCur ){
2467     BtCursor *pTmp = pCur;
2468     pCur = pCur->pNext;
2469     if( pTmp->pBtree==p ){
2470       sqlite3BtreeCloseCursor(pTmp);
2471     }
2472   }
2473 
2474   /* Rollback any active transaction and free the handle structure.
2475   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2476   ** this handle.
2477   */
2478   sqlite3BtreeRollback(p, SQLITE_OK, 0);
2479   sqlite3BtreeLeave(p);
2480 
2481   /* If there are still other outstanding references to the shared-btree
2482   ** structure, return now. The remainder of this procedure cleans
2483   ** up the shared-btree.
2484   */
2485   assert( p->wantToLock==0 && p->locked==0 );
2486   if( !p->sharable || removeFromSharingList(pBt) ){
2487     /* The pBt is no longer on the sharing list, so we can access
2488     ** it without having to hold the mutex.
2489     **
2490     ** Clean out and delete the BtShared object.
2491     */
2492     assert( !pBt->pCursor );
2493     sqlite3PagerClose(pBt->pPager);
2494     if( pBt->xFreeSchema && pBt->pSchema ){
2495       pBt->xFreeSchema(pBt->pSchema);
2496     }
2497     sqlite3DbFree(0, pBt->pSchema);
2498     freeTempSpace(pBt);
2499     sqlite3_free(pBt);
2500   }
2501 
2502 #ifndef SQLITE_OMIT_SHARED_CACHE
2503   assert( p->wantToLock==0 );
2504   assert( p->locked==0 );
2505   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2506   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2507 #endif
2508 
2509   sqlite3_free(p);
2510   return SQLITE_OK;
2511 }
2512 
2513 /*
2514 ** Change the limit on the number of pages allowed in the cache.
2515 **
2516 ** The maximum number of cache pages is set to the absolute
2517 ** value of mxPage.  If mxPage is negative, the pager will
2518 ** operate asynchronously - it will not stop to do fsync()s
2519 ** to insure data is written to the disk surface before
2520 ** continuing.  Transactions still work if synchronous is off,
2521 ** and the database cannot be corrupted if this program
2522 ** crashes.  But if the operating system crashes or there is
2523 ** an abrupt power failure when synchronous is off, the database
2524 ** could be left in an inconsistent and unrecoverable state.
2525 ** Synchronous is on by default so database corruption is not
2526 ** normally a worry.
2527 */
2528 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2529   BtShared *pBt = p->pBt;
2530   assert( sqlite3_mutex_held(p->db->mutex) );
2531   sqlite3BtreeEnter(p);
2532   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2533   sqlite3BtreeLeave(p);
2534   return SQLITE_OK;
2535 }
2536 
#if SQLITE_MAX_MMAP_SIZE>0
/*
** Change the limit on the amount of the database file that may be
** memory mapped.  The szMmap value is forwarded to
** sqlite3PagerSetMmapLimit() while the btree mutex is held.  The caller
** must hold the mutex of the database connection that owns p.
**
** Always returns SQLITE_OK.
*/
int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
  assert( sqlite3_mutex_held(p->db->mutex) );
  sqlite3BtreeEnter(p);
  sqlite3PagerSetMmapLimit(p->pBt->pPager, szMmap);
  sqlite3BtreeLeave(p);
  return SQLITE_OK;
}
#endif /* SQLITE_MAX_MMAP_SIZE>0 */
2551 
2552 /*
2553 ** Change the way data is synced to disk in order to increase or decrease
2554 ** how well the database resists damage due to OS crashes and power
2555 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2556 ** there is a high probability of damage)  Level 2 is the default.  There
2557 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2558 ** probability of damage to near zero but with a write performance reduction.
2559 */
2560 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2561 int sqlite3BtreeSetPagerFlags(
2562   Btree *p,              /* The btree to set the safety level on */
2563   unsigned pgFlags       /* Various PAGER_* flags */
2564 ){
2565   BtShared *pBt = p->pBt;
2566   assert( sqlite3_mutex_held(p->db->mutex) );
2567   sqlite3BtreeEnter(p);
2568   sqlite3PagerSetFlags(pBt->pPager, pgFlags);
2569   sqlite3BtreeLeave(p);
2570   return SQLITE_OK;
2571 }
2572 #endif
2573 
2574 /*
2575 ** Return TRUE if the given btree is set to safety level 1.  In other
2576 ** words, return TRUE if no sync() occurs on the disk files.
2577 */
2578 int sqlite3BtreeSyncDisabled(Btree *p){
2579   BtShared *pBt = p->pBt;
2580   int rc;
2581   assert( sqlite3_mutex_held(p->db->mutex) );
2582   sqlite3BtreeEnter(p);
2583   assert( pBt && pBt->pPager );
2584   rc = sqlite3PagerNosync(pBt->pPager);
2585   sqlite3BtreeLeave(p);
2586   return rc;
2587 }
2588 
2589 /*
2590 ** Change the default pages size and the number of reserved bytes per page.
2591 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2592 ** without changing anything.
2593 **
2594 ** The page size must be a power of 2 between 512 and 65536.  If the page
2595 ** size supplied does not meet this constraint then the page size is not
2596 ** changed.
2597 **
2598 ** Page sizes are constrained to be a power of two so that the region
2599 ** of the database file used for locking (beginning at PENDING_BYTE,
2600 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2601 ** at the beginning of a page.
2602 **
2603 ** If parameter nReserve is less than zero, then the number of reserved
2604 ** bytes per page is left unchanged.
2605 **
2606 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2607 ** and autovacuum mode can no longer be changed.
2608 */
2609 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2610   int rc = SQLITE_OK;
2611   BtShared *pBt = p->pBt;
2612   assert( nReserve>=-1 && nReserve<=255 );
2613   sqlite3BtreeEnter(p);
2614 #if SQLITE_HAS_CODEC
2615   if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;
2616 #endif
2617   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2618     sqlite3BtreeLeave(p);
2619     return SQLITE_READONLY;
2620   }
2621   if( nReserve<0 ){
2622     nReserve = pBt->pageSize - pBt->usableSize;
2623   }
2624   assert( nReserve>=0 && nReserve<=255 );
2625   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2626         ((pageSize-1)&pageSize)==0 ){
2627     assert( (pageSize & 7)==0 );
2628     assert( !pBt->pCursor );
2629     pBt->pageSize = (u32)pageSize;
2630     freeTempSpace(pBt);
2631   }
2632   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2633   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2634   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2635   sqlite3BtreeLeave(p);
2636   return rc;
2637 }
2638 
2639 /*
2640 ** Return the currently defined page size
2641 */
2642 int sqlite3BtreeGetPageSize(Btree *p){
2643   return p->pBt->pageSize;
2644 }
2645 
2646 /*
2647 ** This function is similar to sqlite3BtreeGetReserve(), except that it
2648 ** may only be called if it is guaranteed that the b-tree mutex is already
2649 ** held.
2650 **
2651 ** This is useful in one special case in the backup API code where it is
2652 ** known that the shared b-tree mutex is held, but the mutex on the
2653 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
2654 ** were to be called, it might collide with some other operation on the
2655 ** database handle that owns *p, causing undefined behavior.
2656 */
2657 int sqlite3BtreeGetReserveNoMutex(Btree *p){
2658   int n;
2659   assert( sqlite3_mutex_held(p->pBt->mutex) );
2660   n = p->pBt->pageSize - p->pBt->usableSize;
2661   return n;
2662 }
2663 
2664 /*
2665 ** Return the number of bytes of space at the end of every page that
2666 ** are intentually left unused.  This is the "reserved" space that is
2667 ** sometimes used by extensions.
2668 **
2669 ** If SQLITE_HAS_MUTEX is defined then the number returned is the
2670 ** greater of the current reserved space and the maximum requested
2671 ** reserve space.
2672 */
2673 int sqlite3BtreeGetOptimalReserve(Btree *p){
2674   int n;
2675   sqlite3BtreeEnter(p);
2676   n = sqlite3BtreeGetReserveNoMutex(p);
2677 #ifdef SQLITE_HAS_CODEC
2678   if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;
2679 #endif
2680   sqlite3BtreeLeave(p);
2681   return n;
2682 }
2683 
2684 
2685 /*
2686 ** Set the maximum page count for a database if mxPage is positive.
2687 ** No changes are made if mxPage is 0 or negative.
2688 ** Regardless of the value of mxPage, return the maximum page count.
2689 */
2690 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2691   int n;
2692   sqlite3BtreeEnter(p);
2693   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2694   sqlite3BtreeLeave(p);
2695   return n;
2696 }
2697 
2698 /*
2699 ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1.  If newFlag is -1,
2700 ** then make no changes.  Always return the value of the BTS_SECURE_DELETE
2701 ** setting after the change.
2702 */
2703 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2704   int b;
2705   if( p==0 ) return 0;
2706   sqlite3BtreeEnter(p);
2707   if( newFlag>=0 ){
2708     p->pBt->btsFlags &= ~BTS_SECURE_DELETE;
2709     if( newFlag ) p->pBt->btsFlags |= BTS_SECURE_DELETE;
2710   }
2711   b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;
2712   sqlite3BtreeLeave(p);
2713   return b;
2714 }
2715 
2716 /*
2717 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2718 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2719 ** is disabled. The default value for the auto-vacuum property is
2720 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2721 */
2722 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2723 #ifdef SQLITE_OMIT_AUTOVACUUM
2724   return SQLITE_READONLY;
2725 #else
2726   BtShared *pBt = p->pBt;
2727   int rc = SQLITE_OK;
2728   u8 av = (u8)autoVacuum;
2729 
2730   sqlite3BtreeEnter(p);
2731   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
2732     rc = SQLITE_READONLY;
2733   }else{
2734     pBt->autoVacuum = av ?1:0;
2735     pBt->incrVacuum = av==2 ?1:0;
2736   }
2737   sqlite3BtreeLeave(p);
2738   return rc;
2739 #endif
2740 }
2741 
2742 /*
2743 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2744 ** enabled 1 is returned. Otherwise 0.
2745 */
2746 int sqlite3BtreeGetAutoVacuum(Btree *p){
2747 #ifdef SQLITE_OMIT_AUTOVACUUM
2748   return BTREE_AUTOVACUUM_NONE;
2749 #else
2750   int rc;
2751   sqlite3BtreeEnter(p);
2752   rc = (
2753     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2754     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2755     BTREE_AUTOVACUUM_INCR
2756   );
2757   sqlite3BtreeLeave(p);
2758   return rc;
2759 #endif
2760 }
2761 
2762 
2763 /*
2764 ** Get a reference to pPage1 of the database file.  This will
2765 ** also acquire a readlock on that file.
2766 **
2767 ** SQLITE_OK is returned on success.  If the file is not a
2768 ** well-formed database file, then SQLITE_CORRUPT is returned.
2769 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
2770 ** is returned if we run out of memory.
2771 */
2772 static int lockBtree(BtShared *pBt){
2773   int rc;              /* Result code from subfunctions */
2774   MemPage *pPage1;     /* Page 1 of the database file */
2775   int nPage;           /* Number of pages in the database */
2776   int nPageFile = 0;   /* Number of pages in the database file */
2777   int nPageHeader;     /* Number of pages in the database according to hdr */
2778 
2779   assert( sqlite3_mutex_held(pBt->mutex) );
2780   assert( pBt->pPage1==0 );
2781   rc = sqlite3PagerSharedLock(pBt->pPager);
2782   if( rc!=SQLITE_OK ) return rc;
2783   rc = btreeGetPage(pBt, 1, &pPage1, 0);
2784   if( rc!=SQLITE_OK ) return rc;
2785 
2786   /* Do some checking to help insure the file we opened really is
2787   ** a valid database file.
2788   */
2789   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
2790   sqlite3PagerPagecount(pBt->pPager, &nPageFile);
2791   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
2792     nPage = nPageFile;
2793   }
2794   if( nPage>0 ){
2795     u32 pageSize;
2796     u32 usableSize;
2797     u8 *page1 = pPage1->aData;
2798     rc = SQLITE_NOTADB;
2799     /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
2800     ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
2801     ** 61 74 20 33 00. */
2802     if( memcmp(page1, zMagicHeader, 16)!=0 ){
2803       goto page1_init_failed;
2804     }
2805 
2806 #ifdef SQLITE_OMIT_WAL
2807     if( page1[18]>1 ){
2808       pBt->btsFlags |= BTS_READ_ONLY;
2809     }
2810     if( page1[19]>1 ){
2811       goto page1_init_failed;
2812     }
2813 #else
2814     if( page1[18]>2 ){
2815       pBt->btsFlags |= BTS_READ_ONLY;
2816     }
2817     if( page1[19]>2 ){
2818       goto page1_init_failed;
2819     }
2820 
2821     /* If the write version is set to 2, this database should be accessed
2822     ** in WAL mode. If the log is not already open, open it now. Then
2823     ** return SQLITE_OK and return without populating BtShared.pPage1.
2824     ** The caller detects this and calls this function again. This is
2825     ** required as the version of page 1 currently in the page1 buffer
2826     ** may not be the latest version - there may be a newer one in the log
2827     ** file.
2828     */
2829     if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
2830       int isOpen = 0;
2831       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
2832       if( rc!=SQLITE_OK ){
2833         goto page1_init_failed;
2834       }else if( isOpen==0 ){
2835         releasePage(pPage1);
2836         return SQLITE_OK;
2837       }
2838       rc = SQLITE_NOTADB;
2839     }
2840 #endif
2841 
2842     /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
2843     ** fractions and the leaf payload fraction values must be 64, 32, and 32.
2844     **
2845     ** The original design allowed these amounts to vary, but as of
2846     ** version 3.6.0, we require them to be fixed.
2847     */
2848     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2849       goto page1_init_failed;
2850     }
2851     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2852     ** determined by the 2-byte integer located at an offset of 16 bytes from
2853     ** the beginning of the database file. */
2854     pageSize = (page1[16]<<8) | (page1[17]<<16);
2855     /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
2856     ** between 512 and 65536 inclusive. */
2857     if( ((pageSize-1)&pageSize)!=0
2858      || pageSize>SQLITE_MAX_PAGE_SIZE
2859      || pageSize<=256
2860     ){
2861       goto page1_init_failed;
2862     }
2863     assert( (pageSize & 7)==0 );
2864     /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
2865     ** integer at offset 20 is the number of bytes of space at the end of
2866     ** each page to reserve for extensions.
2867     **
2868     ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2869     ** determined by the one-byte unsigned integer found at an offset of 20
2870     ** into the database file header. */
2871     usableSize = pageSize - page1[20];
2872     if( (u32)pageSize!=pBt->pageSize ){
2873       /* After reading the first page of the database assuming a page size
2874       ** of BtShared.pageSize, we have discovered that the page-size is
2875       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2876       ** zero and return SQLITE_OK. The caller will call this function
2877       ** again with the correct page-size.
2878       */
2879       releasePage(pPage1);
2880       pBt->usableSize = usableSize;
2881       pBt->pageSize = pageSize;
2882       freeTempSpace(pBt);
2883       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2884                                    pageSize-usableSize);
2885       return rc;
2886     }
2887     if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
2888       rc = SQLITE_CORRUPT_BKPT;
2889       goto page1_init_failed;
2890     }
2891     /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
2892     ** be less than 480. In other words, if the page size is 512, then the
2893     ** reserved space size cannot exceed 32. */
2894     if( usableSize<480 ){
2895       goto page1_init_failed;
2896     }
2897     pBt->pageSize = pageSize;
2898     pBt->usableSize = usableSize;
2899 #ifndef SQLITE_OMIT_AUTOVACUUM
2900     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2901     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2902 #endif
2903   }
2904 
2905   /* maxLocal is the maximum amount of payload to store locally for
2906   ** a cell.  Make sure it is small enough so that at least minFanout
2907   ** cells can will fit on one page.  We assume a 10-byte page header.
2908   ** Besides the payload, the cell must store:
2909   **     2-byte pointer to the cell
2910   **     4-byte child pointer
2911   **     9-byte nKey value
2912   **     4-byte nData value
2913   **     4-byte overflow page pointer
2914   ** So a cell consists of a 2-byte pointer, a header which is as much as
2915   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2916   ** page pointer.
2917   */
2918   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
2919   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
2920   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
2921   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
2922   if( pBt->maxLocal>127 ){
2923     pBt->max1bytePayload = 127;
2924   }else{
2925     pBt->max1bytePayload = (u8)pBt->maxLocal;
2926   }
2927   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2928   pBt->pPage1 = pPage1;
2929   pBt->nPage = nPage;
2930   return SQLITE_OK;
2931 
2932 page1_init_failed:
2933   releasePage(pPage1);
2934   pBt->pPage1 = 0;
2935   return rc;
2936 }
2937 
2938 #ifndef NDEBUG
2939 /*
2940 ** Return the number of cursors open on pBt. This is for use
2941 ** in assert() expressions, so it is only compiled if NDEBUG is not
2942 ** defined.
2943 **
2944 ** Only write cursors are counted if wrOnly is true.  If wrOnly is
2945 ** false then all cursors are counted.
2946 **
2947 ** For the purposes of this routine, a cursor is any cursor that
2948 ** is capable of reading or writing to the database.  Cursors that
2949 ** have been tripped into the CURSOR_FAULT state are not counted.
2950 */
2951 static int countValidCursors(BtShared *pBt, int wrOnly){
2952   BtCursor *pCur;
2953   int r = 0;
2954   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2955     if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
2956      && pCur->eState!=CURSOR_FAULT ) r++;
2957   }
2958   return r;
2959 }
2960 #endif
2961 
2962 /*
2963 ** If there are no outstanding cursors and we are not in the middle
2964 ** of a transaction but there is a read lock on the database, then
2965 ** this routine unrefs the first page of the database file which
2966 ** has the effect of releasing the read lock.
2967 **
2968 ** If there is a transaction in progress, this routine is a no-op.
2969 */
2970 static void unlockBtreeIfUnused(BtShared *pBt){
2971   assert( sqlite3_mutex_held(pBt->mutex) );
2972   assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
2973   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
2974     MemPage *pPage1 = pBt->pPage1;
2975     assert( pPage1->aData );
2976     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
2977     pBt->pPage1 = 0;
2978     releasePageNotNull(pPage1);
2979   }
2980 }
2981 
2982 /*
2983 ** If pBt points to an empty file then convert that empty file
2984 ** into a new empty database by initializing the first page of
2985 ** the database.
2986 */
2987 static int newDatabase(BtShared *pBt){
2988   MemPage *pP1;
2989   unsigned char *data;
2990   int rc;
2991 
2992   assert( sqlite3_mutex_held(pBt->mutex) );
2993   if( pBt->nPage>0 ){
2994     return SQLITE_OK;
2995   }
2996   pP1 = pBt->pPage1;
2997   assert( pP1!=0 );
2998   data = pP1->aData;
2999   rc = sqlite3PagerWrite(pP1->pDbPage);
3000   if( rc ) return rc;
3001   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
3002   assert( sizeof(zMagicHeader)==16 );
3003   data[16] = (u8)((pBt->pageSize>>8)&0xff);
3004   data[17] = (u8)((pBt->pageSize>>16)&0xff);
3005   data[18] = 1;
3006   data[19] = 1;
3007   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
3008   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
3009   data[21] = 64;
3010   data[22] = 32;
3011   data[23] = 32;
3012   memset(&data[24], 0, 100-24);
3013   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
3014   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3015 #ifndef SQLITE_OMIT_AUTOVACUUM
3016   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
3017   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
3018   put4byte(&data[36 + 4*4], pBt->autoVacuum);
3019   put4byte(&data[36 + 7*4], pBt->incrVacuum);
3020 #endif
3021   pBt->nPage = 1;
3022   data[31] = 1;
3023   return SQLITE_OK;
3024 }
3025 
3026 /*
3027 ** Initialize the first page of the database file (creating a database
3028 ** consisting of a single page and no schema objects). Return SQLITE_OK
3029 ** if successful, or an SQLite error code otherwise.
3030 */
3031 int sqlite3BtreeNewDb(Btree *p){
3032   int rc;
3033   sqlite3BtreeEnter(p);
3034   p->pBt->nPage = 0;
3035   rc = newDatabase(p->pBt);
3036   sqlite3BtreeLeave(p);
3037   return rc;
3038 }
3039 
3040 /*
3041 ** Attempt to start a new transaction. A write-transaction
3042 ** is started if the second argument is nonzero, otherwise a read-
3043 ** transaction.  If the second argument is 2 or more and exclusive
3044 ** transaction is started, meaning that no other process is allowed
3045 ** to access the database.  A preexisting transaction may not be
3046 ** upgraded to exclusive by calling this routine a second time - the
3047 ** exclusivity flag only works for a new transaction.
3048 **
3049 ** A write-transaction must be started before attempting any
3050 ** changes to the database.  None of the following routines
3051 ** will work unless a transaction is started first:
3052 **
3053 **      sqlite3BtreeCreateTable()
3054 **      sqlite3BtreeCreateIndex()
3055 **      sqlite3BtreeClearTable()
3056 **      sqlite3BtreeDropTable()
3057 **      sqlite3BtreeInsert()
3058 **      sqlite3BtreeDelete()
3059 **      sqlite3BtreeUpdateMeta()
3060 **
3061 ** If an initial attempt to acquire the lock fails because of lock contention
3062 ** and the database was previously unlocked, then invoke the busy handler
3063 ** if there is one.  But if there was previously a read-lock, do not
3064 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
3065 ** returned when there is already a read-lock in order to avoid a deadlock.
3066 **
3067 ** Suppose there are two processes A and B.  A has a read lock and B has
3068 ** a reserved lock.  B tries to promote to exclusive but is blocked because
3069 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
3070 ** One or the other of the two processes must give way or there can be
3071 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
3072 ** when A already has a read lock, we encourage A to give up and let B
3073 ** proceed.
3074 */
3075 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
3076   sqlite3 *pBlock = 0;
3077   BtShared *pBt = p->pBt;
3078   int rc = SQLITE_OK;
3079 
3080   sqlite3BtreeEnter(p);
3081   btreeIntegrity(p);
3082 
3083   /* If the btree is already in a write-transaction, or it
3084   ** is already in a read-transaction and a read-transaction
3085   ** is requested, this is a no-op.
3086   */
3087   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
3088     goto trans_begun;
3089   }
3090   assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );
3091 
3092   /* Write transactions are not possible on a read-only database */
3093   if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
3094     rc = SQLITE_READONLY;
3095     goto trans_begun;
3096   }
3097 
3098 #ifndef SQLITE_OMIT_SHARED_CACHE
3099   /* If another database handle has already opened a write transaction
3100   ** on this shared-btree structure and a second write transaction is
3101   ** requested, return SQLITE_LOCKED.
3102   */
3103   if( (wrflag && pBt->inTransaction==TRANS_WRITE)
3104    || (pBt->btsFlags & BTS_PENDING)!=0
3105   ){
3106     pBlock = pBt->pWriter->db;
3107   }else if( wrflag>1 ){
3108     BtLock *pIter;
3109     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
3110       if( pIter->pBtree!=p ){
3111         pBlock = pIter->pBtree->db;
3112         break;
3113       }
3114     }
3115   }
3116   if( pBlock ){
3117     sqlite3ConnectionBlocked(p->db, pBlock);
3118     rc = SQLITE_LOCKED_SHAREDCACHE;
3119     goto trans_begun;
3120   }
3121 #endif
3122 
3123   /* Any read-only or read-write transaction implies a read-lock on
3124   ** page 1. So if some other shared-cache client already has a write-lock
3125   ** on page 1, the transaction cannot be opened. */
3126   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
3127   if( SQLITE_OK!=rc ) goto trans_begun;
3128 
3129   pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
3130   if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
3131   do {
3132     /* Call lockBtree() until either pBt->pPage1 is populated or
3133     ** lockBtree() returns something other than SQLITE_OK. lockBtree()
3134     ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
3135     ** reading page 1 it discovers that the page-size of the database
3136     ** file is not pBt->pageSize. In this case lockBtree() will update
3137     ** pBt->pageSize to the page-size of the file on disk.
3138     */
3139     while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
3140 
3141     if( rc==SQLITE_OK && wrflag ){
3142       if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
3143         rc = SQLITE_READONLY;
3144       }else{
3145         rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
3146         if( rc==SQLITE_OK ){
3147           rc = newDatabase(pBt);
3148         }
3149       }
3150     }
3151 
3152     if( rc!=SQLITE_OK ){
3153       unlockBtreeIfUnused(pBt);
3154     }
3155   }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
3156           btreeInvokeBusyHandler(pBt) );
3157 
3158   if( rc==SQLITE_OK ){
3159     if( p->inTrans==TRANS_NONE ){
3160       pBt->nTransaction++;
3161 #ifndef SQLITE_OMIT_SHARED_CACHE
3162       if( p->sharable ){
3163         assert( p->lock.pBtree==p && p->lock.iTable==1 );
3164         p->lock.eLock = READ_LOCK;
3165         p->lock.pNext = pBt->pLock;
3166         pBt->pLock = &p->lock;
3167       }
3168 #endif
3169     }
3170     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
3171     if( p->inTrans>pBt->inTransaction ){
3172       pBt->inTransaction = p->inTrans;
3173     }
3174     if( wrflag ){
3175       MemPage *pPage1 = pBt->pPage1;
3176 #ifndef SQLITE_OMIT_SHARED_CACHE
3177       assert( !pBt->pWriter );
3178       pBt->pWriter = p;
3179       pBt->btsFlags &= ~BTS_EXCLUSIVE;
3180       if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
3181 #endif
3182 
3183       /* If the db-size header field is incorrect (as it may be if an old
3184       ** client has been writing the database file), update it now. Doing
3185       ** this sooner rather than later means the database size can safely
3186       ** re-read the database size from page 1 if a savepoint or transaction
3187       ** rollback occurs within the transaction.
3188       */
3189       if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
3190         rc = sqlite3PagerWrite(pPage1->pDbPage);
3191         if( rc==SQLITE_OK ){
3192           put4byte(&pPage1->aData[28], pBt->nPage);
3193         }
3194       }
3195     }
3196   }
3197 
3198 
3199 trans_begun:
3200   if( rc==SQLITE_OK && wrflag ){
3201     /* This call makes sure that the pager has the correct number of
3202     ** open savepoints. If the second parameter is greater than 0 and
3203     ** the sub-journal is not already open, then it will be opened here.
3204     */
3205     rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
3206   }
3207 
3208   btreeIntegrity(p);
3209   sqlite3BtreeLeave(p);
3210   return rc;
3211 }
3212 
3213 #ifndef SQLITE_OMIT_AUTOVACUUM
3214 
3215 /*
3216 ** Set the pointer-map entries for all children of page pPage. Also, if
3217 ** pPage contains cells that point to overflow pages, set the pointer
3218 ** map entries for the overflow pages as well.
3219 */
3220 static int setChildPtrmaps(MemPage *pPage){
3221   int i;                             /* Counter variable */
3222   int nCell;                         /* Number of cells in page pPage */
3223   int rc;                            /* Return code */
3224   BtShared *pBt = pPage->pBt;
3225   u8 isInitOrig = pPage->isInit;
3226   Pgno pgno = pPage->pgno;
3227 
3228   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3229   rc = btreeInitPage(pPage);
3230   if( rc!=SQLITE_OK ){
3231     goto set_child_ptrmaps_out;
3232   }
3233   nCell = pPage->nCell;
3234 
3235   for(i=0; i<nCell; i++){
3236     u8 *pCell = findCell(pPage, i);
3237 
3238     ptrmapPutOvflPtr(pPage, pCell, &rc);
3239 
3240     if( !pPage->leaf ){
3241       Pgno childPgno = get4byte(pCell);
3242       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3243     }
3244   }
3245 
3246   if( !pPage->leaf ){
3247     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3248     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3249   }
3250 
3251 set_child_ptrmaps_out:
3252   pPage->isInit = isInitOrig;
3253   return rc;
3254 }
3255 
3256 /*
3257 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
3258 ** that it points to iTo. Parameter eType describes the type of pointer to
3259 ** be modified, as  follows:
3260 **
3261 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
3262 **                   page of pPage.
3263 **
3264 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
3265 **                   page pointed to by one of the cells on pPage.
3266 **
3267 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
3268 **                   overflow page in the list.
3269 */
3270 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
3271   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3272   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
3273   if( eType==PTRMAP_OVERFLOW2 ){
3274     /* The pointer is always the first 4 bytes of the page in this case.  */
3275     if( get4byte(pPage->aData)!=iFrom ){
3276       return SQLITE_CORRUPT_BKPT;
3277     }
3278     put4byte(pPage->aData, iTo);
3279   }else{
3280     u8 isInitOrig = pPage->isInit;
3281     int i;
3282     int nCell;
3283     int rc;
3284 
3285     rc = btreeInitPage(pPage);
3286     if( rc ) return rc;
3287     nCell = pPage->nCell;
3288 
3289     for(i=0; i<nCell; i++){
3290       u8 *pCell = findCell(pPage, i);
3291       if( eType==PTRMAP_OVERFLOW1 ){
3292         CellInfo info;
3293         pPage->xParseCell(pPage, pCell, &info);
3294         if( info.iOverflow
3295          && pCell+info.iOverflow+3<=pPage->aData+pPage->maskPage
3296          && iFrom==get4byte(&pCell[info.iOverflow])
3297         ){
3298           put4byte(&pCell[info.iOverflow], iTo);
3299           break;
3300         }
3301       }else{
3302         if( get4byte(pCell)==iFrom ){
3303           put4byte(pCell, iTo);
3304           break;
3305         }
3306       }
3307     }
3308 
3309     if( i==nCell ){
3310       if( eType!=PTRMAP_BTREE ||
3311           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
3312         return SQLITE_CORRUPT_BKPT;
3313       }
3314       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
3315     }
3316 
3317     pPage->isInit = isInitOrig;
3318   }
3319   return SQLITE_OK;
3320 }
3321 
3322 
3323 /*
3324 ** Move the open database page pDbPage to location iFreePage in the
3325 ** database. The pDbPage reference remains valid.
3326 **
3327 ** The isCommit flag indicates that there is no need to remember that
3328 ** the journal needs to be sync()ed before database page pDbPage->pgno
3329 ** can be written to. The caller has already promised not to write to that
3330 ** page.
3331 */
3332 static int relocatePage(
3333   BtShared *pBt,           /* Btree */
3334   MemPage *pDbPage,        /* Open page to move */
3335   u8 eType,                /* Pointer map 'type' entry for pDbPage */
3336   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
3337   Pgno iFreePage,          /* The location to move pDbPage to */
3338   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
3339 ){
3340   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
3341   Pgno iDbPage = pDbPage->pgno;
3342   Pager *pPager = pBt->pPager;
3343   int rc;
3344 
3345   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
3346       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
3347   assert( sqlite3_mutex_held(pBt->mutex) );
3348   assert( pDbPage->pBt==pBt );
3349 
3350   /* Move page iDbPage from its current location to page number iFreePage */
3351   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
3352       iDbPage, iFreePage, iPtrPage, eType));
3353   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
3354   if( rc!=SQLITE_OK ){
3355     return rc;
3356   }
3357   pDbPage->pgno = iFreePage;
3358 
3359   /* If pDbPage was a btree-page, then it may have child pages and/or cells
3360   ** that point to overflow pages. The pointer map entries for all these
3361   ** pages need to be changed.
3362   **
3363   ** If pDbPage is an overflow page, then the first 4 bytes may store a
3364   ** pointer to a subsequent overflow page. If this is the case, then
3365   ** the pointer map needs to be updated for the subsequent overflow page.
3366   */
3367   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
3368     rc = setChildPtrmaps(pDbPage);
3369     if( rc!=SQLITE_OK ){
3370       return rc;
3371     }
3372   }else{
3373     Pgno nextOvfl = get4byte(pDbPage->aData);
3374     if( nextOvfl!=0 ){
3375       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
3376       if( rc!=SQLITE_OK ){
3377         return rc;
3378       }
3379     }
3380   }
3381 
3382   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
3383   ** that it points at iFreePage. Also fix the pointer map entry for
3384   ** iPtrPage.
3385   */
3386   if( eType!=PTRMAP_ROOTPAGE ){
3387     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
3388     if( rc!=SQLITE_OK ){
3389       return rc;
3390     }
3391     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
3392     if( rc!=SQLITE_OK ){
3393       releasePage(pPtrPage);
3394       return rc;
3395     }
3396     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
3397     releasePage(pPtrPage);
3398     if( rc==SQLITE_OK ){
3399       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
3400     }
3401   }
3402   return rc;
3403 }
3404 
3405 /* Forward declaration required by incrVacuumStep(). */
3406 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
3407 
3408 /*
3409 ** Perform a single step of an incremental-vacuum. If successful, return
3410 ** SQLITE_OK. If there is no work to do (and therefore no point in
3411 ** calling this function again), return SQLITE_DONE. Or, if an error
3412 ** occurs, return some other error code.
3413 **
3414 ** More specifically, this function attempts to re-organize the database so
3415 ** that the last page of the file currently in use is no longer in use.
3416 **
3417 ** Parameter nFin is the number of pages that this database would contain
3418 ** were this function called until it returns SQLITE_DONE.
3419 **
3420 ** If the bCommit parameter is non-zero, this function assumes that the
3421 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
3422 ** or an error. bCommit is passed true for an auto-vacuum-on-commit
3423 ** operation, or false for an incremental vacuum.
3424 */
3425 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
3426   Pgno nFreeList;           /* Number of pages still on the free-list */
3427   int rc;
3428 
3429   assert( sqlite3_mutex_held(pBt->mutex) );
3430   assert( iLastPg>nFin );
3431 
3432   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
3433     u8 eType;
3434     Pgno iPtrPage;
3435 
3436     nFreeList = get4byte(&pBt->pPage1->aData[36]);
3437     if( nFreeList==0 ){
3438       return SQLITE_DONE;
3439     }
3440 
3441     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
3442     if( rc!=SQLITE_OK ){
3443       return rc;
3444     }
3445     if( eType==PTRMAP_ROOTPAGE ){
3446       return SQLITE_CORRUPT_BKPT;
3447     }
3448 
3449     if( eType==PTRMAP_FREEPAGE ){
3450       if( bCommit==0 ){
3451         /* Remove the page from the files free-list. This is not required
3452         ** if bCommit is non-zero. In that case, the free-list will be
3453         ** truncated to zero after this function returns, so it doesn't
3454         ** matter if it still contains some garbage entries.
3455         */
3456         Pgno iFreePg;
3457         MemPage *pFreePg;
3458         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
3459         if( rc!=SQLITE_OK ){
3460           return rc;
3461         }
3462         assert( iFreePg==iLastPg );
3463         releasePage(pFreePg);
3464       }
3465     } else {
3466       Pgno iFreePg;             /* Index of free page to move pLastPg to */
3467       MemPage *pLastPg;
3468       u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
3469       Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
3470 
3471       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
3472       if( rc!=SQLITE_OK ){
3473         return rc;
3474       }
3475 
3476       /* If bCommit is zero, this loop runs exactly once and page pLastPg
3477       ** is swapped with the first free page pulled off the free list.
3478       **
3479       ** On the other hand, if bCommit is greater than zero, then keep
3480       ** looping until a free-page located within the first nFin pages
3481       ** of the file is found.
3482       */
3483       if( bCommit==0 ){
3484         eMode = BTALLOC_LE;
3485         iNear = nFin;
3486       }
3487       do {
3488         MemPage *pFreePg;
3489         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
3490         if( rc!=SQLITE_OK ){
3491           releasePage(pLastPg);
3492           return rc;
3493         }
3494         releasePage(pFreePg);
3495       }while( bCommit && iFreePg>nFin );
3496       assert( iFreePg<iLastPg );
3497 
3498       rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
3499       releasePage(pLastPg);
3500       if( rc!=SQLITE_OK ){
3501         return rc;
3502       }
3503     }
3504   }
3505 
3506   if( bCommit==0 ){
3507     do {
3508       iLastPg--;
3509     }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
3510     pBt->bDoTruncate = 1;
3511     pBt->nPage = iLastPg;
3512   }
3513   return SQLITE_OK;
3514 }
3515 
3516 /*
3517 ** The database opened by the first argument is an auto-vacuum database
3518 ** nOrig pages in size containing nFree free pages. Return the expected
3519 ** size of the database in pages following an auto-vacuum operation.
3520 */
3521 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
3522   int nEntry;                     /* Number of entries on one ptrmap page */
3523   Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
3524   Pgno nFin;                      /* Return value */
3525 
3526   nEntry = pBt->usableSize/5;
3527   nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3528   nFin = nOrig - nFree - nPtrmap;
3529   if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3530     nFin--;
3531   }
3532   while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3533     nFin--;
3534   }
3535 
3536   return nFin;
3537 }
3538 
3539 /*
3540 ** A write-transaction must be opened before calling this function.
3541 ** It performs a single unit of work towards an incremental vacuum.
3542 **
3543 ** If the incremental vacuum is finished after this function has run,
3544 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3545 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3546 */
3547 int sqlite3BtreeIncrVacuum(Btree *p){
3548   int rc;
3549   BtShared *pBt = p->pBt;
3550 
3551   sqlite3BtreeEnter(p);
3552   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3553   if( !pBt->autoVacuum ){
3554     rc = SQLITE_DONE;
3555   }else{
3556     Pgno nOrig = btreePagecount(pBt);
3557     Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
3558     Pgno nFin = finalDbSize(pBt, nOrig, nFree);
3559 
3560     if( nOrig<nFin ){
3561       rc = SQLITE_CORRUPT_BKPT;
3562     }else if( nFree>0 ){
3563       rc = saveAllCursors(pBt, 0, 0);
3564       if( rc==SQLITE_OK ){
3565         invalidateAllOverflowCache(pBt);
3566         rc = incrVacuumStep(pBt, nFin, nOrig, 0);
3567       }
3568       if( rc==SQLITE_OK ){
3569         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3570         put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3571       }
3572     }else{
3573       rc = SQLITE_DONE;
3574     }
3575   }
3576   sqlite3BtreeLeave(p);
3577   return rc;
3578 }
3579 
3580 /*
3581 ** This routine is called prior to sqlite3PagerCommit when a transaction
3582 ** is committed for an auto-vacuum database.
3583 **
3584 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3585 ** the database file should be truncated to during the commit process.
3586 ** i.e. the database has been reorganized so that only the first *pnTrunc
3587 ** pages are in use.
3588 */
3589 static int autoVacuumCommit(BtShared *pBt){
3590   int rc = SQLITE_OK;
3591   Pager *pPager = pBt->pPager;
3592   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
3593 
3594   assert( sqlite3_mutex_held(pBt->mutex) );
3595   invalidateAllOverflowCache(pBt);
3596   assert(pBt->autoVacuum);
3597   if( !pBt->incrVacuum ){
3598     Pgno nFin;         /* Number of pages in database after autovacuuming */
3599     Pgno nFree;        /* Number of pages on the freelist initially */
3600     Pgno iFree;        /* The next page to be freed */
3601     Pgno nOrig;        /* Database size before freeing */
3602 
3603     nOrig = btreePagecount(pBt);
3604     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3605       /* It is not possible to create a database for which the final page
3606       ** is either a pointer-map page or the pending-byte page. If one
3607       ** is encountered, this indicates corruption.
3608       */
3609       return SQLITE_CORRUPT_BKPT;
3610     }
3611 
3612     nFree = get4byte(&pBt->pPage1->aData[36]);
3613     nFin = finalDbSize(pBt, nOrig, nFree);
3614     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3615     if( nFin<nOrig ){
3616       rc = saveAllCursors(pBt, 0, 0);
3617     }
3618     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3619       rc = incrVacuumStep(pBt, nFin, iFree, 1);
3620     }
3621     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3622       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3623       put4byte(&pBt->pPage1->aData[32], 0);
3624       put4byte(&pBt->pPage1->aData[36], 0);
3625       put4byte(&pBt->pPage1->aData[28], nFin);
3626       pBt->bDoTruncate = 1;
3627       pBt->nPage = nFin;
3628     }
3629     if( rc!=SQLITE_OK ){
3630       sqlite3PagerRollback(pPager);
3631     }
3632   }
3633 
3634   assert( nRef>=sqlite3PagerRefcount(pPager) );
3635   return rc;
3636 }
3637 
3638 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3639 # define setChildPtrmaps(x) SQLITE_OK
3640 #endif
3641 
3642 /*
3643 ** This routine does the first phase of a two-phase commit.  This routine
3644 ** causes a rollback journal to be created (if it does not already exist)
3645 ** and populated with enough information so that if a power loss occurs
3646 ** the database can be restored to its original state by playing back
3647 ** the journal.  Then the contents of the journal are flushed out to
3648 ** the disk.  After the journal is safely on oxide, the changes to the
3649 ** database are written into the database file and flushed to oxide.
3650 ** At the end of this call, the rollback journal still exists on the
3651 ** disk and we are still holding all locks, so the transaction has not
3652 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3653 ** commit process.
3654 **
3655 ** This call is a no-op if no write-transaction is currently active on pBt.
3656 **
3657 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3658 ** the name of a master journal file that should be written into the
3659 ** individual journal file, or is NULL, indicating no master journal file
3660 ** (single database transaction).
3661 **
3662 ** When this is called, the master journal should already have been
3663 ** created, populated with this journal pointer and synced to disk.
3664 **
3665 ** Once this is routine has returned, the only thing required to commit
3666 ** the write-transaction for this database file is to delete the journal.
3667 */
3668 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3669   int rc = SQLITE_OK;
3670   if( p->inTrans==TRANS_WRITE ){
3671     BtShared *pBt = p->pBt;
3672     sqlite3BtreeEnter(p);
3673 #ifndef SQLITE_OMIT_AUTOVACUUM
3674     if( pBt->autoVacuum ){
3675       rc = autoVacuumCommit(pBt);
3676       if( rc!=SQLITE_OK ){
3677         sqlite3BtreeLeave(p);
3678         return rc;
3679       }
3680     }
3681     if( pBt->bDoTruncate ){
3682       sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
3683     }
3684 #endif
3685     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3686     sqlite3BtreeLeave(p);
3687   }
3688   return rc;
3689 }
3690 
3691 /*
3692 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3693 ** at the conclusion of a transaction.
3694 */
3695 static void btreeEndTransaction(Btree *p){
3696   BtShared *pBt = p->pBt;
3697   sqlite3 *db = p->db;
3698   assert( sqlite3BtreeHoldsMutex(p) );
3699 
3700 #ifndef SQLITE_OMIT_AUTOVACUUM
3701   pBt->bDoTruncate = 0;
3702 #endif
3703   if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
3704     /* If there are other active statements that belong to this database
3705     ** handle, downgrade to a read-only transaction. The other statements
3706     ** may still be reading from the database.  */
3707     downgradeAllSharedCacheTableLocks(p);
3708     p->inTrans = TRANS_READ;
3709   }else{
3710     /* If the handle had any kind of transaction open, decrement the
3711     ** transaction count of the shared btree. If the transaction count
3712     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
3713     ** call below will unlock the pager.  */
3714     if( p->inTrans!=TRANS_NONE ){
3715       clearAllSharedCacheTableLocks(p);
3716       pBt->nTransaction--;
3717       if( 0==pBt->nTransaction ){
3718         pBt->inTransaction = TRANS_NONE;
3719       }
3720     }
3721 
3722     /* Set the current transaction state to TRANS_NONE and unlock the
3723     ** pager if this call closed the only read or write transaction.  */
3724     p->inTrans = TRANS_NONE;
3725     unlockBtreeIfUnused(pBt);
3726   }
3727 
3728   btreeIntegrity(p);
3729 }
3730 
3731 /*
3732 ** Commit the transaction currently in progress.
3733 **
3734 ** This routine implements the second phase of a 2-phase commit.  The
3735 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3736 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
3737 ** routine did all the work of writing information out to disk and flushing the
3738 ** contents so that they are written onto the disk platter.  All this
3739 ** routine has to do is delete or truncate or zero the header in the
3740 ** the rollback journal (which causes the transaction to commit) and
3741 ** drop locks.
3742 **
3743 ** Normally, if an error occurs while the pager layer is attempting to
3744 ** finalize the underlying journal file, this function returns an error and
3745 ** the upper layer will attempt a rollback. However, if the second argument
3746 ** is non-zero then this b-tree transaction is part of a multi-file
3747 ** transaction. In this case, the transaction has already been committed
3748 ** (by deleting a master journal file) and the caller will ignore this
3749 ** functions return code. So, even if an error occurs in the pager layer,
3750 ** reset the b-tree objects internal state to indicate that the write
3751 ** transaction has been closed. This is quite safe, as the pager will have
3752 ** transitioned to the error state.
3753 **
3754 ** This will release the write lock on the database file.  If there
3755 ** are no active cursors, it also releases the read lock.
3756 */
3757 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
3758 
3759   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
3760   sqlite3BtreeEnter(p);
3761   btreeIntegrity(p);
3762 
3763   /* If the handle has a write-transaction open, commit the shared-btrees
3764   ** transaction and set the shared state to TRANS_READ.
3765   */
3766   if( p->inTrans==TRANS_WRITE ){
3767     int rc;
3768     BtShared *pBt = p->pBt;
3769     assert( pBt->inTransaction==TRANS_WRITE );
3770     assert( pBt->nTransaction>0 );
3771     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3772     if( rc!=SQLITE_OK && bCleanup==0 ){
3773       sqlite3BtreeLeave(p);
3774       return rc;
3775     }
3776     p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
3777     pBt->inTransaction = TRANS_READ;
3778     btreeClearHasContent(pBt);
3779   }
3780 
3781   btreeEndTransaction(p);
3782   sqlite3BtreeLeave(p);
3783   return SQLITE_OK;
3784 }
3785 
3786 /*
3787 ** Do both phases of a commit.
3788 */
3789 int sqlite3BtreeCommit(Btree *p){
3790   int rc;
3791   sqlite3BtreeEnter(p);
3792   rc = sqlite3BtreeCommitPhaseOne(p, 0);
3793   if( rc==SQLITE_OK ){
3794     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
3795   }
3796   sqlite3BtreeLeave(p);
3797   return rc;
3798 }
3799 
3800 /*
3801 ** This routine sets the state to CURSOR_FAULT and the error
3802 ** code to errCode for every cursor on any BtShared that pBtree
3803 ** references.  Or if the writeOnly flag is set to 1, then only
3804 ** trip write cursors and leave read cursors unchanged.
3805 **
3806 ** Every cursor is a candidate to be tripped, including cursors
3807 ** that belong to other database connections that happen to be
3808 ** sharing the cache with pBtree.
3809 **
3810 ** This routine gets called when a rollback occurs. If the writeOnly
3811 ** flag is true, then only write-cursors need be tripped - read-only
3812 ** cursors save their current positions so that they may continue
3813 ** following the rollback. Or, if writeOnly is false, all cursors are
3814 ** tripped. In general, writeOnly is false if the transaction being
3815 ** rolled back modified the database schema. In this case b-tree root
3816 ** pages may be moved or deleted from the database altogether, making
3817 ** it unsafe for read cursors to continue.
3818 **
3819 ** If the writeOnly flag is true and an error is encountered while
3820 ** saving the current position of a read-only cursor, all cursors,
3821 ** including all read-cursors are tripped.
3822 **
3823 ** SQLITE_OK is returned if successful, or if an error occurs while
3824 ** saving a cursor position, an SQLite error code.
3825 */
3826 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
3827   BtCursor *p;
3828   int rc = SQLITE_OK;
3829 
3830   assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
3831   if( pBtree ){
3832     sqlite3BtreeEnter(pBtree);
3833     for(p=pBtree->pBt->pCursor; p; p=p->pNext){
3834       int i;
3835       if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
3836         if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
3837           rc = saveCursorPosition(p);
3838           if( rc!=SQLITE_OK ){
3839             (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
3840             break;
3841           }
3842         }
3843       }else{
3844         sqlite3BtreeClearCursor(p);
3845         p->eState = CURSOR_FAULT;
3846         p->skipNext = errCode;
3847       }
3848       for(i=0; i<=p->iPage; i++){
3849         releasePage(p->apPage[i]);
3850         p->apPage[i] = 0;
3851       }
3852     }
3853     sqlite3BtreeLeave(pBtree);
3854   }
3855   return rc;
3856 }
3857 
3858 /*
3859 ** Rollback the transaction in progress.
3860 **
3861 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
3862 ** Only write cursors are tripped if writeOnly is true but all cursors are
3863 ** tripped if writeOnly is false.  Any attempt to use
3864 ** a tripped cursor will result in an error.
3865 **
3866 ** This will release the write lock on the database file.  If there
3867 ** are no active cursors, it also releases the read lock.
3868 */
3869 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
3870   int rc;
3871   BtShared *pBt = p->pBt;
3872   MemPage *pPage1;
3873 
3874   assert( writeOnly==1 || writeOnly==0 );
3875   assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
3876   sqlite3BtreeEnter(p);
3877   if( tripCode==SQLITE_OK ){
3878     rc = tripCode = saveAllCursors(pBt, 0, 0);
3879     if( rc ) writeOnly = 0;
3880   }else{
3881     rc = SQLITE_OK;
3882   }
3883   if( tripCode ){
3884     int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
3885     assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
3886     if( rc2!=SQLITE_OK ) rc = rc2;
3887   }
3888   btreeIntegrity(p);
3889 
3890   if( p->inTrans==TRANS_WRITE ){
3891     int rc2;
3892 
3893     assert( TRANS_WRITE==pBt->inTransaction );
3894     rc2 = sqlite3PagerRollback(pBt->pPager);
3895     if( rc2!=SQLITE_OK ){
3896       rc = rc2;
3897     }
3898 
3899     /* The rollback may have destroyed the pPage1->aData value.  So
3900     ** call btreeGetPage() on page 1 again to make
3901     ** sure pPage1->aData is set correctly. */
3902     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
3903       int nPage = get4byte(28+(u8*)pPage1->aData);
3904       testcase( nPage==0 );
3905       if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
3906       testcase( pBt->nPage!=nPage );
3907       pBt->nPage = nPage;
3908       releasePage(pPage1);
3909     }
3910     assert( countValidCursors(pBt, 1)==0 );
3911     pBt->inTransaction = TRANS_READ;
3912     btreeClearHasContent(pBt);
3913   }
3914 
3915   btreeEndTransaction(p);
3916   sqlite3BtreeLeave(p);
3917   return rc;
3918 }
3919 
3920 /*
3921 ** Start a statement subtransaction. The subtransaction can be rolled
3922 ** back independently of the main transaction. You must start a transaction
3923 ** before starting a subtransaction. The subtransaction is ended automatically
3924 ** if the main transaction commits or rolls back.
3925 **
3926 ** Statement subtransactions are used around individual SQL statements
3927 ** that are contained within a BEGIN...COMMIT block.  If a constraint
3928 ** error occurs within the statement, the effect of that one statement
3929 ** can be rolled back without having to rollback the entire transaction.
3930 **
3931 ** A statement sub-transaction is implemented as an anonymous savepoint. The
3932 ** value passed as the second parameter is the total number of savepoints,
3933 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3934 ** are no active savepoints and no other statement-transactions open,
3935 ** iStatement is 1. This anonymous savepoint can be released or rolled back
3936 ** using the sqlite3BtreeSavepoint() function.
3937 */
3938 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
3939   int rc;
3940   BtShared *pBt = p->pBt;
3941   sqlite3BtreeEnter(p);
3942   assert( p->inTrans==TRANS_WRITE );
3943   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
3944   assert( iStatement>0 );
3945   assert( iStatement>p->db->nSavepoint );
3946   assert( pBt->inTransaction==TRANS_WRITE );
3947   /* At the pager level, a statement transaction is a savepoint with
3948   ** an index greater than all savepoints created explicitly using
3949   ** SQL statements. It is illegal to open, release or rollback any
3950   ** such savepoints while the statement transaction savepoint is active.
3951   */
3952   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
3953   sqlite3BtreeLeave(p);
3954   return rc;
3955 }
3956 
3957 /*
3958 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
3959 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
3960 ** savepoint identified by parameter iSavepoint, depending on the value
3961 ** of op.
3962 **
3963 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
3964 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
3965 ** contents of the entire transaction are rolled back. This is different
3966 ** from a normal transaction rollback, as no locks are released and the
3967 ** transaction remains open.
3968 */
3969 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
3970   int rc = SQLITE_OK;
3971   if( p && p->inTrans==TRANS_WRITE ){
3972     BtShared *pBt = p->pBt;
3973     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
3974     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
3975     sqlite3BtreeEnter(p);
3976     rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
3977     if( rc==SQLITE_OK ){
3978       if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
3979         pBt->nPage = 0;
3980       }
3981       rc = newDatabase(pBt);
3982       pBt->nPage = get4byte(28 + pBt->pPage1->aData);
3983 
3984       /* The database size was written into the offset 28 of the header
3985       ** when the transaction started, so we know that the value at offset
3986       ** 28 is nonzero. */
3987       assert( pBt->nPage>0 );
3988     }
3989     sqlite3BtreeLeave(p);
3990   }
3991   return rc;
3992 }
3993 
3994 /*
3995 ** Create a new cursor for the BTree whose root is on the page
3996 ** iTable. If a read-only cursor is requested, it is assumed that
3997 ** the caller already has at least a read-only transaction open
3998 ** on the database already. If a write-cursor is requested, then
3999 ** the caller is assumed to have an open write transaction.
4000 **
4001 ** If wrFlag==0, then the cursor can only be used for reading.
4002 ** If wrFlag==1, then the cursor can be used for reading or for
4003 ** writing if other conditions for writing are also met.  These
4004 ** are the conditions that must be met in order for writing to
4005 ** be allowed:
4006 **
4007 ** 1:  The cursor must have been opened with wrFlag==1
4008 **
4009 ** 2:  Other database connections that share the same pager cache
4010 **     but which are not in the READ_UNCOMMITTED state may not have
4011 **     cursors open with wrFlag==0 on the same table.  Otherwise
4012 **     the changes made by this write cursor would be visible to
4013 **     the read cursors in the other database connection.
4014 **
4015 ** 3:  The database must be writable (not on read-only media)
4016 **
4017 ** 4:  There must be an active transaction.
4018 **
4019 ** No checking is done to make sure that page iTable really is the
4020 ** root page of a b-tree.  If it is not, then the cursor acquired
4021 ** will not work correctly.
4022 **
4023 ** It is assumed that the sqlite3BtreeCursorZero() has been called
4024 ** on pCur to initialize the memory space prior to invoking this routine.
4025 */
4026 static int btreeCursor(
4027   Btree *p,                              /* The btree */
4028   int iTable,                            /* Root page of table to open */
4029   int wrFlag,                            /* 1 to write. 0 read-only */
4030   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
4031   BtCursor *pCur                         /* Space for new cursor */
4032 ){
4033   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
4034   BtCursor *pX;                          /* Looping over other all cursors */
4035 
4036   assert( sqlite3BtreeHoldsMutex(p) );
4037   assert( wrFlag==0 || wrFlag==1 );
4038 
4039   /* The following assert statements verify that if this is a sharable
4040   ** b-tree database, the connection is holding the required table locks,
4041   ** and that no other connection has any open cursor that conflicts with
4042   ** this lock.  */
4043   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );
4044   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
4045 
4046   /* Assert that the caller has opened the required transaction. */
4047   assert( p->inTrans>TRANS_NONE );
4048   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
4049   assert( pBt->pPage1 && pBt->pPage1->aData );
4050   assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
4051 
4052   if( wrFlag ){
4053     allocateTempSpace(pBt);
4054     if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM;
4055   }
4056   if( iTable==1 && btreePagecount(pBt)==0 ){
4057     assert( wrFlag==0 );
4058     iTable = 0;
4059   }
4060 
4061   /* Now that no other errors can occur, finish filling in the BtCursor
4062   ** variables and link the cursor into the BtShared list.  */
4063   pCur->pgnoRoot = (Pgno)iTable;
4064   pCur->iPage = -1;
4065   pCur->pKeyInfo = pKeyInfo;
4066   pCur->pBtree = p;
4067   pCur->pBt = pBt;
4068   assert( wrFlag==0 || wrFlag==BTCF_WriteFlag );
4069   pCur->curFlags = wrFlag;
4070   pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
4071   /* If there are two or more cursors on the same btree, then all such
4072   ** cursors *must* have the BTCF_Multiple flag set. */
4073   for(pX=pBt->pCursor; pX; pX=pX->pNext){
4074     if( pX->pgnoRoot==(Pgno)iTable ){
4075       pX->curFlags |= BTCF_Multiple;
4076       pCur->curFlags |= BTCF_Multiple;
4077     }
4078   }
4079   pCur->pNext = pBt->pCursor;
4080   pBt->pCursor = pCur;
4081   pCur->eState = CURSOR_INVALID;
4082   return SQLITE_OK;
4083 }
4084 int sqlite3BtreeCursor(
4085   Btree *p,                                   /* The btree */
4086   int iTable,                                 /* Root page of table to open */
4087   int wrFlag,                                 /* 1 to write. 0 read-only */
4088   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
4089   BtCursor *pCur                              /* Write new cursor here */
4090 ){
4091   int rc;
4092   if( iTable<1 ){
4093     rc = SQLITE_CORRUPT_BKPT;
4094   }else{
4095     sqlite3BtreeEnter(p);
4096     rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4097     sqlite3BtreeLeave(p);
4098   }
4099   return rc;
4100 }
4101 
4102 /*
4103 ** Return the size of a BtCursor object in bytes.
4104 **
4105 ** This interfaces is needed so that users of cursors can preallocate
4106 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
4107 ** to users so they cannot do the sizeof() themselves - they must call
4108 ** this routine.
4109 */
4110 int sqlite3BtreeCursorSize(void){
4111   return ROUND8(sizeof(BtCursor));
4112 }
4113 
4114 /*
4115 ** Initialize memory that will be converted into a BtCursor object.
4116 **
4117 ** The simple approach here would be to memset() the entire object
4118 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
4119 ** do not need to be zeroed and they are large, so we can save a lot
4120 ** of run-time by skipping the initialization of those elements.
4121 */
4122 void sqlite3BtreeCursorZero(BtCursor *p){
4123   memset(p, 0, offsetof(BtCursor, iPage));
4124 }
4125 
4126 /*
4127 ** Close a cursor.  The read lock on the database file is released
4128 ** when the last cursor is closed.
4129 */
4130 int sqlite3BtreeCloseCursor(BtCursor *pCur){
4131   Btree *pBtree = pCur->pBtree;
4132   if( pBtree ){
4133     int i;
4134     BtShared *pBt = pCur->pBt;
4135     sqlite3BtreeEnter(pBtree);
4136     sqlite3BtreeClearCursor(pCur);
4137     assert( pBt->pCursor!=0 );
4138     if( pBt->pCursor==pCur ){
4139       pBt->pCursor = pCur->pNext;
4140     }else{
4141       BtCursor *pPrev = pBt->pCursor;
4142       do{
4143         if( pPrev->pNext==pCur ){
4144           pPrev->pNext = pCur->pNext;
4145           break;
4146         }
4147         pPrev = pPrev->pNext;
4148       }while( ALWAYS(pPrev) );
4149     }
4150     for(i=0; i<=pCur->iPage; i++){
4151       releasePage(pCur->apPage[i]);
4152     }
4153     unlockBtreeIfUnused(pBt);
4154     sqlite3_free(pCur->aOverflow);
4155     /* sqlite3_free(pCur); */
4156     sqlite3BtreeLeave(pBtree);
4157   }
4158   return SQLITE_OK;
4159 }
4160 
4161 /*
4162 ** Make sure the BtCursor* given in the argument has a valid
4163 ** BtCursor.info structure.  If it is not already valid, call
4164 ** btreeParseCell() to fill it in.
4165 **
4166 ** BtCursor.info is a cache of the information in the current cell.
4167 ** Using this cache reduces the number of calls to btreeParseCell().
4168 */
4169 #ifndef NDEBUG
4170   static void assertCellInfo(BtCursor *pCur){
4171     CellInfo info;
4172     int iPage = pCur->iPage;
4173     memset(&info, 0, sizeof(info));
4174     btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
4175     assert( CORRUPT_DB || memcmp(&info, &pCur->info, sizeof(info))==0 );
4176   }
4177 #else
4178   #define assertCellInfo(x)
4179 #endif
4180 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
4181   if( pCur->info.nSize==0 ){
4182     int iPage = pCur->iPage;
4183     pCur->curFlags |= BTCF_ValidNKey;
4184     btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
4185   }else{
4186     assertCellInfo(pCur);
4187   }
4188 }
4189 
4190 #ifndef NDEBUG  /* The next routine used only within assert() statements */
4191 /*
4192 ** Return true if the given BtCursor is valid.  A valid cursor is one
4193 ** that is currently pointing to a row in a (non-empty) table.
4194 ** This is a verification routine is used only within assert() statements.
4195 */
4196 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
4197   return pCur && pCur->eState==CURSOR_VALID;
4198 }
4199 #endif /* NDEBUG */
4200 
4201 /*
4202 ** Set *pSize to the size of the buffer needed to hold the value of
4203 ** the key for the current entry.  If the cursor is not pointing
4204 ** to a valid entry, *pSize is set to 0.
4205 **
4206 ** For a table with the INTKEY flag set, this routine returns the key
4207 ** itself, not the number of bytes in the key.
4208 **
4209 ** The caller must position the cursor prior to invoking this routine.
4210 **
4211 ** This routine cannot fail.  It always returns SQLITE_OK.
4212 */
4213 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
4214   assert( cursorHoldsMutex(pCur) );
4215   assert( pCur->eState==CURSOR_VALID );
4216   getCellInfo(pCur);
4217   *pSize = pCur->info.nKey;
4218   return SQLITE_OK;
4219 }
4220 
4221 /*
4222 ** Set *pSize to the number of bytes of data in the entry the
4223 ** cursor currently points to.
4224 **
4225 ** The caller must guarantee that the cursor is pointing to a non-NULL
4226 ** valid entry.  In other words, the calling procedure must guarantee
4227 ** that the cursor has Cursor.eState==CURSOR_VALID.
4228 **
4229 ** Failure is not possible.  This function always returns SQLITE_OK.
4230 ** It might just as well be a procedure (returning void) but we continue
4231 ** to return an integer result code for historical reasons.
4232 */
4233 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
4234   assert( cursorHoldsMutex(pCur) );
4235   assert( pCur->eState==CURSOR_VALID );
4236   assert( pCur->iPage>=0 );
4237   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4238   assert( pCur->apPage[pCur->iPage]->intKeyLeaf==1 );
4239   getCellInfo(pCur);
4240   *pSize = pCur->info.nPayload;
4241   return SQLITE_OK;
4242 }
4243 
4244 /*
4245 ** Given the page number of an overflow page in the database (parameter
4246 ** ovfl), this function finds the page number of the next page in the
4247 ** linked list of overflow pages. If possible, it uses the auto-vacuum
4248 ** pointer-map data instead of reading the content of page ovfl to do so.
4249 **
4250 ** If an error occurs an SQLite error code is returned. Otherwise:
4251 **
4252 ** The page number of the next overflow page in the linked list is
4253 ** written to *pPgnoNext. If page ovfl is the last page in its linked
4254 ** list, *pPgnoNext is set to zero.
4255 **
4256 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
4257 ** to page number ovfl was obtained, then *ppPage is set to point to that
4258 ** reference. It is the responsibility of the caller to call releasePage()
4259 ** on *ppPage to free the reference. If no reference was obtained (because
4260 ** the pointer-map was used to obtain the value for *pPgnoNext), then
4261 ** *ppPage is set to zero.
4262 */
4263 static int getOverflowPage(
4264   BtShared *pBt,               /* The database file */
4265   Pgno ovfl,                   /* Current overflow page number */
4266   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
4267   Pgno *pPgnoNext              /* OUT: Next overflow page number */
4268 ){
4269   Pgno next = 0;
4270   MemPage *pPage = 0;
4271   int rc = SQLITE_OK;
4272 
4273   assert( sqlite3_mutex_held(pBt->mutex) );
4274   assert(pPgnoNext);
4275 
4276 #ifndef SQLITE_OMIT_AUTOVACUUM
4277   /* Try to find the next page in the overflow list using the
4278   ** autovacuum pointer-map pages. Guess that the next page in
4279   ** the overflow list is page number (ovfl+1). If that guess turns
4280   ** out to be wrong, fall back to loading the data of page
4281   ** number ovfl to determine the next page number.
4282   */
4283   if( pBt->autoVacuum ){
4284     Pgno pgno;
4285     Pgno iGuess = ovfl+1;
4286     u8 eType;
4287 
4288     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
4289       iGuess++;
4290     }
4291 
4292     if( iGuess<=btreePagecount(pBt) ){
4293       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
4294       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
4295         next = iGuess;
4296         rc = SQLITE_DONE;
4297       }
4298     }
4299   }
4300 #endif
4301 
4302   assert( next==0 || rc==SQLITE_DONE );
4303   if( rc==SQLITE_OK ){
4304     rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
4305     assert( rc==SQLITE_OK || pPage==0 );
4306     if( rc==SQLITE_OK ){
4307       next = get4byte(pPage->aData);
4308     }
4309   }
4310 
4311   *pPgnoNext = next;
4312   if( ppPage ){
4313     *ppPage = pPage;
4314   }else{
4315     releasePage(pPage);
4316   }
4317   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
4318 }
4319 
4320 /*
4321 ** Copy data from a buffer to a page, or from a page to a buffer.
4322 **
4323 ** pPayload is a pointer to data stored on database page pDbPage.
4324 ** If argument eOp is false, then nByte bytes of data are copied
4325 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
4326 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
4327 ** of data are copied from the buffer pBuf to pPayload.
4328 **
4329 ** SQLITE_OK is returned on success, otherwise an error code.
4330 */
4331 static int copyPayload(
4332   void *pPayload,           /* Pointer to page data */
4333   void *pBuf,               /* Pointer to buffer */
4334   int nByte,                /* Number of bytes to copy */
4335   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
4336   DbPage *pDbPage           /* Page containing pPayload */
4337 ){
4338   if( eOp ){
4339     /* Copy data from buffer to page (a write operation) */
4340     int rc = sqlite3PagerWrite(pDbPage);
4341     if( rc!=SQLITE_OK ){
4342       return rc;
4343     }
4344     memcpy(pPayload, pBuf, nByte);
4345   }else{
4346     /* Copy data from page to buffer (a read operation) */
4347     memcpy(pBuf, pPayload, nByte);
4348   }
4349   return SQLITE_OK;
4350 }
4351 
4352 /*
4353 ** This function is used to read or overwrite payload information
4354 ** for the entry that the pCur cursor is pointing to. The eOp
4355 ** argument is interpreted as follows:
4356 **
4357 **   0: The operation is a read. Populate the overflow cache.
4358 **   1: The operation is a write. Populate the overflow cache.
4359 **   2: The operation is a read. Do not populate the overflow cache.
4360 **
4361 ** A total of "amt" bytes are read or written beginning at "offset".
4362 ** Data is read to or from the buffer pBuf.
4363 **
4364 ** The content being read or written might appear on the main page
4365 ** or be scattered out on multiple overflow pages.
4366 **
4367 ** If the current cursor entry uses one or more overflow pages and the
4368 ** eOp argument is not 2, this function may allocate space for and lazily
4369 ** populate the overflow page-list cache array (BtCursor.aOverflow).
4370 ** Subsequent calls use this cache to make seeking to the supplied offset
4371 ** more efficient.
4372 **
4373 ** Once an overflow page-list cache has been allocated, it may be
4374 ** invalidated if some other cursor writes to the same table, or if
4375 ** the cursor is moved to a different row. Additionally, in auto-vacuum
4376 ** mode, the following events may invalidate an overflow page-list cache.
4377 **
4378 **   * An incremental vacuum,
4379 **   * A commit in auto_vacuum="full" mode,
4380 **   * Creating a table (may require moving an overflow page).
4381 */
4382 static int accessPayload(
4383   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4384   u32 offset,          /* Begin reading this far into payload */
4385   u32 amt,             /* Read this many bytes */
4386   unsigned char *pBuf, /* Write the bytes into this buffer */
4387   int eOp              /* zero to read. non-zero to write. */
4388 ){
4389   unsigned char *aPayload;
4390   int rc = SQLITE_OK;
4391   int iIdx = 0;
4392   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
4393   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
4394 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4395   unsigned char * const pBufStart = pBuf;
4396   int bEnd;                                 /* True if reading to end of data */
4397 #endif
4398 
4399   assert( pPage );
4400   assert( pCur->eState==CURSOR_VALID );
4401   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4402   assert( cursorHoldsMutex(pCur) );
4403   assert( eOp!=2 || offset==0 );    /* Always start from beginning for eOp==2 */
4404 
4405   getCellInfo(pCur);
4406   aPayload = pCur->info.pPayload;
4407 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4408   bEnd = offset+amt==pCur->info.nPayload;
4409 #endif
4410   assert( offset+amt <= pCur->info.nPayload );
4411 
4412   if( &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] ){
4413     /* Trying to read or write past the end of the data is an error */
4414     return SQLITE_CORRUPT_BKPT;
4415   }
4416 
4417   /* Check if data must be read/written to/from the btree page itself. */
4418   if( offset<pCur->info.nLocal ){
4419     int a = amt;
4420     if( a+offset>pCur->info.nLocal ){
4421       a = pCur->info.nLocal - offset;
4422     }
4423     rc = copyPayload(&aPayload[offset], pBuf, a, (eOp & 0x01), pPage->pDbPage);
4424     offset = 0;
4425     pBuf += a;
4426     amt -= a;
4427   }else{
4428     offset -= pCur->info.nLocal;
4429   }
4430 
4431 
4432   if( rc==SQLITE_OK && amt>0 ){
4433     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
4434     Pgno nextPage;
4435 
4436     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
4437 
4438     /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
4439     ** Except, do not allocate aOverflow[] for eOp==2.
4440     **
4441     ** The aOverflow[] array is sized at one entry for each overflow page
4442     ** in the overflow chain. The page number of the first overflow page is
4443     ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
4444     ** means "not yet known" (the cache is lazily populated).
4445     */
4446     if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){
4447       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
4448       if( nOvfl>pCur->nOvflAlloc ){
4449         Pgno *aNew = (Pgno*)sqlite3Realloc(
4450             pCur->aOverflow, nOvfl*2*sizeof(Pgno)
4451         );
4452         if( aNew==0 ){
4453           rc = SQLITE_NOMEM;
4454         }else{
4455           pCur->nOvflAlloc = nOvfl*2;
4456           pCur->aOverflow = aNew;
4457         }
4458       }
4459       if( rc==SQLITE_OK ){
4460         memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
4461         pCur->curFlags |= BTCF_ValidOvfl;
4462       }
4463     }
4464 
4465     /* If the overflow page-list cache has been allocated and the
4466     ** entry for the first required overflow page is valid, skip
4467     ** directly to it.
4468     */
4469     if( (pCur->curFlags & BTCF_ValidOvfl)!=0
4470      && pCur->aOverflow[offset/ovflSize]
4471     ){
4472       iIdx = (offset/ovflSize);
4473       nextPage = pCur->aOverflow[iIdx];
4474       offset = (offset%ovflSize);
4475     }
4476 
4477     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
4478 
4479       /* If required, populate the overflow page-list cache. */
4480       if( (pCur->curFlags & BTCF_ValidOvfl)!=0 ){
4481         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
4482         pCur->aOverflow[iIdx] = nextPage;
4483       }
4484 
4485       if( offset>=ovflSize ){
4486         /* The only reason to read this page is to obtain the page
4487         ** number for the next page in the overflow chain. The page
4488         ** data is not required. So first try to lookup the overflow
4489         ** page-list cache, if any, then fall back to the getOverflowPage()
4490         ** function.
4491         **
4492         ** Note that the aOverflow[] array must be allocated because eOp!=2
4493         ** here.  If eOp==2, then offset==0 and this branch is never taken.
4494         */
4495         assert( eOp!=2 );
4496         assert( pCur->curFlags & BTCF_ValidOvfl );
4497         assert( pCur->pBtree->db==pBt->db );
4498         if( pCur->aOverflow[iIdx+1] ){
4499           nextPage = pCur->aOverflow[iIdx+1];
4500         }else{
4501           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
4502         }
4503         offset -= ovflSize;
4504       }else{
4505         /* Need to read this page properly. It contains some of the
4506         ** range of data that is being read (eOp==0) or written (eOp!=0).
4507         */
4508 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4509         sqlite3_file *fd;
4510 #endif
4511         int a = amt;
4512         if( a + offset > ovflSize ){
4513           a = ovflSize - offset;
4514         }
4515 
4516 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4517         /* If all the following are true:
4518         **
4519         **   1) this is a read operation, and
4520         **   2) data is required from the start of this overflow page, and
4521         **   3) the database is file-backed, and
4522         **   4) there is no open write-transaction, and
4523         **   5) the database is not a WAL database,
4524         **   6) all data from the page is being read.
4525         **   7) at least 4 bytes have already been read into the output buffer
4526         **
4527         ** then data can be read directly from the database file into the
4528         ** output buffer, bypassing the page-cache altogether. This speeds
4529         ** up loading large records that span many overflow pages.
4530         */
4531         if( (eOp&0x01)==0                                      /* (1) */
4532          && offset==0                                          /* (2) */
4533          && (bEnd || a==ovflSize)                              /* (6) */
4534          && pBt->inTransaction==TRANS_READ                     /* (4) */
4535          && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (3) */
4536          && pBt->pPage1->aData[19]==0x01                       /* (5) */
4537          && &pBuf[-4]>=pBufStart                               /* (7) */
4538         ){
4539           u8 aSave[4];
4540           u8 *aWrite = &pBuf[-4];
4541           assert( aWrite>=pBufStart );                         /* hence (7) */
4542           memcpy(aSave, aWrite, 4);
4543           rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
4544           nextPage = get4byte(aWrite);
4545           memcpy(aWrite, aSave, 4);
4546         }else
4547 #endif
4548 
4549         {
4550           DbPage *pDbPage;
4551           rc = sqlite3PagerAcquire(pBt->pPager, nextPage, &pDbPage,
4552               ((eOp&0x01)==0 ? PAGER_GET_READONLY : 0)
4553           );
4554           if( rc==SQLITE_OK ){
4555             aPayload = sqlite3PagerGetData(pDbPage);
4556             nextPage = get4byte(aPayload);
4557             rc = copyPayload(&aPayload[offset+4], pBuf, a, (eOp&0x01), pDbPage);
4558             sqlite3PagerUnref(pDbPage);
4559             offset = 0;
4560           }
4561         }
4562         amt -= a;
4563         pBuf += a;
4564       }
4565     }
4566   }
4567 
4568   if( rc==SQLITE_OK && amt>0 ){
4569     return SQLITE_CORRUPT_BKPT;
4570   }
4571   return rc;
4572 }
4573 
4574 /*
4575 ** Read part of the key associated with cursor pCur.  Exactly
4576 ** "amt" bytes will be transferred into pBuf[].  The transfer
4577 ** begins at "offset".
4578 **
4579 ** The caller must ensure that pCur is pointing to a valid row
4580 ** in the table.
4581 **
4582 ** Return SQLITE_OK on success or an error code if anything goes
4583 ** wrong.  An error is returned if "offset+amt" is larger than
4584 ** the available payload.
4585 */
4586 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4587   assert( cursorHoldsMutex(pCur) );
4588   assert( pCur->eState==CURSOR_VALID );
4589   assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4590   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4591   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
4592 }
4593 
4594 /*
4595 ** Read part of the data associated with cursor pCur.  Exactly
4596 ** "amt" bytes will be transferred into pBuf[].  The transfer
4597 ** begins at "offset".
4598 **
4599 ** Return SQLITE_OK on success or an error code if anything goes
4600 ** wrong.  An error is returned if "offset+amt" is larger than
4601 ** the available payload.
4602 */
4603 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4604   int rc;
4605 
4606 #ifndef SQLITE_OMIT_INCRBLOB
4607   if ( pCur->eState==CURSOR_INVALID ){
4608     return SQLITE_ABORT;
4609   }
4610 #endif
4611 
4612   assert( cursorHoldsMutex(pCur) );
4613   rc = restoreCursorPosition(pCur);
4614   if( rc==SQLITE_OK ){
4615     assert( pCur->eState==CURSOR_VALID );
4616     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4617     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4618     rc = accessPayload(pCur, offset, amt, pBuf, 0);
4619   }
4620   return rc;
4621 }
4622 
4623 /*
4624 ** Return a pointer to payload information from the entry that the
4625 ** pCur cursor is pointing to.  The pointer is to the beginning of
4626 ** the key for index btrees (pPage->intKey==0) and is the data for
4627 ** table btrees (pPage->intKey==1). The number of bytes of available
4628 ** key/data is written into *pAmt.  If *pAmt==0, then the value
4629 ** returned will not be a valid pointer.
4630 **
4631 ** This routine is an optimization.  It is common for the entire key
4632 ** and data to fit on the local page and for there to be no overflow
4633 ** pages.  When that is so, this routine can be used to access the
4634 ** key and data without making a copy.  If the key and/or data spills
4635 ** onto overflow pages, then accessPayload() must be used to reassemble
4636 ** the key/data and copy it into a preallocated buffer.
4637 **
4638 ** The pointer returned by this routine looks directly into the cached
4639 ** page of the database.  The data might change or move the next time
4640 ** any btree routine is called.
4641 */
4642 static const void *fetchPayload(
4643   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4644   u32 *pAmt            /* Write the number of available bytes here */
4645 ){
4646   u32 amt;
4647   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
4648   assert( pCur->eState==CURSOR_VALID );
4649   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4650   assert( cursorHoldsMutex(pCur) );
4651   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4652   assert( pCur->info.nSize>0 );
4653   assert( pCur->info.pPayload>pCur->apPage[pCur->iPage]->aData || CORRUPT_DB );
4654   assert( pCur->info.pPayload<pCur->apPage[pCur->iPage]->aDataEnd ||CORRUPT_DB);
4655   amt = (int)(pCur->apPage[pCur->iPage]->aDataEnd - pCur->info.pPayload);
4656   if( pCur->info.nLocal<amt ) amt = pCur->info.nLocal;
4657   *pAmt = amt;
4658   return (void*)pCur->info.pPayload;
4659 }
4660 
4661 
4662 /*
4663 ** For the entry that cursor pCur is pointing to, return as
4664 ** many bytes of the key or data as are available on the local
4665 ** b-tree page.  Write the number of available bytes into *pAmt.
4666 **
4667 ** The pointer returned is ephemeral.  The key/data may move
4668 ** or be destroyed on the next call to any Btree routine,
4669 ** including calls from other threads against the same cache.
4670 ** Hence, a mutex on the BtShared should be held prior to calling
4671 ** this routine.
4672 **
4673 ** These routines are used to get quick access to key and data
4674 ** in the common case where no overflow pages are used.
4675 */
4676 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, u32 *pAmt){
4677   return fetchPayload(pCur, pAmt);
4678 }
4679 const void *sqlite3BtreeDataFetch(BtCursor *pCur, u32 *pAmt){
4680   return fetchPayload(pCur, pAmt);
4681 }
4682 
4683 
4684 /*
4685 ** Move the cursor down to a new child page.  The newPgno argument is the
4686 ** page number of the child page to move to.
4687 **
4688 ** This function returns SQLITE_CORRUPT if the page-header flags field of
4689 ** the new child page does not match the flags field of the parent (i.e.
4690 ** if an intkey page appears to be the parent of a non-intkey page, or
4691 ** vice-versa).
4692 */
4693 static int moveToChild(BtCursor *pCur, u32 newPgno){
4694   BtShared *pBt = pCur->pBt;
4695 
4696   assert( cursorHoldsMutex(pCur) );
4697   assert( pCur->eState==CURSOR_VALID );
4698   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4699   assert( pCur->iPage>=0 );
4700   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
4701     return SQLITE_CORRUPT_BKPT;
4702   }
4703   pCur->info.nSize = 0;
4704   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4705   pCur->iPage++;
4706   pCur->aiIdx[pCur->iPage] = 0;
4707   return getAndInitPage(pBt, newPgno, &pCur->apPage[pCur->iPage],
4708                         pCur, pCur->curPagerFlags);
4709 }
4710 
#if SQLITE_DEBUG
/*
** Page pParent is an internal (non-leaf) tree page. This function
** asserts that page number iChild is the left-child of the iIdx'th
** cell in page pParent. Or, if iIdx is equal to the total number of
** cells in pParent, that page number iChild is the right-child of
** the page.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  /* The conditions tested below might not be true in a corrupt database */
  if( CORRUPT_DB ) return;

  assert( iIdx<=pParent->nCell );
  if( iIdx!=pParent->nCell ){
    /* Ordinary cell: its first four bytes hold the left-child pointer */
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    /* One past the last cell: compare with the right-child pointer that
    ** is stored 8 bytes into the page header */
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z)
#endif
4732 
4733 /*
4734 ** Move the cursor up to the parent page.
4735 **
4736 ** pCur->idx is set to the cell index that contains the pointer
4737 ** to the page we are coming from.  If we are coming from the
4738 ** right-most child page then pCur->idx is set to one more than
4739 ** the largest cell index.
4740 */
4741 static void moveToParent(BtCursor *pCur){
4742   assert( cursorHoldsMutex(pCur) );
4743   assert( pCur->eState==CURSOR_VALID );
4744   assert( pCur->iPage>0 );
4745   assert( pCur->apPage[pCur->iPage] );
4746   assertParentIndex(
4747     pCur->apPage[pCur->iPage-1],
4748     pCur->aiIdx[pCur->iPage-1],
4749     pCur->apPage[pCur->iPage]->pgno
4750   );
4751   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
4752   pCur->info.nSize = 0;
4753   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4754   releasePageNotNull(pCur->apPage[pCur->iPage--]);
4755 }
4756 
4757 /*
4758 ** Move the cursor to point to the root page of its b-tree structure.
4759 **
4760 ** If the table has a virtual root page, then the cursor is moved to point
4761 ** to the virtual root page instead of the actual root page. A table has a
4762 ** virtual root page when the actual root page contains no cells and a
4763 ** single child page. This can only happen with the table rooted at page 1.
4764 **
4765 ** If the b-tree structure is empty, the cursor state is set to
4766 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
4767 ** cell located on the root (or virtual root) page and the cursor state
4768 ** is set to CURSOR_VALID.
4769 **
4770 ** If this function returns successfully, it may be assumed that the
4771 ** page-header flags indicate that the [virtual] root-page is the expected
4772 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
4773 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4774 ** indicating a table b-tree, or if the caller did specify a KeyInfo
4775 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4776 ** b-tree).
4777 */
4778 static int moveToRoot(BtCursor *pCur){
4779   MemPage *pRoot;
4780   int rc = SQLITE_OK;
4781 
4782   assert( cursorHoldsMutex(pCur) );
4783   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4784   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
4785   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
4786   if( pCur->eState>=CURSOR_REQUIRESEEK ){
4787     if( pCur->eState==CURSOR_FAULT ){
4788       assert( pCur->skipNext!=SQLITE_OK );
4789       return pCur->skipNext;
4790     }
4791     sqlite3BtreeClearCursor(pCur);
4792   }
4793 
4794   if( pCur->iPage>=0 ){
4795     while( pCur->iPage ){
4796       assert( pCur->apPage[pCur->iPage]!=0 );
4797       releasePageNotNull(pCur->apPage[pCur->iPage--]);
4798     }
4799   }else if( pCur->pgnoRoot==0 ){
4800     pCur->eState = CURSOR_INVALID;
4801     return SQLITE_OK;
4802   }else{
4803     assert( pCur->iPage==(-1) );
4804     rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0],
4805                         0, pCur->curPagerFlags);
4806     if( rc!=SQLITE_OK ){
4807       pCur->eState = CURSOR_INVALID;
4808       return rc;
4809     }
4810     pCur->iPage = 0;
4811     pCur->curIntKey = pCur->apPage[0]->intKey;
4812   }
4813   pRoot = pCur->apPage[0];
4814   assert( pRoot->pgno==pCur->pgnoRoot );
4815 
4816   /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4817   ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4818   ** NULL, the caller expects a table b-tree. If this is not the case,
4819   ** return an SQLITE_CORRUPT error.
4820   **
4821   ** Earlier versions of SQLite assumed that this test could not fail
4822   ** if the root page was already loaded when this function was called (i.e.
4823   ** if pCur->iPage>=0). But this is not so if the database is corrupted
4824   ** in such a way that page pRoot is linked into a second b-tree table
4825   ** (or the freelist).  */
4826   assert( pRoot->intKey==1 || pRoot->intKey==0 );
4827   if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
4828     return SQLITE_CORRUPT_BKPT;
4829   }
4830 
4831   pCur->aiIdx[0] = 0;
4832   pCur->info.nSize = 0;
4833   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
4834 
4835   if( pRoot->nCell>0 ){
4836     pCur->eState = CURSOR_VALID;
4837   }else if( !pRoot->leaf ){
4838     Pgno subpage;
4839     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
4840     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
4841     pCur->eState = CURSOR_VALID;
4842     rc = moveToChild(pCur, subpage);
4843   }else{
4844     pCur->eState = CURSOR_INVALID;
4845   }
4846   return rc;
4847 }
4848 
4849 /*
4850 ** Move the cursor down to the left-most leaf entry beneath the
4851 ** entry to which it is currently pointing.
4852 **
4853 ** The left-most leaf is the one with the smallest key - the first
4854 ** in ascending order.
4855 */
4856 static int moveToLeftmost(BtCursor *pCur){
4857   Pgno pgno;
4858   int rc = SQLITE_OK;
4859   MemPage *pPage;
4860 
4861   assert( cursorHoldsMutex(pCur) );
4862   assert( pCur->eState==CURSOR_VALID );
4863   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4864     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4865     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
4866     rc = moveToChild(pCur, pgno);
4867   }
4868   return rc;
4869 }
4870 
4871 /*
4872 ** Move the cursor down to the right-most leaf entry beneath the
4873 ** page to which it is currently pointing.  Notice the difference
4874 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
4875 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4876 ** finds the right-most entry beneath the *page*.
4877 **
4878 ** The right-most entry is the one with the largest key - the last
4879 ** key in ascending order.
4880 */
4881 static int moveToRightmost(BtCursor *pCur){
4882   Pgno pgno;
4883   int rc = SQLITE_OK;
4884   MemPage *pPage = 0;
4885 
4886   assert( cursorHoldsMutex(pCur) );
4887   assert( pCur->eState==CURSOR_VALID );
4888   while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4889     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4890     pCur->aiIdx[pCur->iPage] = pPage->nCell;
4891     rc = moveToChild(pCur, pgno);
4892     if( rc ) return rc;
4893   }
4894   pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
4895   assert( pCur->info.nSize==0 );
4896   assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
4897   return SQLITE_OK;
4898 }
4899 
4900 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
4901 ** on success.  Set *pRes to 0 if the cursor actually points to something
4902 ** or set *pRes to 1 if the table is empty.
4903 */
4904 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
4905   int rc;
4906 
4907   assert( cursorHoldsMutex(pCur) );
4908   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4909   rc = moveToRoot(pCur);
4910   if( rc==SQLITE_OK ){
4911     if( pCur->eState==CURSOR_INVALID ){
4912       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4913       *pRes = 1;
4914     }else{
4915       assert( pCur->apPage[pCur->iPage]->nCell>0 );
4916       *pRes = 0;
4917       rc = moveToLeftmost(pCur);
4918     }
4919   }
4920   return rc;
4921 }
4922 
4923 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
4924 ** on success.  Set *pRes to 0 if the cursor actually points to something
4925 ** or set *pRes to 1 if the table is empty.
4926 */
4927 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
4928   int rc;
4929 
4930   assert( cursorHoldsMutex(pCur) );
4931   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4932 
4933   /* If the cursor already points to the last entry, this is a no-op. */
4934   if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
4935 #ifdef SQLITE_DEBUG
4936     /* This block serves to assert() that the cursor really does point
4937     ** to the last entry in the b-tree. */
4938     int ii;
4939     for(ii=0; ii<pCur->iPage; ii++){
4940       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
4941     }
4942     assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
4943     assert( pCur->apPage[pCur->iPage]->leaf );
4944 #endif
4945     return SQLITE_OK;
4946   }
4947 
4948   rc = moveToRoot(pCur);
4949   if( rc==SQLITE_OK ){
4950     if( CURSOR_INVALID==pCur->eState ){
4951       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4952       *pRes = 1;
4953     }else{
4954       assert( pCur->eState==CURSOR_VALID );
4955       *pRes = 0;
4956       rc = moveToRightmost(pCur);
4957       if( rc==SQLITE_OK ){
4958         pCur->curFlags |= BTCF_AtLast;
4959       }else{
4960         pCur->curFlags &= ~BTCF_AtLast;
4961       }
4962 
4963     }
4964   }
4965   return rc;
4966 }
4967 
4968 /* Move the cursor so that it points to an entry near the key
4969 ** specified by pIdxKey or intKey.   Return a success code.
4970 **
4971 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
4972 ** must be NULL.  For index tables, pIdxKey is used and intKey
4973 ** is ignored.
4974 **
4975 ** If an exact match is not found, then the cursor is always
4976 ** left pointing at a leaf page which would hold the entry if it
4977 ** were present.  The cursor might point to an entry that comes
4978 ** before or after the key.
4979 **
4980 ** An integer is written into *pRes which is the result of
4981 ** comparing the key with the entry to which the cursor is
4982 ** pointing.  The meaning of the integer written into
4983 ** *pRes is as follows:
4984 **
4985 **     *pRes<0      The cursor is left pointing at an entry that
4986 **                  is smaller than intKey/pIdxKey or if the table is empty
4987 **                  and the cursor is therefore left point to nothing.
4988 **
4989 **     *pRes==0     The cursor is left pointing at an entry that
4990 **                  exactly matches intKey/pIdxKey.
4991 **
4992 **     *pRes>0      The cursor is left pointing at an entry that
4993 **                  is larger than intKey/pIdxKey.
4994 **
4995 */
4996 int sqlite3BtreeMovetoUnpacked(
4997   BtCursor *pCur,          /* The cursor to be moved */
4998   UnpackedRecord *pIdxKey, /* Unpacked index key */
4999   i64 intKey,              /* The table key */
5000   int biasRight,           /* If true, bias the search to the high end */
5001   int *pRes                /* Write search results here */
5002 ){
5003   int rc;
5004   RecordCompare xRecordCompare;
5005 
5006   assert( cursorHoldsMutex(pCur) );
5007   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5008   assert( pRes );
5009   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
5010 
5011   /* If the cursor is already positioned at the point we are trying
5012   ** to move to, then just return without doing any work */
5013   if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
5014    && pCur->curIntKey
5015   ){
5016     if( pCur->info.nKey==intKey ){
5017       *pRes = 0;
5018       return SQLITE_OK;
5019     }
5020     if( (pCur->curFlags & BTCF_AtLast)!=0 && pCur->info.nKey<intKey ){
5021       *pRes = -1;
5022       return SQLITE_OK;
5023     }
5024   }
5025 
5026   if( pIdxKey ){
5027     xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
5028     pIdxKey->errCode = 0;
5029     assert( pIdxKey->default_rc==1
5030          || pIdxKey->default_rc==0
5031          || pIdxKey->default_rc==-1
5032     );
5033   }else{
5034     xRecordCompare = 0; /* All keys are integers */
5035   }
5036 
5037   rc = moveToRoot(pCur);
5038   if( rc ){
5039     return rc;
5040   }
5041   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage] );
5042   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->isInit );
5043   assert( pCur->eState==CURSOR_INVALID || pCur->apPage[pCur->iPage]->nCell>0 );
5044   if( pCur->eState==CURSOR_INVALID ){
5045     *pRes = -1;
5046     assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
5047     return SQLITE_OK;
5048   }
5049   assert( pCur->apPage[0]->intKey==pCur->curIntKey );
5050   assert( pCur->curIntKey || pIdxKey );
5051   for(;;){
5052     int lwr, upr, idx, c;
5053     Pgno chldPg;
5054     MemPage *pPage = pCur->apPage[pCur->iPage];
5055     u8 *pCell;                          /* Pointer to current cell in pPage */
5056 
5057     /* pPage->nCell must be greater than zero. If this is the root-page
5058     ** the cursor would have been INVALID above and this for(;;) loop
5059     ** not run. If this is not the root-page, then the moveToChild() routine
5060     ** would have already detected db corruption. Similarly, pPage must
5061     ** be the right kind (index or table) of b-tree page. Otherwise
5062     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
5063     assert( pPage->nCell>0 );
5064     assert( pPage->intKey==(pIdxKey==0) );
5065     lwr = 0;
5066     upr = pPage->nCell-1;
5067     assert( biasRight==0 || biasRight==1 );
5068     idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
5069     pCur->aiIdx[pCur->iPage] = (u16)idx;
5070     if( xRecordCompare==0 ){
5071       for(;;){
5072         i64 nCellKey;
5073         pCell = findCellPastPtr(pPage, idx);
5074         if( pPage->intKeyLeaf ){
5075           while( 0x80 <= *(pCell++) ){
5076             if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
5077           }
5078         }
5079         getVarint(pCell, (u64*)&nCellKey);
5080         if( nCellKey<intKey ){
5081           lwr = idx+1;
5082           if( lwr>upr ){ c = -1; break; }
5083         }else if( nCellKey>intKey ){
5084           upr = idx-1;
5085           if( lwr>upr ){ c = +1; break; }
5086         }else{
5087           assert( nCellKey==intKey );
5088           pCur->curFlags |= BTCF_ValidNKey;
5089           pCur->info.nKey = nCellKey;
5090           pCur->aiIdx[pCur->iPage] = (u16)idx;
5091           if( !pPage->leaf ){
5092             lwr = idx;
5093             goto moveto_next_layer;
5094           }else{
5095             *pRes = 0;
5096             rc = SQLITE_OK;
5097             goto moveto_finish;
5098           }
5099         }
5100         assert( lwr+upr>=0 );
5101         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
5102       }
5103     }else{
5104       for(;;){
5105         int nCell;  /* Size of the pCell cell in bytes */
5106         pCell = findCellPastPtr(pPage, idx);
5107 
5108         /* The maximum supported page-size is 65536 bytes. This means that
5109         ** the maximum number of record bytes stored on an index B-Tree
5110         ** page is less than 16384 bytes and may be stored as a 2-byte
5111         ** varint. This information is used to attempt to avoid parsing
5112         ** the entire cell by checking for the cases where the record is
5113         ** stored entirely within the b-tree page by inspecting the first
5114         ** 2 bytes of the cell.
5115         */
5116         nCell = pCell[0];
5117         if( nCell<=pPage->max1bytePayload ){
5118           /* This branch runs if the record-size field of the cell is a
5119           ** single byte varint and the record fits entirely on the main
5120           ** b-tree page.  */
5121           testcase( pCell+nCell+1==pPage->aDataEnd );
5122           c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
5123         }else if( !(pCell[1] & 0x80)
5124           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
5125         ){
5126           /* The record-size field is a 2 byte varint and the record
5127           ** fits entirely on the main b-tree page.  */
5128           testcase( pCell+nCell+2==pPage->aDataEnd );
5129           c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
5130         }else{
5131           /* The record flows over onto one or more overflow pages. In
5132           ** this case the whole cell needs to be parsed, a buffer allocated
5133           ** and accessPayload() used to retrieve the record into the
5134           ** buffer before VdbeRecordCompare() can be called.
5135           **
5136           ** If the record is corrupt, the xRecordCompare routine may read
5137           ** up to two varints past the end of the buffer. An extra 18
5138           ** bytes of padding is allocated at the end of the buffer in
5139           ** case this happens.  */
5140           void *pCellKey;
5141           u8 * const pCellBody = pCell - pPage->childPtrSize;
5142           pPage->xParseCell(pPage, pCellBody, &pCur->info);
5143           nCell = (int)pCur->info.nKey;
5144           testcase( nCell<0 );   /* True if key size is 2^32 or more */
5145           testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
5146           testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
5147           testcase( nCell==2 );  /* Minimum legal index key size */
5148           if( nCell<2 ){
5149             rc = SQLITE_CORRUPT_BKPT;
5150             goto moveto_finish;
5151           }
5152           pCellKey = sqlite3Malloc( nCell+18 );
5153           if( pCellKey==0 ){
5154             rc = SQLITE_NOMEM;
5155             goto moveto_finish;
5156           }
5157           pCur->aiIdx[pCur->iPage] = (u16)idx;
5158           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 2);
5159           if( rc ){
5160             sqlite3_free(pCellKey);
5161             goto moveto_finish;
5162           }
5163           c = xRecordCompare(nCell, pCellKey, pIdxKey);
5164           sqlite3_free(pCellKey);
5165         }
5166         assert(
5167             (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
5168          && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
5169         );
5170         if( c<0 ){
5171           lwr = idx+1;
5172         }else if( c>0 ){
5173           upr = idx-1;
5174         }else{
5175           assert( c==0 );
5176           *pRes = 0;
5177           rc = SQLITE_OK;
5178           pCur->aiIdx[pCur->iPage] = (u16)idx;
5179           if( pIdxKey->errCode ) rc = SQLITE_CORRUPT;
5180           goto moveto_finish;
5181         }
5182         if( lwr>upr ) break;
5183         assert( lwr+upr>=0 );
5184         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
5185       }
5186     }
5187     assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
5188     assert( pPage->isInit );
5189     if( pPage->leaf ){
5190       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
5191       pCur->aiIdx[pCur->iPage] = (u16)idx;
5192       *pRes = c;
5193       rc = SQLITE_OK;
5194       goto moveto_finish;
5195     }
5196 moveto_next_layer:
5197     if( lwr>=pPage->nCell ){
5198       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5199     }else{
5200       chldPg = get4byte(findCell(pPage, lwr));
5201     }
5202     pCur->aiIdx[pCur->iPage] = (u16)lwr;
5203     rc = moveToChild(pCur, chldPg);
5204     if( rc ) break;
5205   }
5206 moveto_finish:
5207   pCur->info.nSize = 0;
5208   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5209   return rc;
5210 }
5211 
5212 
5213 /*
5214 ** Return TRUE if the cursor is not pointing at an entry of the table.
5215 **
5216 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
5217 ** past the last entry in the table or sqlite3BtreePrev() moves past
5218 ** the first entry.  TRUE is also returned if the table is empty.
5219 */
5220 int sqlite3BtreeEof(BtCursor *pCur){
5221   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
5222   ** have been deleted? This API will need to change to return an error code
5223   ** as well as the boolean result value.
5224   */
5225   return (CURSOR_VALID!=pCur->eState);
5226 }
5227 
5228 /*
5229 ** Advance the cursor to the next entry in the database.  If
5230 ** successful then set *pRes=0.  If the cursor
5231 ** was already pointing to the last entry in the database before
5232 ** this routine was called, then set *pRes=1.
5233 **
5234 ** The main entry point is sqlite3BtreeNext().  That routine is optimized
5235 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
5236 ** to the next cell on the current page.  The (slower) btreeNext() helper
5237 ** routine is called when it is necessary to move to a different page or
5238 ** to restore the cursor.
5239 **
5240 ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
5241 ** will be 1 if the cursor being stepped corresponds to an SQL index and
5242 ** if this routine could have been skipped if that SQL index had been
5243 ** a unique index.  Otherwise the caller will have set *pRes to zero.
5244 ** Zero is the common case. The btree implementation is free to use the
5245 ** initial *pRes value as a hint to improve performance, but the current
5246 ** SQLite btree implementation does not. (Note that the comdb2 btree
5247 ** implementation does use this hint, however.)
5248 */
5249 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur, int *pRes){
5250   int rc;
5251   int idx;
5252   MemPage *pPage;
5253 
5254   assert( cursorHoldsMutex(pCur) );
5255   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5256   assert( *pRes==0 );
5257   if( pCur->eState!=CURSOR_VALID ){
5258     assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5259     rc = restoreCursorPosition(pCur);
5260     if( rc!=SQLITE_OK ){
5261       return rc;
5262     }
5263     if( CURSOR_INVALID==pCur->eState ){
5264       *pRes = 1;
5265       return SQLITE_OK;
5266     }
5267     if( pCur->skipNext ){
5268       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
5269       pCur->eState = CURSOR_VALID;
5270       if( pCur->skipNext>0 ){
5271         pCur->skipNext = 0;
5272         return SQLITE_OK;
5273       }
5274       pCur->skipNext = 0;
5275     }
5276   }
5277 
5278   pPage = pCur->apPage[pCur->iPage];
5279   idx = ++pCur->aiIdx[pCur->iPage];
5280   assert( pPage->isInit );
5281 
5282   /* If the database file is corrupt, it is possible for the value of idx
5283   ** to be invalid here. This can only occur if a second cursor modifies
5284   ** the page while cursor pCur is holding a reference to it. Which can
5285   ** only happen if the database is corrupt in such a way as to link the
5286   ** page into more than one b-tree structure. */
5287   testcase( idx>pPage->nCell );
5288 
5289   if( idx>=pPage->nCell ){
5290     if( !pPage->leaf ){
5291       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
5292       if( rc ) return rc;
5293       return moveToLeftmost(pCur);
5294     }
5295     do{
5296       if( pCur->iPage==0 ){
5297         *pRes = 1;
5298         pCur->eState = CURSOR_INVALID;
5299         return SQLITE_OK;
5300       }
5301       moveToParent(pCur);
5302       pPage = pCur->apPage[pCur->iPage];
5303     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
5304     if( pPage->intKey ){
5305       return sqlite3BtreeNext(pCur, pRes);
5306     }else{
5307       return SQLITE_OK;
5308     }
5309   }
5310   if( pPage->leaf ){
5311     return SQLITE_OK;
5312   }else{
5313     return moveToLeftmost(pCur);
5314   }
5315 }
5316 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
5317   MemPage *pPage;
5318   assert( cursorHoldsMutex(pCur) );
5319   assert( pRes!=0 );
5320   assert( *pRes==0 || *pRes==1 );
5321   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5322   pCur->info.nSize = 0;
5323   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5324   *pRes = 0;
5325   if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);
5326   pPage = pCur->apPage[pCur->iPage];
5327   if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){
5328     pCur->aiIdx[pCur->iPage]--;
5329     return btreeNext(pCur, pRes);
5330   }
5331   if( pPage->leaf ){
5332     return SQLITE_OK;
5333   }else{
5334     return moveToLeftmost(pCur);
5335   }
5336 }
5337 
5338 /*
5339 ** Step the cursor to the back to the previous entry in the database.  If
5340 ** successful then set *pRes=0.  If the cursor
5341 ** was already pointing to the first entry in the database before
5342 ** this routine was called, then set *pRes=1.
5343 **
5344 ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
5345 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
5346 ** to the previous cell on the current page.  The (slower) btreePrevious()
5347 ** helper routine is called when it is necessary to move to a different page
5348 ** or to restore the cursor.
5349 **
5350 ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
5351 ** will be 1 if the cursor being stepped corresponds to an SQL index and
5352 ** if this routine could have been skipped if that SQL index had been
5353 ** a unique index.  Otherwise the caller will have set *pRes to zero.
5354 ** Zero is the common case. The btree implementation is free to use the
5355 ** initial *pRes value as a hint to improve performance, but the current
5356 ** SQLite btree implementation does not. (Note that the comdb2 btree
5357 ** implementation does use this hint, however.)
5358 */
5359 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur, int *pRes){
5360   int rc;
5361   MemPage *pPage;
5362 
5363   assert( cursorHoldsMutex(pCur) );
5364   assert( pRes!=0 );
5365   assert( *pRes==0 );
5366   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5367   assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
5368   assert( pCur->info.nSize==0 );
5369   if( pCur->eState!=CURSOR_VALID ){
5370     rc = restoreCursorPosition(pCur);
5371     if( rc!=SQLITE_OK ){
5372       return rc;
5373     }
5374     if( CURSOR_INVALID==pCur->eState ){
5375       *pRes = 1;
5376       return SQLITE_OK;
5377     }
5378     if( pCur->skipNext ){
5379       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
5380       pCur->eState = CURSOR_VALID;
5381       if( pCur->skipNext<0 ){
5382         pCur->skipNext = 0;
5383         return SQLITE_OK;
5384       }
5385       pCur->skipNext = 0;
5386     }
5387   }
5388 
5389   pPage = pCur->apPage[pCur->iPage];
5390   assert( pPage->isInit );
5391   if( !pPage->leaf ){
5392     int idx = pCur->aiIdx[pCur->iPage];
5393     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
5394     if( rc ) return rc;
5395     rc = moveToRightmost(pCur);
5396   }else{
5397     while( pCur->aiIdx[pCur->iPage]==0 ){
5398       if( pCur->iPage==0 ){
5399         pCur->eState = CURSOR_INVALID;
5400         *pRes = 1;
5401         return SQLITE_OK;
5402       }
5403       moveToParent(pCur);
5404     }
5405     assert( pCur->info.nSize==0 );
5406     assert( (pCur->curFlags & (BTCF_ValidNKey|BTCF_ValidOvfl))==0 );
5407 
5408     pCur->aiIdx[pCur->iPage]--;
5409     pPage = pCur->apPage[pCur->iPage];
5410     if( pPage->intKey && !pPage->leaf ){
5411       rc = sqlite3BtreePrevious(pCur, pRes);
5412     }else{
5413       rc = SQLITE_OK;
5414     }
5415   }
5416   return rc;
5417 }
5418 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
5419   assert( cursorHoldsMutex(pCur) );
5420   assert( pRes!=0 );
5421   assert( *pRes==0 || *pRes==1 );
5422   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5423   *pRes = 0;
5424   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
5425   pCur->info.nSize = 0;
5426   if( pCur->eState!=CURSOR_VALID
5427    || pCur->aiIdx[pCur->iPage]==0
5428    || pCur->apPage[pCur->iPage]->leaf==0
5429   ){
5430     return btreePrevious(pCur, pRes);
5431   }
5432   pCur->aiIdx[pCur->iPage]--;
5433   return SQLITE_OK;
5434 }
5435 
5436 /*
5437 ** Allocate a new page from the database file.
5438 **
5439 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
5440 ** has already been called on the new page.)  The new page has also
5441 ** been referenced and the calling routine is responsible for calling
5442 ** sqlite3PagerUnref() on the new page when it is done.
5443 **
5444 ** SQLITE_OK is returned on success.  Any other return value indicates
5445 ** an error.  *ppPage is set to NULL in the event of an error.
5446 **
5447 ** If the "nearby" parameter is not 0, then an effort is made to
5448 ** locate a page close to the page number "nearby".  This can be used in an
5449 ** attempt to keep related pages close to each other in the database file,
5450 ** which in turn can make database access faster.
5451 **
5452 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
5453 ** anywhere on the free-list, then it is guaranteed to be returned.  If
5454 ** eMode is BTALLOC_LT then the page returned will be less than or equal
5455 ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
5456 ** are no restrictions on which page is returned.
5457 */
5458 static int allocateBtreePage(
5459   BtShared *pBt,         /* The btree */
5460   MemPage **ppPage,      /* Store pointer to the allocated page here */
5461   Pgno *pPgno,           /* Store the page number here */
5462   Pgno nearby,           /* Search for a page near this one */
5463   u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
5464 ){
5465   MemPage *pPage1;
5466   int rc;
5467   u32 n;     /* Number of pages on the freelist */
5468   u32 k;     /* Number of leaves on the trunk of the freelist */
5469   MemPage *pTrunk = 0;
5470   MemPage *pPrevTrunk = 0;
5471   Pgno mxPage;     /* Total size of the database file */
5472 
5473   assert( sqlite3_mutex_held(pBt->mutex) );
5474   assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
5475   pPage1 = pBt->pPage1;
5476   mxPage = btreePagecount(pBt);
5477   /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
5478   ** stores stores the total number of pages on the freelist. */
5479   n = get4byte(&pPage1->aData[36]);
5480   testcase( n==mxPage-1 );
5481   if( n>=mxPage ){
5482     return SQLITE_CORRUPT_BKPT;
5483   }
5484   if( n>0 ){
5485     /* There are pages on the freelist.  Reuse one of those pages. */
5486     Pgno iTrunk;
5487     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
5488     u32 nSearch = 0;   /* Count of the number of search attempts */
5489 
5490     /* If eMode==BTALLOC_EXACT and a query of the pointer-map
5491     ** shows that the page 'nearby' is somewhere on the free-list, then
5492     ** the entire-list will be searched for that page.
5493     */
5494 #ifndef SQLITE_OMIT_AUTOVACUUM
5495     if( eMode==BTALLOC_EXACT ){
5496       if( nearby<=mxPage ){
5497         u8 eType;
5498         assert( nearby>0 );
5499         assert( pBt->autoVacuum );
5500         rc = ptrmapGet(pBt, nearby, &eType, 0);
5501         if( rc ) return rc;
5502         if( eType==PTRMAP_FREEPAGE ){
5503           searchList = 1;
5504         }
5505       }
5506     }else if( eMode==BTALLOC_LE ){
5507       searchList = 1;
5508     }
5509 #endif
5510 
5511     /* Decrement the free-list count by 1. Set iTrunk to the index of the
5512     ** first free-list trunk page. iPrevTrunk is initially 1.
5513     */
5514     rc = sqlite3PagerWrite(pPage1->pDbPage);
5515     if( rc ) return rc;
5516     put4byte(&pPage1->aData[36], n-1);
5517 
5518     /* The code within this loop is run only once if the 'searchList' variable
5519     ** is not true. Otherwise, it runs once for each trunk-page on the
5520     ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
5521     ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
5522     */
5523     do {
5524       pPrevTrunk = pTrunk;
5525       if( pPrevTrunk ){
5526         /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
5527         ** is the page number of the next freelist trunk page in the list or
5528         ** zero if this is the last freelist trunk page. */
5529         iTrunk = get4byte(&pPrevTrunk->aData[0]);
5530       }else{
5531         /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
5532         ** stores the page number of the first page of the freelist, or zero if
5533         ** the freelist is empty. */
5534         iTrunk = get4byte(&pPage1->aData[32]);
5535       }
5536       testcase( iTrunk==mxPage );
5537       if( iTrunk>mxPage || nSearch++ > n ){
5538         rc = SQLITE_CORRUPT_BKPT;
5539       }else{
5540         rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
5541       }
5542       if( rc ){
5543         pTrunk = 0;
5544         goto end_allocate_page;
5545       }
5546       assert( pTrunk!=0 );
5547       assert( pTrunk->aData!=0 );
5548       /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
5549       ** is the number of leaf page pointers to follow. */
5550       k = get4byte(&pTrunk->aData[4]);
5551       if( k==0 && !searchList ){
5552         /* The trunk has no leaves and the list is not being searched.
5553         ** So extract the trunk page itself and use it as the newly
5554         ** allocated page */
5555         assert( pPrevTrunk==0 );
5556         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5557         if( rc ){
5558           goto end_allocate_page;
5559         }
5560         *pPgno = iTrunk;
5561         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5562         *ppPage = pTrunk;
5563         pTrunk = 0;
5564         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5565       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
5566         /* Value of k is out of range.  Database corruption */
5567         rc = SQLITE_CORRUPT_BKPT;
5568         goto end_allocate_page;
5569 #ifndef SQLITE_OMIT_AUTOVACUUM
5570       }else if( searchList
5571             && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
5572       ){
5573         /* The list is being searched and this trunk page is the page
5574         ** to allocate, regardless of whether it has leaves.
5575         */
5576         *pPgno = iTrunk;
5577         *ppPage = pTrunk;
5578         searchList = 0;
5579         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5580         if( rc ){
5581           goto end_allocate_page;
5582         }
5583         if( k==0 ){
5584           if( !pPrevTrunk ){
5585             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5586           }else{
5587             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5588             if( rc!=SQLITE_OK ){
5589               goto end_allocate_page;
5590             }
5591             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
5592           }
5593         }else{
5594           /* The trunk page is required by the caller but it contains
5595           ** pointers to free-list leaves. The first leaf becomes a trunk
5596           ** page in this case.
5597           */
5598           MemPage *pNewTrunk;
5599           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
5600           if( iNewTrunk>mxPage ){
5601             rc = SQLITE_CORRUPT_BKPT;
5602             goto end_allocate_page;
5603           }
5604           testcase( iNewTrunk==mxPage );
5605           rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
5606           if( rc!=SQLITE_OK ){
5607             goto end_allocate_page;
5608           }
5609           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
5610           if( rc!=SQLITE_OK ){
5611             releasePage(pNewTrunk);
5612             goto end_allocate_page;
5613           }
5614           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
5615           put4byte(&pNewTrunk->aData[4], k-1);
5616           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
5617           releasePage(pNewTrunk);
5618           if( !pPrevTrunk ){
5619             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
5620             put4byte(&pPage1->aData[32], iNewTrunk);
5621           }else{
5622             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5623             if( rc ){
5624               goto end_allocate_page;
5625             }
5626             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
5627           }
5628         }
5629         pTrunk = 0;
5630         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5631 #endif
5632       }else if( k>0 ){
5633         /* Extract a leaf from the trunk */
5634         u32 closest;
5635         Pgno iPage;
5636         unsigned char *aData = pTrunk->aData;
5637         if( nearby>0 ){
5638           u32 i;
5639           closest = 0;
5640           if( eMode==BTALLOC_LE ){
5641             for(i=0; i<k; i++){
5642               iPage = get4byte(&aData[8+i*4]);
5643               if( iPage<=nearby ){
5644                 closest = i;
5645                 break;
5646               }
5647             }
5648           }else{
5649             int dist;
5650             dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
5651             for(i=1; i<k; i++){
5652               int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
5653               if( d2<dist ){
5654                 closest = i;
5655                 dist = d2;
5656               }
5657             }
5658           }
5659         }else{
5660           closest = 0;
5661         }
5662 
5663         iPage = get4byte(&aData[8+closest*4]);
5664         testcase( iPage==mxPage );
5665         if( iPage>mxPage ){
5666           rc = SQLITE_CORRUPT_BKPT;
5667           goto end_allocate_page;
5668         }
5669         testcase( iPage==mxPage );
5670         if( !searchList
5671          || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
5672         ){
5673           int noContent;
5674           *pPgno = iPage;
5675           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
5676                  ": %d more free pages\n",
5677                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
5678           rc = sqlite3PagerWrite(pTrunk->pDbPage);
5679           if( rc ) goto end_allocate_page;
5680           if( closest<k-1 ){
5681             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
5682           }
5683           put4byte(&aData[4], k-1);
5684           noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
5685           rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
5686           if( rc==SQLITE_OK ){
5687             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5688             if( rc!=SQLITE_OK ){
5689               releasePage(*ppPage);
5690               *ppPage = 0;
5691             }
5692           }
5693           searchList = 0;
5694         }
5695       }
5696       releasePage(pPrevTrunk);
5697       pPrevTrunk = 0;
5698     }while( searchList );
5699   }else{
5700     /* There are no pages on the freelist, so append a new page to the
5701     ** database image.
5702     **
5703     ** Normally, new pages allocated by this block can be requested from the
5704     ** pager layer with the 'no-content' flag set. This prevents the pager
5705     ** from trying to read the pages content from disk. However, if the
5706     ** current transaction has already run one or more incremental-vacuum
5707     ** steps, then the page we are about to allocate may contain content
5708     ** that is required in the event of a rollback. In this case, do
5709     ** not set the no-content flag. This causes the pager to load and journal
5710     ** the current page content before overwriting it.
5711     **
5712     ** Note that the pager will not actually attempt to load or journal
5713     ** content for any page that really does lie past the end of the database
5714     ** file on disk. So the effects of disabling the no-content optimization
5715     ** here are confined to those pages that lie between the end of the
5716     ** database image and the end of the database file.
5717     */
5718     int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
5719 
5720     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
5721     if( rc ) return rc;
5722     pBt->nPage++;
5723     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
5724 
5725 #ifndef SQLITE_OMIT_AUTOVACUUM
5726     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
5727       /* If *pPgno refers to a pointer-map page, allocate two new pages
5728       ** at the end of the file instead of one. The first allocated page
5729       ** becomes a new pointer-map page, the second is used by the caller.
5730       */
5731       MemPage *pPg = 0;
5732       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
5733       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
5734       rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
5735       if( rc==SQLITE_OK ){
5736         rc = sqlite3PagerWrite(pPg->pDbPage);
5737         releasePage(pPg);
5738       }
5739       if( rc ) return rc;
5740       pBt->nPage++;
5741       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
5742     }
5743 #endif
5744     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
5745     *pPgno = pBt->nPage;
5746 
5747     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5748     rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
5749     if( rc ) return rc;
5750     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5751     if( rc!=SQLITE_OK ){
5752       releasePage(*ppPage);
5753       *ppPage = 0;
5754     }
5755     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
5756   }
5757 
5758   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5759 
5760 end_allocate_page:
5761   releasePage(pTrunk);
5762   releasePage(pPrevTrunk);
5763   assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
5764   assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
5765   return rc;
5766 }
5767 
5768 /*
5769 ** This function is used to add page iPage to the database file free-list.
5770 ** It is assumed that the page is not already a part of the free-list.
5771 **
5772 ** The value passed as the second argument to this function is optional.
5773 ** If the caller happens to have a pointer to the MemPage object
5774 ** corresponding to page iPage handy, it may pass it as the second value.
5775 ** Otherwise, it may pass NULL.
5776 **
5777 ** If a pointer to a MemPage object is passed as the second argument,
5778 ** its reference count is not altered by this function.
5779 */
5780 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
5781   MemPage *pTrunk = 0;                /* Free-list trunk page */
5782   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
5783   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
5784   MemPage *pPage;                     /* Page being freed. May be NULL. */
5785   int rc;                             /* Return Code */
5786   int nFree;                          /* Initial number of pages on free-list */
5787 
5788   assert( sqlite3_mutex_held(pBt->mutex) );
5789   assert( CORRUPT_DB || iPage>1 );
5790   assert( !pMemPage || pMemPage->pgno==iPage );
5791 
5792   if( iPage<2 ) return SQLITE_CORRUPT_BKPT;
5793   if( pMemPage ){
5794     pPage = pMemPage;
5795     sqlite3PagerRef(pPage->pDbPage);
5796   }else{
5797     pPage = btreePageLookup(pBt, iPage);
5798   }
5799 
5800   /* Increment the free page count on pPage1 */
5801   rc = sqlite3PagerWrite(pPage1->pDbPage);
5802   if( rc ) goto freepage_out;
5803   nFree = get4byte(&pPage1->aData[36]);
5804   put4byte(&pPage1->aData[36], nFree+1);
5805 
5806   if( pBt->btsFlags & BTS_SECURE_DELETE ){
5807     /* If the secure_delete option is enabled, then
5808     ** always fully overwrite deleted information with zeros.
5809     */
5810     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
5811      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
5812     ){
5813       goto freepage_out;
5814     }
5815     memset(pPage->aData, 0, pPage->pBt->pageSize);
5816   }
5817 
5818   /* If the database supports auto-vacuum, write an entry in the pointer-map
5819   ** to indicate that the page is free.
5820   */
5821   if( ISAUTOVACUUM ){
5822     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
5823     if( rc ) goto freepage_out;
5824   }
5825 
5826   /* Now manipulate the actual database free-list structure. There are two
5827   ** possibilities. If the free-list is currently empty, or if the first
5828   ** trunk page in the free-list is full, then this page will become a
5829   ** new free-list trunk page. Otherwise, it will become a leaf of the
5830   ** first trunk page in the current free-list. This block tests if it
5831   ** is possible to add the page as a new free-list leaf.
5832   */
5833   if( nFree!=0 ){
5834     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
5835 
5836     iTrunk = get4byte(&pPage1->aData[32]);
5837     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
5838     if( rc!=SQLITE_OK ){
5839       goto freepage_out;
5840     }
5841 
5842     nLeaf = get4byte(&pTrunk->aData[4]);
5843     assert( pBt->usableSize>32 );
5844     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
5845       rc = SQLITE_CORRUPT_BKPT;
5846       goto freepage_out;
5847     }
5848     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
5849       /* In this case there is room on the trunk page to insert the page
5850       ** being freed as a new leaf.
5851       **
5852       ** Note that the trunk page is not really full until it contains
5853       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
5854       ** coded.  But due to a coding error in versions of SQLite prior to
5855       ** 3.6.0, databases with freelist trunk pages holding more than
5856       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
5857       ** to maintain backwards compatibility with older versions of SQLite,
5858       ** we will continue to restrict the number of entries to usableSize/4 - 8
5859       ** for now.  At some point in the future (once everyone has upgraded
5860       ** to 3.6.0 or later) we should consider fixing the conditional above
5861       ** to read "usableSize/4-2" instead of "usableSize/4-8".
5862       **
5863       ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
5864       ** avoid using the last six entries in the freelist trunk page array in
5865       ** order that database files created by newer versions of SQLite can be
5866       ** read by older versions of SQLite.
5867       */
5868       rc = sqlite3PagerWrite(pTrunk->pDbPage);
5869       if( rc==SQLITE_OK ){
5870         put4byte(&pTrunk->aData[4], nLeaf+1);
5871         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
5872         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
5873           sqlite3PagerDontWrite(pPage->pDbPage);
5874         }
5875         rc = btreeSetHasContent(pBt, iPage);
5876       }
5877       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
5878       goto freepage_out;
5879     }
5880   }
5881 
5882   /* If control flows to this point, then it was not possible to add the
5883   ** the page being freed as a leaf page of the first trunk in the free-list.
5884   ** Possibly because the free-list is empty, or possibly because the
5885   ** first trunk in the free-list is full. Either way, the page being freed
5886   ** will become the new first trunk page in the free-list.
5887   */
5888   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
5889     goto freepage_out;
5890   }
5891   rc = sqlite3PagerWrite(pPage->pDbPage);
5892   if( rc!=SQLITE_OK ){
5893     goto freepage_out;
5894   }
5895   put4byte(pPage->aData, iTrunk);
5896   put4byte(&pPage->aData[4], 0);
5897   put4byte(&pPage1->aData[32], iPage);
5898   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
5899 
5900 freepage_out:
5901   if( pPage ){
5902     pPage->isInit = 0;
5903   }
5904   releasePage(pPage);
5905   releasePage(pTrunk);
5906   return rc;
5907 }
5908 static void freePage(MemPage *pPage, int *pRC){
5909   if( (*pRC)==SQLITE_OK ){
5910     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
5911   }
5912 }
5913 
5914 /*
5915 ** Free any overflow pages associated with the given Cell.  Write the
5916 ** local Cell size (the number of bytes on the original page, omitting
5917 ** overflow) into *pnSize.
5918 */
5919 static int clearCell(
5920   MemPage *pPage,          /* The page that contains the Cell */
5921   unsigned char *pCell,    /* First byte of the Cell */
5922   u16 *pnSize              /* Write the size of the Cell here */
5923 ){
5924   BtShared *pBt = pPage->pBt;
5925   CellInfo info;
5926   Pgno ovflPgno;
5927   int rc;
5928   int nOvfl;
5929   u32 ovflPageSize;
5930 
5931   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5932   pPage->xParseCell(pPage, pCell, &info);
5933   *pnSize = info.nSize;
5934   if( info.iOverflow==0 ){
5935     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
5936   }
5937   if( pCell+info.iOverflow+3 > pPage->aData+pPage->maskPage ){
5938     return SQLITE_CORRUPT_BKPT;  /* Cell extends past end of page */
5939   }
5940   ovflPgno = get4byte(&pCell[info.iOverflow]);
5941   assert( pBt->usableSize > 4 );
5942   ovflPageSize = pBt->usableSize - 4;
5943   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
5944   assert( nOvfl>0 ||
5945     (CORRUPT_DB && (info.nPayload + ovflPageSize)<ovflPageSize)
5946   );
5947   while( nOvfl-- ){
5948     Pgno iNext = 0;
5949     MemPage *pOvfl = 0;
5950     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
5951       /* 0 is not a legal page number and page 1 cannot be an
5952       ** overflow page. Therefore if ovflPgno<2 or past the end of the
5953       ** file the database must be corrupt. */
5954       return SQLITE_CORRUPT_BKPT;
5955     }
5956     if( nOvfl ){
5957       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
5958       if( rc ) return rc;
5959     }
5960 
5961     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
5962      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
5963     ){
5964       /* There is no reason any cursor should have an outstanding reference
5965       ** to an overflow page belonging to a cell that is being deleted/updated.
5966       ** So if there exists more than one reference to this page, then it
5967       ** must not really be an overflow page and the database must be corrupt.
5968       ** It is helpful to detect this before calling freePage2(), as
5969       ** freePage2() may zero the page contents if secure-delete mode is
5970       ** enabled. If this 'overflow' page happens to be a page that the
5971       ** caller is iterating through or using in some other way, this
5972       ** can be problematic.
5973       */
5974       rc = SQLITE_CORRUPT_BKPT;
5975     }else{
5976       rc = freePage2(pBt, pOvfl, ovflPgno);
5977     }
5978 
5979     if( pOvfl ){
5980       sqlite3PagerUnref(pOvfl->pDbPage);
5981     }
5982     if( rc ) return rc;
5983     ovflPgno = iNext;
5984   }
5985   return SQLITE_OK;
5986 }
5987 
5988 /*
5989 ** Create the byte sequence used to represent a cell on page pPage
5990 ** and write that byte sequence into pCell[].  Overflow pages are
5991 ** allocated and filled in as necessary.  The calling procedure
5992 ** is responsible for making sure sufficient space has been allocated
5993 ** for pCell[].
5994 **
5995 ** Note that pCell does not necessary need to point to the pPage->aData
5996 ** area.  pCell might point to some temporary storage.  The cell will
5997 ** be constructed in this temporary area then copied into pPage->aData
5998 ** later.
5999 */
6000 static int fillInCell(
6001   MemPage *pPage,                /* The page that contains the cell */
6002   unsigned char *pCell,          /* Complete text of the cell */
6003   const void *pKey, i64 nKey,    /* The key */
6004   const void *pData,int nData,   /* The data */
6005   int nZero,                     /* Extra zero bytes to append to pData */
6006   int *pnSize                    /* Write cell size here */
6007 ){
6008   int nPayload;
6009   const u8 *pSrc;
6010   int nSrc, n, rc;
6011   int spaceLeft;
6012   MemPage *pOvfl = 0;
6013   MemPage *pToRelease = 0;
6014   unsigned char *pPrior;
6015   unsigned char *pPayload;
6016   BtShared *pBt = pPage->pBt;
6017   Pgno pgnoOvfl = 0;
6018   int nHeader;
6019 
6020   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6021 
6022   /* pPage is not necessarily writeable since pCell might be auxiliary
6023   ** buffer space that is separate from the pPage buffer area */
6024   assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
6025             || sqlite3PagerIswriteable(pPage->pDbPage) );
6026 
6027   /* Fill in the header. */
6028   nHeader = pPage->childPtrSize;
6029   nPayload = nData + nZero;
6030   if( pPage->intKeyLeaf ){
6031     nHeader += putVarint32(&pCell[nHeader], nPayload);
6032   }else{
6033     assert( nData==0 );
6034     assert( nZero==0 );
6035   }
6036   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
6037 
6038   /* Fill in the payload size */
6039   if( pPage->intKey ){
6040     pSrc = pData;
6041     nSrc = nData;
6042     nData = 0;
6043   }else{
6044     assert( nKey<=0x7fffffff && pKey!=0 );
6045     nPayload = (int)nKey;
6046     pSrc = pKey;
6047     nSrc = (int)nKey;
6048   }
6049   if( nPayload<=pPage->maxLocal ){
6050     n = nHeader + nPayload;
6051     testcase( n==3 );
6052     testcase( n==4 );
6053     if( n<4 ) n = 4;
6054     *pnSize = n;
6055     spaceLeft = nPayload;
6056     pPrior = pCell;
6057   }else{
6058     int mn = pPage->minLocal;
6059     n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
6060     testcase( n==pPage->maxLocal );
6061     testcase( n==pPage->maxLocal+1 );
6062     if( n > pPage->maxLocal ) n = mn;
6063     spaceLeft = n;
6064     *pnSize = n + nHeader + 4;
6065     pPrior = &pCell[nHeader+n];
6066   }
6067   pPayload = &pCell[nHeader];
6068 
6069   /* At this point variables should be set as follows:
6070   **
6071   **   nPayload           Total payload size in bytes
6072   **   pPayload           Begin writing payload here
6073   **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
6074   **                      that means content must spill into overflow pages.
6075   **   *pnSize            Size of the local cell (not counting overflow pages)
6076   **   pPrior             Where to write the pgno of the first overflow page
6077   **
6078   ** Use a call to btreeParseCellPtr() to verify that the values above
6079   ** were computed correctly.
6080   */
6081 #if SQLITE_DEBUG
6082   {
6083     CellInfo info;
6084     pPage->xParseCell(pPage, pCell, &info);
6085     assert( nHeader=(int)(info.pPayload - pCell) );
6086     assert( info.nKey==nKey );
6087     assert( *pnSize == info.nSize );
6088     assert( spaceLeft == info.nLocal );
6089     assert( pPrior == &pCell[info.iOverflow] );
6090   }
6091 #endif
6092 
6093   /* Write the payload into the local Cell and any extra into overflow pages */
6094   while( nPayload>0 ){
6095     if( spaceLeft==0 ){
6096 #ifndef SQLITE_OMIT_AUTOVACUUM
6097       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
6098       if( pBt->autoVacuum ){
6099         do{
6100           pgnoOvfl++;
6101         } while(
6102           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
6103         );
6104       }
6105 #endif
6106       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
6107 #ifndef SQLITE_OMIT_AUTOVACUUM
6108       /* If the database supports auto-vacuum, and the second or subsequent
6109       ** overflow page is being allocated, add an entry to the pointer-map
6110       ** for that page now.
6111       **
6112       ** If this is the first overflow page, then write a partial entry
6113       ** to the pointer-map. If we write nothing to this pointer-map slot,
6114       ** then the optimistic overflow chain processing in clearCell()
6115       ** may misinterpret the uninitialized values and delete the
6116       ** wrong pages from the database.
6117       */
6118       if( pBt->autoVacuum && rc==SQLITE_OK ){
6119         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
6120         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
6121         if( rc ){
6122           releasePage(pOvfl);
6123         }
6124       }
6125 #endif
6126       if( rc ){
6127         releasePage(pToRelease);
6128         return rc;
6129       }
6130 
6131       /* If pToRelease is not zero than pPrior points into the data area
6132       ** of pToRelease.  Make sure pToRelease is still writeable. */
6133       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6134 
6135       /* If pPrior is part of the data area of pPage, then make sure pPage
6136       ** is still writeable */
6137       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
6138             || sqlite3PagerIswriteable(pPage->pDbPage) );
6139 
6140       put4byte(pPrior, pgnoOvfl);
6141       releasePage(pToRelease);
6142       pToRelease = pOvfl;
6143       pPrior = pOvfl->aData;
6144       put4byte(pPrior, 0);
6145       pPayload = &pOvfl->aData[4];
6146       spaceLeft = pBt->usableSize - 4;
6147     }
6148     n = nPayload;
6149     if( n>spaceLeft ) n = spaceLeft;
6150 
6151     /* If pToRelease is not zero than pPayload points into the data area
6152     ** of pToRelease.  Make sure pToRelease is still writeable. */
6153     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6154 
6155     /* If pPayload is part of the data area of pPage, then make sure pPage
6156     ** is still writeable */
6157     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
6158             || sqlite3PagerIswriteable(pPage->pDbPage) );
6159 
6160     if( nSrc>0 ){
6161       if( n>nSrc ) n = nSrc;
6162       assert( pSrc );
6163       memcpy(pPayload, pSrc, n);
6164     }else{
6165       memset(pPayload, 0, n);
6166     }
6167     nPayload -= n;
6168     pPayload += n;
6169     pSrc += n;
6170     nSrc -= n;
6171     spaceLeft -= n;
6172     if( nSrc==0 ){
6173       nSrc = nData;
6174       pSrc = pData;
6175     }
6176   }
6177   releasePage(pToRelease);
6178   return SQLITE_OK;
6179 }
6180 
6181 /*
6182 ** Remove the i-th cell from pPage.  This routine effects pPage only.
6183 ** The cell content is not freed or deallocated.  It is assumed that
6184 ** the cell content has been copied someplace else.  This routine just
6185 ** removes the reference to the cell from pPage.
6186 **
6187 ** "sz" must be the number of bytes in the cell.
6188 */
6189 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
6190   u32 pc;         /* Offset to cell content of cell being deleted */
6191   u8 *data;       /* pPage->aData */
6192   u8 *ptr;        /* Used to move bytes around within data[] */
6193   int rc;         /* The return code */
6194   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
6195 
6196   if( *pRC ) return;
6197 
6198   assert( idx>=0 && idx<pPage->nCell );
6199   assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
6200   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6201   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6202   data = pPage->aData;
6203   ptr = &pPage->aCellIdx[2*idx];
6204   pc = get2byte(ptr);
6205   hdr = pPage->hdrOffset;
6206   testcase( pc==get2byte(&data[hdr+5]) );
6207   testcase( pc+sz==pPage->pBt->usableSize );
6208   if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
6209     *pRC = SQLITE_CORRUPT_BKPT;
6210     return;
6211   }
6212   rc = freeSpace(pPage, pc, sz);
6213   if( rc ){
6214     *pRC = rc;
6215     return;
6216   }
6217   pPage->nCell--;
6218   if( pPage->nCell==0 ){
6219     memset(&data[hdr+1], 0, 4);
6220     data[hdr+7] = 0;
6221     put2byte(&data[hdr+5], pPage->pBt->usableSize);
6222     pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
6223                        - pPage->childPtrSize - 8;
6224   }else{
6225     memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
6226     put2byte(&data[hdr+3], pPage->nCell);
6227     pPage->nFree += 2;
6228   }
6229 }
6230 
6231 /*
6232 ** Insert a new cell on pPage at cell index "i".  pCell points to the
6233 ** content of the cell.
6234 **
6235 ** If the cell content will fit on the page, then put it there.  If it
6236 ** will not fit, then make a copy of the cell content into pTemp if
6237 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
6238 ** in pPage->apOvfl[] and make it point to the cell content (either
6239 ** in pTemp or the original pCell) and also record its index.
6240 ** Allocating a new entry in pPage->aCell[] implies that
6241 ** pPage->nOverflow is incremented.
6242 */
6243 static void insertCell(
6244   MemPage *pPage,   /* Page into which we are copying */
6245   int i,            /* New cell becomes the i-th cell of the page */
6246   u8 *pCell,        /* Content of the new cell */
6247   int sz,           /* Bytes of content in pCell */
6248   u8 *pTemp,        /* Temp storage space for pCell, if needed */
6249   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
6250   int *pRC          /* Read and write return code from here */
6251 ){
6252   int idx = 0;      /* Where to write new cell content in data[] */
6253   int j;            /* Loop counter */
6254   u8 *data;         /* The content of the whole page */
6255   u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
6256 
6257   if( *pRC ) return;
6258 
6259   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
6260   assert( MX_CELL(pPage->pBt)<=10921 );
6261   assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
6262   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
6263   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
6264   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6265   /* The cell should normally be sized correctly.  However, when moving a
6266   ** malformed cell from a leaf page to an interior page, if the cell size
6267   ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
6268   ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
6269   ** the term after the || in the following assert(). */
6270   assert( sz==pPage->xCellSize(pPage, pCell) || (sz==8 && iChild>0) );
6271   if( pPage->nOverflow || sz+2>pPage->nFree ){
6272     if( pTemp ){
6273       memcpy(pTemp, pCell, sz);
6274       pCell = pTemp;
6275     }
6276     if( iChild ){
6277       put4byte(pCell, iChild);
6278     }
6279     j = pPage->nOverflow++;
6280     assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) );
6281     pPage->apOvfl[j] = pCell;
6282     pPage->aiOvfl[j] = (u16)i;
6283 
6284     /* When multiple overflows occur, they are always sequential and in
6285     ** sorted order.  This invariants arise because multiple overflows can
6286     ** only occur when inserting divider cells into the parent page during
6287     ** balancing, and the dividers are adjacent and sorted.
6288     */
6289     assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
6290     assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
6291   }else{
6292     int rc = sqlite3PagerWrite(pPage->pDbPage);
6293     if( rc!=SQLITE_OK ){
6294       *pRC = rc;
6295       return;
6296     }
6297     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6298     data = pPage->aData;
6299     assert( &data[pPage->cellOffset]==pPage->aCellIdx );
6300     rc = allocateSpace(pPage, sz, &idx);
6301     if( rc ){ *pRC = rc; return; }
6302     /* The allocateSpace() routine guarantees the following properties
6303     ** if it returns successfully */
6304     assert( idx >= 0 );
6305     assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
6306     assert( idx+sz <= (int)pPage->pBt->usableSize );
6307     pPage->nFree -= (u16)(2 + sz);
6308     memcpy(&data[idx], pCell, sz);
6309     if( iChild ){
6310       put4byte(&data[idx], iChild);
6311     }
6312     pIns = pPage->aCellIdx + i*2;
6313     memmove(pIns+2, pIns, 2*(pPage->nCell - i));
6314     put2byte(pIns, idx);
6315     pPage->nCell++;
6316     /* increment the cell count */
6317     if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
6318     assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell );
6319 #ifndef SQLITE_OMIT_AUTOVACUUM
6320     if( pPage->pBt->autoVacuum ){
6321       /* The cell may contain a pointer to an overflow page. If so, write
6322       ** the entry for the overflow page into the pointer map.
6323       */
6324       ptrmapPutOvflPtr(pPage, pCell, pRC);
6325     }
6326 #endif
6327   }
6328 }
6329 
6330 /*
6331 ** A CellArray object contains a cache of pointers and sizes for a
6332 ** consecutive sequence of cells that might be held multiple pages.
6333 */
6334 typedef struct CellArray CellArray;
6335 struct CellArray {
6336   int nCell;              /* Number of cells in apCell[] */
6337   MemPage *pRef;          /* Reference page */
6338   u8 **apCell;            /* All cells begin balanced */
6339   u16 *szCell;            /* Local size of all cells in apCell[] */
6340 };
6341 
6342 /*
6343 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
6344 ** computed.
6345 */
6346 static void populateCellCache(CellArray *p, int idx, int N){
6347   assert( idx>=0 && idx+N<=p->nCell );
6348   while( N>0 ){
6349     assert( p->apCell[idx]!=0 );
6350     if( p->szCell[idx]==0 ){
6351       p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
6352     }else{
6353       assert( CORRUPT_DB ||
6354               p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
6355     }
6356     idx++;
6357     N--;
6358   }
6359 }
6360 
6361 /*
6362 ** Return the size of the Nth element of the cell array
6363 */
6364 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
6365   assert( N>=0 && N<p->nCell );
6366   assert( p->szCell[N]==0 );
6367   p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
6368   return p->szCell[N];
6369 }
6370 static u16 cachedCellSize(CellArray *p, int N){
6371   assert( N>=0 && N<p->nCell );
6372   if( p->szCell[N] ) return p->szCell[N];
6373   return computeCellSize(p, N);
6374 }
6375 
6376 /*
6377 ** Array apCell[] contains pointers to nCell b-tree page cells. The
6378 ** szCell[] array contains the size in bytes of each cell. This function
6379 ** replaces the current contents of page pPg with the contents of the cell
6380 ** array.
6381 **
6382 ** Some of the cells in apCell[] may currently be stored in pPg. This
6383 ** function works around problems caused by this by making a copy of any
6384 ** such cells before overwriting the page data.
6385 **
6386 ** The MemPage.nFree field is invalidated by this function. It is the
6387 ** responsibility of the caller to set it correctly.
6388 */
6389 static int rebuildPage(
6390   MemPage *pPg,                   /* Edit this page */
6391   int nCell,                      /* Final number of cells on page */
6392   u8 **apCell,                    /* Array of cells */
6393   u16 *szCell                     /* Array of cell sizes */
6394 ){
6395   const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
6396   u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
6397   const int usableSize = pPg->pBt->usableSize;
6398   u8 * const pEnd = &aData[usableSize];
6399   int i;
6400   u8 *pCellptr = pPg->aCellIdx;
6401   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6402   u8 *pData;
6403 
6404   i = get2byte(&aData[hdr+5]);
6405   memcpy(&pTmp[i], &aData[i], usableSize - i);
6406 
6407   pData = pEnd;
6408   for(i=0; i<nCell; i++){
6409     u8 *pCell = apCell[i];
6410     if( pCell>aData && pCell<pEnd ){
6411       pCell = &pTmp[pCell - aData];
6412     }
6413     pData -= szCell[i];
6414     put2byte(pCellptr, (pData - aData));
6415     pCellptr += 2;
6416     if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
6417     memcpy(pData, pCell, szCell[i]);
6418     assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
6419     testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) );
6420   }
6421 
6422   /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
6423   pPg->nCell = nCell;
6424   pPg->nOverflow = 0;
6425 
6426   put2byte(&aData[hdr+1], 0);
6427   put2byte(&aData[hdr+3], pPg->nCell);
6428   put2byte(&aData[hdr+5], pData - aData);
6429   aData[hdr+7] = 0x00;
6430   return SQLITE_OK;
6431 }
6432 
6433 /*
6434 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
6435 ** contains the size in bytes of each such cell. This function attempts to
6436 ** add the cells stored in the array to page pPg. If it cannot (because
6437 ** the page needs to be defragmented before the cells will fit), non-zero
6438 ** is returned. Otherwise, if the cells are added successfully, zero is
6439 ** returned.
6440 **
6441 ** Argument pCellptr points to the first entry in the cell-pointer array
6442 ** (part of page pPg) to populate. After cell apCell[0] is written to the
6443 ** page body, a 16-bit offset is written to pCellptr. And so on, for each
6444 ** cell in the array. It is the responsibility of the caller to ensure
6445 ** that it is safe to overwrite this part of the cell-pointer array.
6446 **
6447 ** When this function is called, *ppData points to the start of the
6448 ** content area on page pPg. If the size of the content area is extended,
6449 ** *ppData is updated to point to the new start of the content area
6450 ** before returning.
6451 **
6452 ** Finally, argument pBegin points to the byte immediately following the
6453 ** end of the space required by this page for the cell-pointer area (for
6454 ** all cells - not just those inserted by the current call). If the content
6455 ** area must be extended to before this point in order to accomodate all
6456 ** cells in apCell[], then the cells do not fit and non-zero is returned.
6457 */
6458 static int pageInsertArray(
6459   MemPage *pPg,                   /* Page to add cells to */
6460   u8 *pBegin,                     /* End of cell-pointer array */
6461   u8 **ppData,                    /* IN/OUT: Page content -area pointer */
6462   u8 *pCellptr,                   /* Pointer to cell-pointer area */
6463   int iFirst,                     /* Index of first cell to add */
6464   int nCell,                      /* Number of cells to add to pPg */
6465   CellArray *pCArray              /* Array of cells */
6466 ){
6467   int i;
6468   u8 *aData = pPg->aData;
6469   u8 *pData = *ppData;
6470   int iEnd = iFirst + nCell;
6471   assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
6472   for(i=iFirst; i<iEnd; i++){
6473     int sz, rc;
6474     u8 *pSlot;
6475     sz = cachedCellSize(pCArray, i);
6476     if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
6477       pData -= sz;
6478       if( pData<pBegin ) return 1;
6479       pSlot = pData;
6480     }
6481     memcpy(pSlot, pCArray->apCell[i], sz);
6482     put2byte(pCellptr, (pSlot - aData));
6483     pCellptr += 2;
6484   }
6485   *ppData = pData;
6486   return 0;
6487 }
6488 
6489 /*
6490 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
6491 ** contains the size in bytes of each such cell. This function adds the
6492 ** space associated with each cell in the array that is currently stored
6493 ** within the body of pPg to the pPg free-list. The cell-pointers and other
6494 ** fields of the page are not updated.
6495 **
6496 ** This function returns the total number of cells added to the free-list.
6497 */
6498 static int pageFreeArray(
6499   MemPage *pPg,                   /* Page to edit */
6500   int iFirst,                     /* First cell to delete */
6501   int nCell,                      /* Cells to delete */
6502   CellArray *pCArray              /* Array of cells */
6503 ){
6504   u8 * const aData = pPg->aData;
6505   u8 * const pEnd = &aData[pPg->pBt->usableSize];
6506   u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
6507   int nRet = 0;
6508   int i;
6509   int iEnd = iFirst + nCell;
6510   u8 *pFree = 0;
6511   int szFree = 0;
6512 
6513   for(i=iFirst; i<iEnd; i++){
6514     u8 *pCell = pCArray->apCell[i];
6515     if( pCell>=pStart && pCell<pEnd ){
6516       int sz;
6517       /* No need to use cachedCellSize() here.  The sizes of all cells that
6518       ** are to be freed have already been computing while deciding which
6519       ** cells need freeing */
6520       sz = pCArray->szCell[i];  assert( sz>0 );
6521       if( pFree!=(pCell + sz) ){
6522         if( pFree ){
6523           assert( pFree>aData && (pFree - aData)<65536 );
6524           freeSpace(pPg, (u16)(pFree - aData), szFree);
6525         }
6526         pFree = pCell;
6527         szFree = sz;
6528         if( pFree+sz>pEnd ) return 0;
6529       }else{
6530         pFree = pCell;
6531         szFree += sz;
6532       }
6533       nRet++;
6534     }
6535   }
6536   if( pFree ){
6537     assert( pFree>aData && (pFree - aData)<65536 );
6538     freeSpace(pPg, (u16)(pFree - aData), szFree);
6539   }
6540   return nRet;
6541 }
6542 
6543 /*
6544 ** apCell[] and szCell[] contains pointers to and sizes of all cells in the
6545 ** pages being balanced.  The current page, pPg, has pPg->nCell cells starting
6546 ** with apCell[iOld].  After balancing, this page should hold nNew cells
6547 ** starting at apCell[iNew].
6548 **
6549 ** This routine makes the necessary adjustments to pPg so that it contains
6550 ** the correct cells after being balanced.
6551 **
6552 ** The pPg->nFree field is invalid when this function returns. It is the
6553 ** responsibility of the caller to set it correctly.
6554 */
6555 static int editPage(
6556   MemPage *pPg,                   /* Edit this page */
6557   int iOld,                       /* Index of first cell currently on page */
6558   int iNew,                       /* Index of new first cell on page */
6559   int nNew,                       /* Final number of cells on page */
6560   CellArray *pCArray              /* Array of cells and sizes */
6561 ){
6562   u8 * const aData = pPg->aData;
6563   const int hdr = pPg->hdrOffset;
6564   u8 *pBegin = &pPg->aCellIdx[nNew * 2];
6565   int nCell = pPg->nCell;       /* Cells stored on pPg */
6566   u8 *pData;
6567   u8 *pCellptr;
6568   int i;
6569   int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
6570   int iNewEnd = iNew + nNew;
6571 
6572 #ifdef SQLITE_DEBUG
6573   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6574   memcpy(pTmp, aData, pPg->pBt->usableSize);
6575 #endif
6576 
6577   /* Remove cells from the start and end of the page */
6578   if( iOld<iNew ){
6579     int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
6580     memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
6581     nCell -= nShift;
6582   }
6583   if( iNewEnd < iOldEnd ){
6584     nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
6585   }
6586 
6587   pData = &aData[get2byteNotZero(&aData[hdr+5])];
6588   if( pData<pBegin ) goto editpage_fail;
6589 
6590   /* Add cells to the start of the page */
6591   if( iNew<iOld ){
6592     int nAdd = MIN(nNew,iOld-iNew);
6593     assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
6594     pCellptr = pPg->aCellIdx;
6595     memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
6596     if( pageInsertArray(
6597           pPg, pBegin, &pData, pCellptr,
6598           iNew, nAdd, pCArray
6599     ) ) goto editpage_fail;
6600     nCell += nAdd;
6601   }
6602 
6603   /* Add any overflow cells */
6604   for(i=0; i<pPg->nOverflow; i++){
6605     int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
6606     if( iCell>=0 && iCell<nNew ){
6607       pCellptr = &pPg->aCellIdx[iCell * 2];
6608       memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
6609       nCell++;
6610       if( pageInsertArray(
6611             pPg, pBegin, &pData, pCellptr,
6612             iCell+iNew, 1, pCArray
6613       ) ) goto editpage_fail;
6614     }
6615   }
6616 
6617   /* Append cells to the end of the page */
6618   pCellptr = &pPg->aCellIdx[nCell*2];
6619   if( pageInsertArray(
6620         pPg, pBegin, &pData, pCellptr,
6621         iNew+nCell, nNew-nCell, pCArray
6622   ) ) goto editpage_fail;
6623 
6624   pPg->nCell = nNew;
6625   pPg->nOverflow = 0;
6626 
6627   put2byte(&aData[hdr+3], pPg->nCell);
6628   put2byte(&aData[hdr+5], pData - aData);
6629 
6630 #ifdef SQLITE_DEBUG
6631   for(i=0; i<nNew && !CORRUPT_DB; i++){
6632     u8 *pCell = pCArray->apCell[i+iNew];
6633     int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
6634     if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){
6635       pCell = &pTmp[pCell - aData];
6636     }
6637     assert( 0==memcmp(pCell, &aData[iOff],
6638             pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
6639   }
6640 #endif
6641 
6642   return SQLITE_OK;
6643  editpage_fail:
6644   /* Unable to edit this page. Rebuild it from scratch instead. */
6645   populateCellCache(pCArray, iNew, nNew);
6646   return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]);
6647 }
6648 
/*
** Tuning parameters that set how many adjacent pages take part in one
** balancing operation:
**
**    NN   The number of neighbors on either side of the page being
**         balanced that participate.
**    NB   The total number of participating pages: the target page
**         itself plus NN neighbors on each side.
**
** NN can be no smaller than 1.  Raising NN to 2 or 3 gives a modest
** improvement in SELECT and DELETE performance, but at the price of a
** larger degradation in INSERT and UPDATE performance.  The value of NN
** used here (1) appears to give the best results overall.
*/
#define NN 1             /* Neighbors on each side of the balanced page */
#define NB ((2*NN)+1)    /* Target page plus NN neighbors on both sides */
6663 
6664 
6665 #ifndef SQLITE_OMIT_QUICKBALANCE
6666 /*
6667 ** This version of balance() handles the common special case where
6668 ** a new entry is being inserted on the extreme right-end of the
6669 ** tree, in other words, when the new entry will become the largest
6670 ** entry in the tree.
6671 **
6672 ** Instead of trying to balance the 3 right-most leaf pages, just add
6673 ** a new page to the right-hand side and put the one new entry in
6674 ** that page.  This leaves the right side of the tree somewhat
6675 ** unbalanced.  But odds are that we will be inserting new entries
6676 ** at the end soon afterwards so the nearly empty page will quickly
6677 ** fill up.  On average.
6678 **
6679 ** pPage is the leaf page which is the right-most page in the tree.
6680 ** pParent is its parent.  pPage must have a single overflow entry
6681 ** which is also the right-most entry on the page.
6682 **
6683 ** The pSpace buffer is used to store a temporary copy of the divider
6684 ** cell that will be inserted into pParent. Such a cell consists of a 4
6685 ** byte page number followed by a variable length integer. In other
6686 ** words, at most 13 bytes. Hence the pSpace buffer must be at
6687 ** least 13 bytes in size.
6688 */
6689 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
6690   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
6691   MemPage *pNew;                       /* Newly allocated page */
6692   int rc;                              /* Return Code */
6693   Pgno pgnoNew;                        /* Page number of pNew */
6694 
6695   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6696   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6697   assert( pPage->nOverflow==1 );
6698 
6699   /* This error condition is now caught prior to reaching this function */
6700   if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;
6701 
6702   /* Allocate a new page. This page will become the right-sibling of
6703   ** pPage. Make the parent page writable, so that the new divider cell
6704   ** may be inserted. If both these operations are successful, proceed.
6705   */
6706   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
6707 
6708   if( rc==SQLITE_OK ){
6709 
6710     u8 *pOut = &pSpace[4];
6711     u8 *pCell = pPage->apOvfl[0];
6712     u16 szCell = pPage->xCellSize(pPage, pCell);
6713     u8 *pStop;
6714 
6715     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
6716     assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
6717     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
6718     rc = rebuildPage(pNew, 1, &pCell, &szCell);
6719     if( NEVER(rc) ) return rc;
6720     pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
6721 
6722     /* If this is an auto-vacuum database, update the pointer map
6723     ** with entries for the new page, and any pointer from the
6724     ** cell on the page to an overflow page. If either of these
6725     ** operations fails, the return code is set, but the contents
6726     ** of the parent page are still manipulated by thh code below.
6727     ** That is Ok, at this point the parent page is guaranteed to
6728     ** be marked as dirty. Returning an error code will cause a
6729     ** rollback, undoing any changes made to the parent page.
6730     */
6731     if( ISAUTOVACUUM ){
6732       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
6733       if( szCell>pNew->minLocal ){
6734         ptrmapPutOvflPtr(pNew, pCell, &rc);
6735       }
6736     }
6737 
6738     /* Create a divider cell to insert into pParent. The divider cell
6739     ** consists of a 4-byte page number (the page number of pPage) and
6740     ** a variable length key value (which must be the same value as the
6741     ** largest key on pPage).
6742     **
6743     ** To find the largest key value on pPage, first find the right-most
6744     ** cell on pPage. The first two fields of this cell are the
6745     ** record-length (a variable length integer at most 32-bits in size)
6746     ** and the key value (a variable length integer, may have any value).
6747     ** The first of the while(...) loops below skips over the record-length
6748     ** field. The second while(...) loop copies the key value from the
6749     ** cell on pPage into the pSpace buffer.
6750     */
6751     pCell = findCell(pPage, pPage->nCell-1);
6752     pStop = &pCell[9];
6753     while( (*(pCell++)&0x80) && pCell<pStop );
6754     pStop = &pCell[9];
6755     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
6756 
6757     /* Insert the new divider cell into pParent. */
6758     insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
6759                0, pPage->pgno, &rc);
6760 
6761     /* Set the right-child pointer of pParent to point to the new page. */
6762     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
6763 
6764     /* Release the reference to the new page. */
6765     releasePage(pNew);
6766   }
6767 
6768   return rc;
6769 }
6770 #endif /* SQLITE_OMIT_QUICKBALANCE */
6771 
6772 #if 0
6773 /*
6774 ** This function does not contribute anything to the operation of SQLite.
6775 ** it is sometimes activated temporarily while debugging code responsible
6776 ** for setting pointer-map entries.
6777 */
6778 static int ptrmapCheckPages(MemPage **apPage, int nPage){
6779   int i, j;
6780   for(i=0; i<nPage; i++){
6781     Pgno n;
6782     u8 e;
6783     MemPage *pPage = apPage[i];
6784     BtShared *pBt = pPage->pBt;
6785     assert( pPage->isInit );
6786 
6787     for(j=0; j<pPage->nCell; j++){
6788       CellInfo info;
6789       u8 *z;
6790 
6791       z = findCell(pPage, j);
6792       pPage->xParseCell(pPage, z, &info);
6793       if( info.iOverflow ){
6794         Pgno ovfl = get4byte(&z[info.iOverflow]);
6795         ptrmapGet(pBt, ovfl, &e, &n);
6796         assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
6797       }
6798       if( !pPage->leaf ){
6799         Pgno child = get4byte(z);
6800         ptrmapGet(pBt, child, &e, &n);
6801         assert( n==pPage->pgno && e==PTRMAP_BTREE );
6802       }
6803     }
6804     if( !pPage->leaf ){
6805       Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
6806       ptrmapGet(pBt, child, &e, &n);
6807       assert( n==pPage->pgno && e==PTRMAP_BTREE );
6808     }
6809   }
6810   return 1;
6811 }
6812 #endif
6813 
6814 /*
6815 ** This function is used to copy the contents of the b-tree node stored
6816 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
6817 ** the pointer-map entries for each child page are updated so that the
6818 ** parent page stored in the pointer map is page pTo. If pFrom contained
6819 ** any cells with overflow page pointers, then the corresponding pointer
6820 ** map entries are also updated so that the parent page is page pTo.
6821 **
6822 ** If pFrom is currently carrying any overflow cells (entries in the
6823 ** MemPage.apOvfl[] array), they are not copied to pTo.
6824 **
6825 ** Before returning, page pTo is reinitialized using btreeInitPage().
6826 **
6827 ** The performance of this function is not critical. It is only used by
6828 ** the balance_shallower() and balance_deeper() procedures, neither of
6829 ** which are called often under normal circumstances.
6830 */
6831 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
6832   if( (*pRC)==SQLITE_OK ){
6833     BtShared * const pBt = pFrom->pBt;
6834     u8 * const aFrom = pFrom->aData;
6835     u8 * const aTo = pTo->aData;
6836     int const iFromHdr = pFrom->hdrOffset;
6837     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
6838     int rc;
6839     int iData;
6840 
6841 
6842     assert( pFrom->isInit );
6843     assert( pFrom->nFree>=iToHdr );
6844     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
6845 
6846     /* Copy the b-tree node content from page pFrom to page pTo. */
6847     iData = get2byte(&aFrom[iFromHdr+5]);
6848     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
6849     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
6850 
6851     /* Reinitialize page pTo so that the contents of the MemPage structure
6852     ** match the new data. The initialization of pTo can actually fail under
6853     ** fairly obscure circumstances, even though it is a copy of initialized
6854     ** page pFrom.
6855     */
6856     pTo->isInit = 0;
6857     rc = btreeInitPage(pTo);
6858     if( rc!=SQLITE_OK ){
6859       *pRC = rc;
6860       return;
6861     }
6862 
6863     /* If this is an auto-vacuum database, update the pointer-map entries
6864     ** for any b-tree or overflow pages that pTo now contains the pointers to.
6865     */
6866     if( ISAUTOVACUUM ){
6867       *pRC = setChildPtrmaps(pTo);
6868     }
6869   }
6870 }
6871 
6872 /*
6873 ** This routine redistributes cells on the iParentIdx'th child of pParent
6874 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
6875 ** same amount of free space. Usually a single sibling on either side of the
6876 ** page is used in the balancing, though both siblings might come from one
6877 ** side if the page is the first or last child of its parent. If the page
6878 ** has fewer than 2 siblings (something which can only happen if the page
6879 ** is a root page or a child of a root page) then all available siblings
6880 ** participate in the balancing.
6881 **
6882 ** The number of siblings of the page might be increased or decreased by
6883 ** one or two in an effort to keep pages nearly full but not over full.
6884 **
6885 ** Note that when this routine is called, some of the cells on the page
6886 ** might not actually be stored in MemPage.aData[]. This can happen
6887 ** if the page is overfull. This routine ensures that all cells allocated
6888 ** to the page and its siblings fit into MemPage.aData[] before returning.
6889 **
6890 ** In the course of balancing the page and its siblings, cells may be
6891 ** inserted into or removed from the parent page (pParent). Doing so
6892 ** may cause the parent page to become overfull or underfull. If this
6893 ** happens, it is the responsibility of the caller to invoke the correct
6894 ** balancing routine to fix this problem (see the balance() routine).
6895 **
6896 ** If this routine fails for any reason, it might leave the database
6897 ** in a corrupted state. So if this routine fails, the database should
6898 ** be rolled back.
6899 **
6900 ** The third argument to this function, aOvflSpace, is a pointer to a
6901 ** buffer big enough to hold one page. If while inserting cells into the parent
6902 ** page (pParent) the parent page becomes overfull, this buffer is
6903 ** used to store the parent's overflow cells. Because this function inserts
6904 ** a maximum of four divider cells into the parent page, and the maximum
6905 ** size of a cell stored within an internal node is always less than 1/4
6906 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
6907 ** enough for all overflow cells.
6908 **
6909 ** If aOvflSpace is set to a null pointer, this function returns
6910 ** SQLITE_NOMEM.
6911 */
6912 #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
6913 #pragma optimize("", off)
6914 #endif
6915 static int balance_nonroot(
6916   MemPage *pParent,               /* Parent page of siblings being balanced */
6917   int iParentIdx,                 /* Index of "the page" in pParent */
6918   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
6919   int isRoot,                     /* True if pParent is a root-page */
6920   int bBulk                       /* True if this call is part of a bulk load */
6921 ){
6922   BtShared *pBt;               /* The whole database */
6923   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
6924   int nNew = 0;                /* Number of pages in apNew[] */
6925   int nOld;                    /* Number of pages in apOld[] */
6926   int i, j, k;                 /* Loop counters */
6927   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
6928   int rc = SQLITE_OK;          /* The return code */
6929   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
6930   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
6931   int usableSpace;             /* Bytes in pPage beyond the header */
6932   int pageFlags;               /* Value of pPage->aData[0] */
6933   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
6934   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
6935   int szScratch;               /* Size of scratch memory requested */
6936   MemPage *apOld[NB];          /* pPage and up to two siblings */
6937   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
6938   u8 *pRight;                  /* Location in parent of right-sibling pointer */
6939   u8 *apDiv[NB-1];             /* Divider cells in pParent */
6940   int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
6941   int cntOld[NB+2];            /* Old index in b.apCell[] */
6942   int szNew[NB+2];             /* Combined size of cells placed on i-th page */
6943   u8 *aSpace1;                 /* Space for copies of dividers cells */
6944   Pgno pgno;                   /* Temp var to store a page number in */
6945   u8 abDone[NB+2];             /* True after i'th new page is populated */
6946   Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
6947   Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
6948   u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
6949   CellArray b;                  /* Parsed information on cells being balanced */
6950 
6951   memset(abDone, 0, sizeof(abDone));
6952   b.nCell = 0;
6953   b.apCell = 0;
6954   pBt = pParent->pBt;
6955   assert( sqlite3_mutex_held(pBt->mutex) );
6956   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6957 
6958 #if 0
6959   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
6960 #endif
6961 
6962   /* At this point pParent may have at most one overflow cell. And if
6963   ** this overflow cell is present, it must be the cell with
6964   ** index iParentIdx. This scenario comes about when this function
6965   ** is called (indirectly) from sqlite3BtreeDelete().
6966   */
6967   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
6968   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
6969 
6970   if( !aOvflSpace ){
6971     return SQLITE_NOMEM;
6972   }
6973 
6974   /* Find the sibling pages to balance. Also locate the cells in pParent
6975   ** that divide the siblings. An attempt is made to find NN siblings on
6976   ** either side of pPage. More siblings are taken from one side, however,
6977   ** if there are fewer than NN siblings on the other side. If pParent
6978   ** has NB or fewer children then all children of pParent are taken.
6979   **
6980   ** This loop also drops the divider cells from the parent page. This
6981   ** way, the remainder of the function does not have to deal with any
6982   ** overflow cells in the parent page, since if any existed they will
6983   ** have already been removed.
6984   */
6985   i = pParent->nOverflow + pParent->nCell;
6986   if( i<2 ){
6987     nxDiv = 0;
6988   }else{
6989     assert( bBulk==0 || bBulk==1 );
6990     if( iParentIdx==0 ){
6991       nxDiv = 0;
6992     }else if( iParentIdx==i ){
6993       nxDiv = i-2+bBulk;
6994     }else{
6995       nxDiv = iParentIdx-1;
6996     }
6997     i = 2-bBulk;
6998   }
6999   nOld = i+1;
7000   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
7001     pRight = &pParent->aData[pParent->hdrOffset+8];
7002   }else{
7003     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
7004   }
7005   pgno = get4byte(pRight);
7006   while( 1 ){
7007     rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
7008     if( rc ){
7009       memset(apOld, 0, (i+1)*sizeof(MemPage*));
7010       goto balance_cleanup;
7011     }
7012     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
7013     if( (i--)==0 ) break;
7014 
7015     if( i+nxDiv==pParent->aiOvfl[0] && pParent->nOverflow ){
7016       apDiv[i] = pParent->apOvfl[0];
7017       pgno = get4byte(apDiv[i]);
7018       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7019       pParent->nOverflow = 0;
7020     }else{
7021       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
7022       pgno = get4byte(apDiv[i]);
7023       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7024 
7025       /* Drop the cell from the parent page. apDiv[i] still points to
7026       ** the cell within the parent, even though it has been dropped.
7027       ** This is safe because dropping a cell only overwrites the first
7028       ** four bytes of it, and this function does not need the first
7029       ** four bytes of the divider cell. So the pointer is safe to use
7030       ** later on.
7031       **
7032       ** But not if we are in secure-delete mode. In secure-delete mode,
7033       ** the dropCell() routine will overwrite the entire cell with zeroes.
7034       ** In this case, temporarily copy the cell into the aOvflSpace[]
7035       ** buffer. It will be copied out again as soon as the aSpace[] buffer
7036       ** is allocated.  */
7037       if( pBt->btsFlags & BTS_SECURE_DELETE ){
7038         int iOff;
7039 
7040         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
7041         if( (iOff+szNew[i])>(int)pBt->usableSize ){
7042           rc = SQLITE_CORRUPT_BKPT;
7043           memset(apOld, 0, (i+1)*sizeof(MemPage*));
7044           goto balance_cleanup;
7045         }else{
7046           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
7047           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
7048         }
7049       }
7050       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
7051     }
7052   }
7053 
7054   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
7055   ** alignment */
7056   nMaxCells = (nMaxCells + 3)&~3;
7057 
7058   /*
7059   ** Allocate space for memory structures
7060   */
7061   szScratch =
7062        nMaxCells*sizeof(u8*)                       /* b.apCell */
7063      + nMaxCells*sizeof(u16)                       /* b.szCell */
7064      + pBt->pageSize;                              /* aSpace1 */
7065 
7066   /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer
7067   ** that is more than 6 times the database page size. */
7068   assert( szScratch<=6*(int)pBt->pageSize );
7069   b.apCell = sqlite3ScratchMalloc( szScratch );
7070   if( b.apCell==0 ){
7071     rc = SQLITE_NOMEM;
7072     goto balance_cleanup;
7073   }
7074   b.szCell = (u16*)&b.apCell[nMaxCells];
7075   aSpace1 = (u8*)&b.szCell[nMaxCells];
7076   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
7077 
7078   /*
7079   ** Load pointers to all cells on sibling pages and the divider cells
7080   ** into the local b.apCell[] array.  Make copies of the divider cells
7081   ** into space obtained from aSpace1[]. The divider cells have already
7082   ** been removed from pParent.
7083   **
7084   ** If the siblings are on leaf pages, then the child pointers of the
7085   ** divider cells are stripped from the cells before they are copied
7086   ** into aSpace1[].  In this way, all cells in b.apCell[] are without
7087   ** child pointers.  If siblings are not leaves, then all cell in
7088   ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
7089   ** are alike.
7090   **
7091   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
7092   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
7093   */
7094   b.pRef = apOld[0];
7095   leafCorrection = b.pRef->leaf*4;
7096   leafData = b.pRef->intKeyLeaf;
7097   for(i=0; i<nOld; i++){
7098     MemPage *pOld = apOld[i];
7099     int limit = pOld->nCell;
7100     u8 *aData = pOld->aData;
7101     u16 maskPage = pOld->maskPage;
7102     u8 *piCell = aData + pOld->cellOffset;
7103     u8 *piEnd;
7104 
7105     /* Verify that all sibling pages are of the same "type" (table-leaf,
7106     ** table-interior, index-leaf, or index-interior).
7107     */
7108     if( pOld->aData[0]!=apOld[0]->aData[0] ){
7109       rc = SQLITE_CORRUPT_BKPT;
7110       goto balance_cleanup;
7111     }
7112 
7113     /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
7114     ** constains overflow cells, include them in the b.apCell[] array
7115     ** in the correct spot.
7116     **
7117     ** Note that when there are multiple overflow cells, it is always the
7118     ** case that they are sequential and adjacent.  This invariant arises
7119     ** because multiple overflows can only occurs when inserting divider
7120     ** cells into a parent on a prior balance, and divider cells are always
7121     ** adjacent and are inserted in order.  There is an assert() tagged
7122     ** with "NOTE 1" in the overflow cell insertion loop to prove this
7123     ** invariant.
7124     **
7125     ** This must be done in advance.  Once the balance starts, the cell
7126     ** offset section of the btree page will be overwritten and we will no
7127     ** long be able to find the cells if a pointer to each cell is not saved
7128     ** first.
7129     */
7130     memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*limit);
7131     if( pOld->nOverflow>0 ){
7132       memset(&b.szCell[b.nCell+limit], 0, sizeof(b.szCell[0])*pOld->nOverflow);
7133       limit = pOld->aiOvfl[0];
7134       for(j=0; j<limit; j++){
7135         b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7136         piCell += 2;
7137         b.nCell++;
7138       }
7139       for(k=0; k<pOld->nOverflow; k++){
7140         assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
7141         b.apCell[b.nCell] = pOld->apOvfl[k];
7142         b.nCell++;
7143       }
7144     }
7145     piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
7146     while( piCell<piEnd ){
7147       assert( b.nCell<nMaxCells );
7148       b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7149       piCell += 2;
7150       b.nCell++;
7151     }
7152 
7153     cntOld[i] = b.nCell;
7154     if( i<nOld-1 && !leafData){
7155       u16 sz = (u16)szNew[i];
7156       u8 *pTemp;
7157       assert( b.nCell<nMaxCells );
7158       b.szCell[b.nCell] = sz;
7159       pTemp = &aSpace1[iSpace1];
7160       iSpace1 += sz;
7161       assert( sz<=pBt->maxLocal+23 );
7162       assert( iSpace1 <= (int)pBt->pageSize );
7163       memcpy(pTemp, apDiv[i], sz);
7164       b.apCell[b.nCell] = pTemp+leafCorrection;
7165       assert( leafCorrection==0 || leafCorrection==4 );
7166       b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
7167       if( !pOld->leaf ){
7168         assert( leafCorrection==0 );
7169         assert( pOld->hdrOffset==0 );
7170         /* The right pointer of the child page pOld becomes the left
7171         ** pointer of the divider cell */
7172         memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
7173       }else{
7174         assert( leafCorrection==4 );
7175         while( b.szCell[b.nCell]<4 ){
7176           /* Do not allow any cells smaller than 4 bytes. If a smaller cell
7177           ** does exist, pad it with 0x00 bytes. */
7178           assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
7179           assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
7180           aSpace1[iSpace1++] = 0x00;
7181           b.szCell[b.nCell]++;
7182         }
7183       }
7184       b.nCell++;
7185     }
7186   }
7187 
7188   /*
7189   ** Figure out the number of pages needed to hold all b.nCell cells.
7190   ** Store this number in "k".  Also compute szNew[] which is the total
7191   ** size of all cells on the i-th page and cntNew[] which is the index
7192   ** in b.apCell[] of the cell that divides page i from page i+1.
7193   ** cntNew[k] should equal b.nCell.
7194   **
7195   ** Values computed by this block:
7196   **
7197   **           k: The total number of sibling pages
7198   **    szNew[i]: Spaced used on the i-th sibling page.
7199   **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
7200   **              the right of the i-th sibling page.
7201   ** usableSpace: Number of bytes of space available on each sibling.
7202   **
7203   */
7204   usableSpace = pBt->usableSize - 12 + leafCorrection;
7205   for(i=0; i<nOld; i++){
7206     MemPage *p = apOld[i];
7207     szNew[i] = usableSpace - p->nFree;
7208     if( szNew[i]<0 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
7209     for(j=0; j<p->nOverflow; j++){
7210       szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
7211     }
7212     cntNew[i] = cntOld[i];
7213   }
7214   k = nOld;
7215   for(i=0; i<k; i++){
7216     int sz;
7217     while( szNew[i]>usableSpace ){
7218       if( i+1>=k ){
7219         k = i+2;
7220         if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
7221         szNew[k-1] = 0;
7222         cntNew[k-1] = b.nCell;
7223       }
7224       sz = 2 + cachedCellSize(&b, cntNew[i]-1);
7225       szNew[i] -= sz;
7226       if( !leafData ){
7227         if( cntNew[i]<b.nCell ){
7228           sz = 2 + cachedCellSize(&b, cntNew[i]);
7229         }else{
7230           sz = 0;
7231         }
7232       }
7233       szNew[i+1] += sz;
7234       cntNew[i]--;
7235     }
7236     while( cntNew[i]<b.nCell ){
7237       sz = 2 + cachedCellSize(&b, cntNew[i]);
7238       if( szNew[i]+sz>usableSpace ) break;
7239       szNew[i] += sz;
7240       cntNew[i]++;
7241       if( !leafData ){
7242         if( cntNew[i]<b.nCell ){
7243           sz = 2 + cachedCellSize(&b, cntNew[i]);
7244         }else{
7245           sz = 0;
7246         }
7247       }
7248       szNew[i+1] -= sz;
7249     }
7250     if( cntNew[i]>=b.nCell ){
7251       k = i+1;
7252     }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
7253       rc = SQLITE_CORRUPT_BKPT;
7254       goto balance_cleanup;
7255     }
7256   }
7257 
7258   /*
7259   ** The packing computed by the previous block is biased toward the siblings
7260   ** on the left side (siblings with smaller keys). The left siblings are
7261   ** always nearly full, while the right-most sibling might be nearly empty.
7262   ** The next block of code attempts to adjust the packing of siblings to
7263   ** get a better balance.
7264   **
7265   ** This adjustment is more than an optimization.  The packing above might
7266   ** be so out of balance as to be illegal.  For example, the right-most
7267   ** sibling might be completely empty.  This adjustment is not optional.
7268   */
7269   for(i=k-1; i>0; i--){
7270     int szRight = szNew[i];  /* Size of sibling on the right */
7271     int szLeft = szNew[i-1]; /* Size of sibling on the left */
7272     int r;              /* Index of right-most cell in left sibling */
7273     int d;              /* Index of first cell to the left of right sibling */
7274 
7275     r = cntNew[i-1] - 1;
7276     d = r + 1 - leafData;
7277     (void)cachedCellSize(&b, d);
7278     do{
7279       assert( d<nMaxCells );
7280       assert( r<nMaxCells );
7281       (void)cachedCellSize(&b, r);
7282       if( szRight!=0
7283        && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+2)) ){
7284         break;
7285       }
7286       szRight += b.szCell[d] + 2;
7287       szLeft -= b.szCell[r] + 2;
7288       cntNew[i-1] = r;
7289       r--;
7290       d--;
7291     }while( r>=0 );
7292     szNew[i] = szRight;
7293     szNew[i-1] = szLeft;
7294     if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
7295       rc = SQLITE_CORRUPT_BKPT;
7296       goto balance_cleanup;
7297     }
7298   }
7299 
7300   /* Sanity check:  For a non-corrupt database file one of the following
7301   ** must be true:
7302   **    (1) We found one or more cells (cntNew[0]>0), or
7303   **    (2) pPage is a virtual root page.  A virtual root page is when
7304   **        the real root page is page 1 and we are the only child of
7305   **        that page.
7306   */
7307   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
7308   TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
7309     apOld[0]->pgno, apOld[0]->nCell,
7310     nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
7311     nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
7312   ));
7313 
7314   /*
7315   ** Allocate k new pages.  Reuse old pages where possible.
7316   */
7317   pageFlags = apOld[0]->aData[0];
7318   for(i=0; i<k; i++){
7319     MemPage *pNew;
7320     if( i<nOld ){
7321       pNew = apNew[i] = apOld[i];
7322       apOld[i] = 0;
7323       rc = sqlite3PagerWrite(pNew->pDbPage);
7324       nNew++;
7325       if( rc ) goto balance_cleanup;
7326     }else{
7327       assert( i>0 );
7328       rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
7329       if( rc ) goto balance_cleanup;
7330       zeroPage(pNew, pageFlags);
7331       apNew[i] = pNew;
7332       nNew++;
7333       cntOld[i] = b.nCell;
7334 
7335       /* Set the pointer-map entry for the new sibling page. */
7336       if( ISAUTOVACUUM ){
7337         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
7338         if( rc!=SQLITE_OK ){
7339           goto balance_cleanup;
7340         }
7341       }
7342     }
7343   }
7344 
7345   /*
7346   ** Reassign page numbers so that the new pages are in ascending order.
7347   ** This helps to keep entries in the disk file in order so that a scan
7348   ** of the table is closer to a linear scan through the file. That in turn
7349   ** helps the operating system to deliver pages from the disk more rapidly.
7350   **
7351   ** An O(n^2) insertion sort algorithm is used, but since n is never more
7352   ** than (NB+2) (a small constant), that should not be a problem.
7353   **
7354   ** When NB==3, this one optimization makes the database about 25% faster
7355   ** for large insertions and deletions.
7356   */
7357   for(i=0; i<nNew; i++){
7358     aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
7359     aPgFlags[i] = apNew[i]->pDbPage->flags;
7360     for(j=0; j<i; j++){
7361       if( aPgno[j]==aPgno[i] ){
7362         /* This branch is taken if the set of sibling pages somehow contains
7363         ** duplicate entries. This can happen if the database is corrupt.
7364         ** It would be simpler to detect this as part of the loop below, but
7365         ** we do the detection here in order to avoid populating the pager
7366         ** cache with two separate objects associated with the same
7367         ** page number.  */
7368         assert( CORRUPT_DB );
7369         rc = SQLITE_CORRUPT_BKPT;
7370         goto balance_cleanup;
7371       }
7372     }
7373   }
7374   for(i=0; i<nNew; i++){
7375     int iBest = 0;                /* aPgno[] index of page number to use */
7376     for(j=1; j<nNew; j++){
7377       if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
7378     }
7379     pgno = aPgOrder[iBest];
7380     aPgOrder[iBest] = 0xffffffff;
7381     if( iBest!=i ){
7382       if( iBest>i ){
7383         sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
7384       }
7385       sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
7386       apNew[i]->pgno = pgno;
7387     }
7388   }
7389 
7390   TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
7391          "%d(%d nc=%d) %d(%d nc=%d)\n",
7392     apNew[0]->pgno, szNew[0], cntNew[0],
7393     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
7394     nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
7395     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
7396     nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
7397     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
7398     nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
7399     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
7400     nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
7401   ));
7402 
7403   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7404   put4byte(pRight, apNew[nNew-1]->pgno);
7405 
7406   /* If the sibling pages are not leaves, ensure that the right-child pointer
7407   ** of the right-most new sibling page is set to the value that was
7408   ** originally in the same field of the right-most old sibling page. */
7409   if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
7410     MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
7411     memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
7412   }
7413 
7414   /* Make any required updates to pointer map entries associated with
7415   ** cells stored on sibling pages following the balance operation. Pointer
7416   ** map entries associated with divider cells are set by the insertCell()
7417   ** routine. The associated pointer map entries are:
7418   **
7419   **   a) if the cell contains a reference to an overflow chain, the
7420   **      entry associated with the first page in the overflow chain, and
7421   **
7422   **   b) if the sibling pages are not leaves, the child page associated
7423   **      with the cell.
7424   **
7425   ** If the sibling pages are not leaves, then the pointer map entry
7426   ** associated with the right-child of each sibling may also need to be
7427   ** updated. This happens below, after the sibling pages have been
7428   ** populated, not here.
7429   */
7430   if( ISAUTOVACUUM ){
7431     MemPage *pNew = apNew[0];
7432     u8 *aOld = pNew->aData;
7433     int cntOldNext = pNew->nCell + pNew->nOverflow;
7434     int usableSize = pBt->usableSize;
7435     int iNew = 0;
7436     int iOld = 0;
7437 
7438     for(i=0; i<b.nCell; i++){
7439       u8 *pCell = b.apCell[i];
7440       if( i==cntOldNext ){
7441         MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
7442         cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
7443         aOld = pOld->aData;
7444       }
7445       if( i==cntNew[iNew] ){
7446         pNew = apNew[++iNew];
7447         if( !leafData ) continue;
7448       }
7449 
7450       /* Cell pCell is destined for new sibling page pNew. Originally, it
7451       ** was either part of sibling page iOld (possibly an overflow cell),
7452       ** or else the divider cell to the left of sibling page iOld. So,
7453       ** if sibling page iOld had the same page number as pNew, and if
7454       ** pCell really was a part of sibling page iOld (not a divider or
7455       ** overflow cell), we can skip updating the pointer map entries.  */
7456       if( iOld>=nNew
7457        || pNew->pgno!=aPgno[iOld]
7458        || pCell<aOld
7459        || pCell>=&aOld[usableSize]
7460       ){
7461         if( !leafCorrection ){
7462           ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
7463         }
7464         if( cachedCellSize(&b,i)>pNew->minLocal ){
7465           ptrmapPutOvflPtr(pNew, pCell, &rc);
7466         }
7467         if( rc ) goto balance_cleanup;
7468       }
7469     }
7470   }
7471 
7472   /* Insert new divider cells into pParent. */
7473   for(i=0; i<nNew-1; i++){
7474     u8 *pCell;
7475     u8 *pTemp;
7476     int sz;
7477     MemPage *pNew = apNew[i];
7478     j = cntNew[i];
7479 
7480     assert( j<nMaxCells );
7481     assert( b.apCell[j]!=0 );
7482     pCell = b.apCell[j];
7483     sz = b.szCell[j] + leafCorrection;
7484     pTemp = &aOvflSpace[iOvflSpace];
7485     if( !pNew->leaf ){
7486       memcpy(&pNew->aData[8], pCell, 4);
7487     }else if( leafData ){
7488       /* If the tree is a leaf-data tree, and the siblings are leaves,
7489       ** then there is no divider cell in b.apCell[]. Instead, the divider
7490       ** cell consists of the integer key for the right-most cell of
7491       ** the sibling-page assembled above only.
7492       */
7493       CellInfo info;
7494       j--;
7495       pNew->xParseCell(pNew, b.apCell[j], &info);
7496       pCell = pTemp;
7497       sz = 4 + putVarint(&pCell[4], info.nKey);
7498       pTemp = 0;
7499     }else{
7500       pCell -= 4;
7501       /* Obscure case for non-leaf-data trees: If the cell at pCell was
7502       ** previously stored on a leaf node, and its reported size was 4
7503       ** bytes, then it may actually be smaller than this
7504       ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
7505       ** any cell). But it is important to pass the correct size to
7506       ** insertCell(), so reparse the cell now.
7507       **
7508       ** Note that this can never happen in an SQLite data file, as all
7509       ** cells are at least 4 bytes. It only happens in b-trees used
7510       ** to evaluate "IN (SELECT ...)" and similar clauses.
7511       */
7512       if( b.szCell[j]==4 ){
7513         assert(leafCorrection==4);
7514         sz = pParent->xCellSize(pParent, pCell);
7515       }
7516     }
7517     iOvflSpace += sz;
7518     assert( sz<=pBt->maxLocal+23 );
7519     assert( iOvflSpace <= (int)pBt->pageSize );
7520     insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
7521     if( rc!=SQLITE_OK ) goto balance_cleanup;
7522     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7523   }
7524 
7525   /* Now update the actual sibling pages. The order in which they are updated
7526   ** is important, as this code needs to avoid disrupting any page from which
7527   ** cells may still need to be read. In practice, this means:
7528   **
7529   **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
7530   **      then it is not safe to update page apNew[iPg] until after
7531   **      the left-hand sibling apNew[iPg-1] has been updated.
7532   **
7533   **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
7534   **      then it is not safe to update page apNew[iPg] until after
7535   **      the right-hand sibling apNew[iPg+1] has been updated.
7536   **
7537   ** If neither of the above apply, the page is safe to update.
7538   **
7539   ** The iPg value in the following loop starts at nNew-1 goes down
7540   ** to 0, then back up to nNew-1 again, thus making two passes over
7541   ** the pages.  On the initial downward pass, only condition (1) above
7542   ** needs to be tested because (2) will always be true from the previous
7543   ** step.  On the upward pass, both conditions are always true, so the
7544   ** upwards pass simply processes pages that were missed on the downward
7545   ** pass.
7546   */
7547   for(i=1-nNew; i<nNew; i++){
7548     int iPg = i<0 ? -i : i;
7549     assert( iPg>=0 && iPg<nNew );
7550     if( abDone[iPg] ) continue;         /* Skip pages already processed */
7551     if( i>=0                            /* On the upwards pass, or... */
7552      || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
7553     ){
7554       int iNew;
7555       int iOld;
7556       int nNewCell;
7557 
7558       /* Verify condition (1):  If cells are moving left, update iPg
7559       ** only after iPg-1 has already been updated. */
7560       assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
7561 
7562       /* Verify condition (2):  If cells are moving right, update iPg
7563       ** only after iPg+1 has already been updated. */
7564       assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
7565 
7566       if( iPg==0 ){
7567         iNew = iOld = 0;
7568         nNewCell = cntNew[0];
7569       }else{
7570         iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
7571         iNew = cntNew[iPg-1] + !leafData;
7572         nNewCell = cntNew[iPg] - iNew;
7573       }
7574 
7575       rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
7576       if( rc ) goto balance_cleanup;
7577       abDone[iPg]++;
7578       apNew[iPg]->nFree = usableSpace-szNew[iPg];
7579       assert( apNew[iPg]->nOverflow==0 );
7580       assert( apNew[iPg]->nCell==nNewCell );
7581     }
7582   }
7583 
7584   /* All pages have been processed exactly once */
7585   assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
7586 
7587   assert( nOld>0 );
7588   assert( nNew>0 );
7589 
7590   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
7591     /* The root page of the b-tree now contains no cells. The only sibling
7592     ** page is the right-child of the parent. Copy the contents of the
7593     ** child page into the parent, decreasing the overall height of the
7594     ** b-tree structure by one. This is described as the "balance-shallower"
7595     ** sub-algorithm in some documentation.
7596     **
7597     ** If this is an auto-vacuum database, the call to copyNodeContent()
7598     ** sets all pointer-map entries corresponding to database image pages
7599     ** for which the pointer is stored within the content being copied.
7600     **
7601     ** It is critical that the child page be defragmented before being
7602     ** copied into the parent, because if the parent is page 1 then it will
7603     ** be smaller than the child due to the database header, and so all the
7604     ** free space needs to be up front.
7605     */
7606     assert( nNew==1 );
7607     rc = defragmentPage(apNew[0]);
7608     testcase( rc!=SQLITE_OK );
7609     assert( apNew[0]->nFree ==
7610         (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
7611       || rc!=SQLITE_OK
7612     );
7613     copyNodeContent(apNew[0], pParent, &rc);
7614     freePage(apNew[0], &rc);
7615   }else if( ISAUTOVACUUM && !leafCorrection ){
7616     /* Fix the pointer map entries associated with the right-child of each
7617     ** sibling page. All other pointer map entries have already been taken
7618     ** care of.  */
7619     for(i=0; i<nNew; i++){
7620       u32 key = get4byte(&apNew[i]->aData[8]);
7621       ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
7622     }
7623   }
7624 
7625   assert( pParent->isInit );
7626   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
7627           nOld, nNew, b.nCell));
7628 
7629   /* Free any old pages that were not reused as new pages.
7630   */
7631   for(i=nNew; i<nOld; i++){
7632     freePage(apOld[i], &rc);
7633   }
7634 
7635 #if 0
7636   if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
7637     /* The ptrmapCheckPages() contains assert() statements that verify that
7638     ** all pointer map pages are set correctly. This is helpful while
7639     ** debugging. This is usually disabled because a corrupt database may
7640     ** cause an assert() statement to fail.  */
7641     ptrmapCheckPages(apNew, nNew);
7642     ptrmapCheckPages(&pParent, 1);
7643   }
7644 #endif
7645 
7646   /*
7647   ** Cleanup before returning.
7648   */
7649 balance_cleanup:
7650   sqlite3ScratchFree(b.apCell);
7651   for(i=0; i<nOld; i++){
7652     releasePage(apOld[i]);
7653   }
7654   for(i=0; i<nNew; i++){
7655     releasePage(apNew[i]);
7656   }
7657 
7658   return rc;
7659 }
7660 #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
7661 #pragma optimize("", on)
7662 #endif
7663 
7664 
7665 /*
7666 ** This function is called when the root page of a b-tree structure is
7667 ** overfull (has one or more overflow cells).
7668 **
7669 ** A new child page is allocated and the contents of the current root
7670 ** page, including overflow cells, are copied into the child. The root
7671 ** page is then overwritten to make it an empty page with the right-child
7672 ** pointer pointing to the new page.
7673 **
7674 ** Before returning, all pointer-map entries corresponding to pages
7675 ** that the new child-page now contains pointers to are updated. The
7676 ** entry corresponding to the new right-child pointer of the root
7677 ** page is also updated.
7678 **
7679 ** If successful, *ppChild is set to contain a reference to the child
7680 ** page and SQLITE_OK is returned. In this case the caller is required
7681 ** to call releasePage() on *ppChild exactly once. If an error occurs,
7682 ** an error code is returned and *ppChild is set to 0.
7683 */
7684 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
7685   BtShared *pBt = pRoot->pBt;    /* Shared b-tree state reached via pRoot */
7686   MemPage *pNew = 0;             /* The child page created by this call */
7687   Pgno pgnoNew = 0;              /* Page number assigned to pNew */
7688   int rc;                        /* Result code from subroutines */
7689
7690   assert( pRoot->nOverflow>0 );
7691   assert( sqlite3_mutex_held(pBt->mutex) );
7692
7693   /* Mark pRoot writable, obtain a fresh page, and duplicate the node held
7694   ** on pRoot into it.  In auto-vacuum mode also record pRoot as the b-tree
7695   ** parent of the new page.  rc is handed by address to copyNodeContent()
7696   ** and ptrmapPut(); it is tested once, after the whole group. */
7697   rc = sqlite3PagerWrite(pRoot->pDbPage);
7698   if( rc==SQLITE_OK ){
7699     rc = allocateBtreePage(pBt, &pNew, &pgnoNew, pRoot->pgno, 0);
7700     copyNodeContent(pRoot, pNew, &rc);
7701     if( ISAUTOVACUUM ){
7702       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pRoot->pgno, &rc);
7703     }
7704   }
7705   if( rc==SQLITE_OK ){
7706     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
7707     assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
7708     assert( pNew->nCell==pRoot->nCell );
7709
7710     TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pNew->pgno));
7711
7712     /* Copy pRoot's overflow-cell bookkeeping into the child page. */
7713     memcpy(pNew->aiOvfl, pRoot->aiOvfl,
7714            pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
7715     memcpy(pNew->apOvfl, pRoot->apOvfl,
7716            pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
7717     pNew->nOverflow = pRoot->nOverflow;
7718
7719     /* Reinitialize pRoot as an empty page using the child's flag byte with
7720     ** PTF_LEAF cleared, then store the new page as its right-child. */
7721     zeroPage(pRoot, pNew->aData[0] & ~PTF_LEAF);
7722     put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoNew);
7723     *ppChild = pNew;
7724   }else{
7725     *ppChild = 0;
7726     releasePage(pNew);
7727   }
7728   return rc;
7729 }
7730 
7731 /*
7732 ** The page that pCur currently points to has just been modified in
7733 ** some way. This function figures out if this modification means the
7734 ** tree needs to be balanced, and if so calls the appropriate balancing
7735 ** routine: balance_quick(), balance_deeper() or balance_nonroot().
7736 **
7737 ** Each pass of the do-loop examines pCur->apPage[pCur->iPage].  The loop
7738 ** ends at the first page that needs no balancing, or on the first error.
7739 ** Returns SQLITE_OK on success, or the error code that stopped the loop.
7740 */
7741 static int balance(BtCursor *pCur){
7742   int rc = SQLITE_OK;  /* Result code; the do-loop runs while it is SQLITE_OK */
7743   const int nMin = pCur->pBt->usableSize * 2 / 3;  /* nFree limit tested below */
7744   u8 aBalanceQuickSpace[13];  /* Buffer handed to balance_quick() */
7745   u8 *pFree = 0;  /* pSpace of the previous balance_nonroot() call, or 0 */
7746
7747   TESTONLY( int balance_quick_called = 0 );  /* Counter used only in assert() */
7748   TESTONLY( int balance_deeper_called = 0 );  /* Counter used only in assert() */
7749
7750   do {
7751     int iPage = pCur->iPage;  /* Index into pCur->apPage[]; 0 is the root */
7752     MemPage *pPage = pCur->apPage[iPage];  /* Page examined on this pass */
7753
7754     if( iPage==0 ){
7755       if( pPage->nOverflow ){
7756         /* The root page of the b-tree is overfull. In this case call the
7757         ** balance_deeper() function to create a new child for the root-page
7758         ** and copy the current contents of the root-page to it. The
7759         ** next iteration of the do-loop will balance the child page.
7760         */
7761         assert( (balance_deeper_called++)==0 );
7762         rc = balance_deeper(pPage, &pCur->apPage[1]);
7763         if( rc==SQLITE_OK ){
7764           pCur->iPage = 1;  /* The cursor now sits on the new child */
7765           pCur->aiIdx[0] = 0;
7766           pCur->aiIdx[1] = 0;
7767           assert( pCur->apPage[1]->nOverflow );
7768         }
7769       }else{
7770         break;  /* Root page without overflow cells: nothing to do */
7771       }
7772     }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
7773       break;  /* No overflow cells and at most nMin bytes free: done */
7774     }else{
7775       MemPage * const pParent = pCur->apPage[iPage-1];  /* One level up */
7776       int const iIdx = pCur->aiIdx[iPage-1];  /* pCur's index within pParent */
7777
7778       rc = sqlite3PagerWrite(pParent->pDbPage);  /* Make pParent writable */
7779       if( rc==SQLITE_OK ){
7780 #ifndef SQLITE_OMIT_QUICKBALANCE
7781         if( pPage->intKeyLeaf
7782          && pPage->nOverflow==1  /* Exactly one overflow cell */
7783          && pPage->aiOvfl[0]==pPage->nCell
7784          && pParent->pgno!=1  /* Parent is not page 1 */
7785          && pParent->nCell==iIdx
7786         ){
7787           /* Call balance_quick() to create a new sibling of pPage on which
7788           ** to store the overflow cell. balance_quick() inserts a new cell
7789           ** into pParent, which may cause pParent overflow. If this
7790           ** happens, the next iteration of the do-loop will balance pParent
7791           ** using either balance_nonroot() or balance_deeper(). Until this
7792           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
7793           ** buffer.
7794           **
7795           ** The purpose of the following assert() is to check that only a
7796           ** single call to balance_quick() is made for each call to this
7797           ** function. If this were not verified, a subtle bug involving reuse
7798           ** of the aBalanceQuickSpace[] might sneak in.
7799           */
7800           assert( (balance_quick_called++)==0 );
7801           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
7802         }else
7803 #endif
7804         {
7805           /* In this case, call balance_nonroot() to redistribute cells
7806           ** between pPage and up to 2 of its sibling pages. This involves
7807           ** modifying the contents of pParent, which may cause pParent to
7808           ** become overfull or underfull. The next iteration of the do-loop
7809           ** will balance the parent page to correct this.
7810           **
7811           ** If the parent page becomes overfull, the overflow cell or cells
7812           ** are stored in the pSpace buffer allocated immediately below.
7813           ** A subsequent iteration of the do-loop will deal with this by
7814           ** calling balance_nonroot() (balance_deeper() may be called first,
7815           ** but it doesn't deal with overflow cells - just moves them to a
7816           ** different page). Once this subsequent call to balance_nonroot()
7817           ** has completed, it is safe to release the pSpace buffer used by
7818           ** the previous call, as the overflow cell data will have been
7819           ** copied either into the body of a database page or into the new
7820           ** pSpace buffer passed to the latter call to balance_nonroot().
7821           */
7822           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
7823           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
7824                                pCur->hints&BTREE_BULKLOAD);
7825           if( pFree ){
7826             /* If pFree is not NULL, it points to the pSpace buffer used
7827             ** by a previous call to balance_nonroot(). Its contents are
7828             ** now stored either on real database pages or within the
7829             ** new pSpace buffer, so it may be safely freed here. */
7830             sqlite3PageFree(pFree);
7831           }
7832
7833           /* The pSpace buffer will be freed after the next call to
7834           ** balance_nonroot(), or just before this function returns, whichever
7835           ** comes first. */
7836           pFree = pSpace;
7837         }
7838       }
7839
7840       pPage->nOverflow = 0;  /* Cleared whether or not rc is SQLITE_OK */
7841
7842       /* The next iteration of the do-loop balances the parent page. */
7843       releasePage(pPage);
7844       pCur->iPage--;
7845       assert( pCur->iPage>=0 );
7846     }
7847   }while( rc==SQLITE_OK );
7848
7849   if( pFree ){  /* Buffer left over from the last balance_nonroot() call */
7850     sqlite3PageFree(pFree);
7851   }
7852   return rc;
7853 }
7854 
7855 
7856 /*
7857 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
7858 ** and the data is given by (pData,nData).  The cursor is used only to
7859 ** define what table the record should be inserted into.  The cursor
7860 ** is left pointing at a random location.
7861 **
7862 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
7863 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
7864 **
7865 ** If the seekResult parameter is non-zero, then a successful call to
7866 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
7867 ** been performed. seekResult is the search result returned (a negative
7868 ** number if pCur points at an entry that is smaller than (pKey, nKey), or
7869 ** a positive value if pCur points at an entry that is larger than
7870 ** (pKey, nKey)).
7871 **
7872 ** When seekResult is non-zero this function performs no seek of its own:
7873 ** it trusts that pCur is already positioned as just described and inserts
7874 ** the new entry next to it.  If the seekResult parameter is 0, then cursor
7875 ** pCur may point to any entry or to no entry at all and so this function
7876 ** has to seek the cursor before the new key can be inserted.
7877 */
7878 int sqlite3BtreeInsert(
7879   BtCursor *pCur,                /* Insert data into the table of this cursor */
7880   const void *pKey, i64 nKey,    /* The key of the new record */
7881   const void *pData, int nData,  /* The data of the new record */
7882   int nZero,                     /* Number of extra 0 bytes to append to data */
7883   int appendBias,                /* True if this is likely an append */
7884   int seekResult                 /* Result of prior MovetoUnpacked() call */
7885 ){
7886   int rc;
7887   int loc = seekResult;          /* -1: before desired location  +1: after */
7888   int szNew = 0;
7889   int idx;
7890   MemPage *pPage;
7891   Btree *p = pCur->pBtree;
7892   BtShared *pBt = p->pBt;
7893   unsigned char *oldCell;
7894   unsigned char *newCell = 0;
7895 
7896   if( pCur->eState==CURSOR_FAULT ){
7897     assert( pCur->skipNext!=SQLITE_OK );
7898     return pCur->skipNext;
7899   }
7900 
7901   assert( cursorHoldsMutex(pCur) );
7902   assert( (pCur->curFlags & BTCF_WriteFlag)!=0
7903               && pBt->inTransaction==TRANS_WRITE
7904               && (pBt->btsFlags & BTS_READ_ONLY)==0 );
7905   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
7906 
7907   /* Assert that the caller has been consistent. If this cursor was opened
7908   ** expecting an index b-tree, then the caller should be inserting blob
7909   ** keys with no associated data. If the cursor was opened expecting an
7910   ** intkey table, the caller should be inserting integer keys with a
7911   ** blob of associated data.  */
7912   assert( (pKey==0)==(pCur->pKeyInfo==0) );
7913 
7914   /* Save the positions of any other cursors open on this table.
7915   **
7916   ** In some cases, the call to btreeMoveto() below is a no-op. For
7917   ** example, when inserting data into a table with auto-generated integer
7918   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
7919   ** integer key to use. It then calls this function to actually insert the
7920   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
7921   ** that the cursor is already where it needs to be and returns without
7922   ** doing any work. To avoid thwarting these optimizations, it is important
7923   ** not to clear the cursor here.
7924   */
7925   if( pCur->curFlags & BTCF_Multiple ){
7926     rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
7927     if( rc ) return rc;
7928   }
7929 
7930   if( pCur->pKeyInfo==0 ){
7931     assert( pKey==0 );
7932     /* If this is an insert into a table b-tree, invalidate any incrblob
7933     ** cursors open on the row being replaced */
7934     invalidateIncrblobCursors(p, nKey, 0);
7935 
7936     /* If the cursor is currently on the last row and we are appending a
7937     ** new row onto the end, set the "loc" to avoid an unnecessary
7938     ** btreeMoveto() call */
7939     if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0
7940       && pCur->info.nKey==nKey-1 ){
7941        loc = -1;
7942     }else if( loc==0 ){
7943       rc = sqlite3BtreeMovetoUnpacked(pCur, 0, nKey, appendBias, &loc);
7944       if( rc ) return rc;
7945     }
7946   }else if( loc==0 ){
7947     rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);
7948     if( rc ) return rc;
7949   }
7950   assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
7951 
7952   pPage = pCur->apPage[pCur->iPage];
7953   assert( pPage->intKey || nKey>=0 );
7954   assert( pPage->leaf || !pPage->intKey );
7955 
7956   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
7957           pCur->pgnoRoot, nKey, nData, pPage->pgno,
7958           loc==0 ? "overwrite" : "new entry"));
7959   assert( pPage->isInit );
7960   newCell = pBt->pTmpSpace;
7961   assert( newCell!=0 );
7962   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
7963   if( rc ) goto end_insert;
7964   assert( szNew==pPage->xCellSize(pPage, newCell) );
7965   assert( szNew <= MX_CELL_SIZE(pBt) );
7966   idx = pCur->aiIdx[pCur->iPage];
7967   if( loc==0 ){
7968     u16 szOld;
7969     assert( idx<pPage->nCell );
7970     rc = sqlite3PagerWrite(pPage->pDbPage);
7971     if( rc ){
7972       goto end_insert;
7973     }
7974     oldCell = findCell(pPage, idx);
7975     if( !pPage->leaf ){
7976       memcpy(newCell, oldCell, 4);
7977     }
7978     rc = clearCell(pPage, oldCell, &szOld);
7979     dropCell(pPage, idx, szOld, &rc);
7980     if( rc ) goto end_insert;
7981   }else if( loc<0 && pPage->nCell>0 ){
7982     assert( pPage->leaf );
7983     idx = ++pCur->aiIdx[pCur->iPage];
7984   }else{
7985     assert( pPage->leaf );
7986   }
7987   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
7988   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
7989 
7990   /* If no error has occurred and pPage has an overflow cell, call balance()
7991   ** to redistribute the cells within the tree. Since balance() may move
7992   ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
7993   ** variables.
7994   **
7995   ** Previous versions of SQLite called moveToRoot() to move the cursor
7996   ** back to the root page as balance() used to invalidate the contents
7997   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
7998   ** set the cursor state to "invalid". This makes common insert operations
7999   ** slightly faster.
8000   **
8001   ** There is a subtle but important optimization here too. When inserting
8002   ** multiple records into an intkey b-tree using a single cursor (as can
8003   ** happen while processing an "INSERT INTO ... SELECT" statement), it
8004   ** is advantageous to leave the cursor pointing to the last entry in
8005   ** the b-tree if possible. If the cursor is left pointing to the last
8006   ** entry in the table, and the next row inserted has an integer key
8007   ** larger than the largest existing key, it is possible to insert the
8008   ** row without seeking the cursor. This can be a big performance boost.
8009   */
8010   pCur->info.nSize = 0;
8011   if( rc==SQLITE_OK && pPage->nOverflow ){
8012     pCur->curFlags &= ~(BTCF_ValidNKey);
8013     rc = balance(pCur);
8014 
8015     /* Must make sure nOverflow is reset to zero even if the balance()
8016     ** fails. Internal data structure corruption will result otherwise.
8017     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
8018     ** from trying to save the current position of the cursor.  */
8019     pCur->apPage[pCur->iPage]->nOverflow = 0;
8020     pCur->eState = CURSOR_INVALID;
8021   }
8022   assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
8023 
8024 end_insert:
8025   return rc;
8026 }
8027 
/*
** Delete the entry that the cursor is pointing to.  The cursor
** is left pointing at an arbitrary location.
**
** Preconditions (enforced only by assert()): the cursor's mutex is held,
** a write transaction is open on a database that is not read-only, pCur
** was opened as a write cursor, and pCur is in state CURSOR_VALID with
** its current index referring to an existing cell.
**
** Outline of the operation:
**
**   1. If the entry lives on an interior page, step the cursor back to
**      the previous entry (which is on a leaf) so that its cell can later
**      replace the deleted one.
**   2. Save the positions of other cursors on the same b-tree and
**      invalidate incrblob cursors on the row.
**   3. Free the cell's overflow pages and drop the cell from its page.
**   4. For the interior-page case, move the leaf cell into the interior
**      page.
**   5. Rebalance, first at the cursor's current page and then, if the
**      cursor is still deeper than the page the cell was deleted from,
**      at that page as well.
**
** Returns SQLITE_OK on success, in which case moveToRoot() has been
** called on the cursor.  Otherwise the error code of the first failing
** step is returned.
*/
int sqlite3BtreeDelete(BtCursor *pCur){
  Btree *p = pCur->pBtree;
  BtShared *pBt = p->pBt;
  int rc;                              /* Return code */
  MemPage *pPage;                      /* Page to delete cell from */
  unsigned char *pCell;                /* Pointer to cell to delete */
  int iCellIdx;                        /* Index of cell to delete */
  int iCellDepth;                      /* Depth of node containing pCell */
  u16 szCell;                          /* Size of the cell being deleted */

  assert( cursorHoldsMutex(pCur) );
  assert( pBt->inTransaction==TRANS_WRITE );
  assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  assert( pCur->curFlags & BTCF_WriteFlag );
  assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
  assert( !hasReadConflicts(p, pCur->pgnoRoot) );
  assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  assert( pCur->eState==CURSOR_VALID );

  /* Remember where the doomed cell is before the cursor is moved.  These
  ** values keep identifying the cell even after sqlite3BtreePrevious()
  ** repositions pCur below. */
  iCellDepth = pCur->iPage;
  iCellIdx = pCur->aiIdx[iCellDepth];
  pPage = pCur->apPage[iCellDepth];
  pCell = findCell(pPage, iCellIdx);

  /* If the page containing the entry to delete is not a leaf page, move
  ** the cursor to the largest entry in the tree that is smaller than
  ** the entry being deleted. This cell will replace the cell being deleted
  ** from the internal node. The 'previous' entry is used for this instead
  ** of the 'next' entry, as the previous entry is always a part of the
  ** sub-tree headed by the child page of the cell being deleted. This makes
  ** balancing the tree following the delete operation easier.  */
  if( !pPage->leaf ){
    int notUsed = 0;
    rc = sqlite3BtreePrevious(pCur, &notUsed);
    if( rc ) return rc;
  }

  /* Save the positions of any other cursors open on this table before
  ** making any modifications. Make the page containing the entry to be
  ** deleted writable. Then free any overflow pages associated with the
  ** entry and finally remove the cell itself from within the page.
  */
  if( pCur->curFlags & BTCF_Multiple ){
    rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
    if( rc ) return rc;
  }

  /* If this is a delete operation to remove a row from a table b-tree,
  ** invalidate any incrblob cursors open on the row being deleted.  */
  if( pCur->pKeyInfo==0 ){
    invalidateIncrblobCursors(p, pCur->info.nKey, 0);
  }

  rc = sqlite3PagerWrite(pPage->pDbPage);
  if( rc ) return rc;
  /* dropCell() receives &rc, so an error from clearCell() is carried
  ** through to the check that follows both calls. */
  rc = clearCell(pPage, pCell, &szCell);
  dropCell(pPage, iCellIdx, szCell, &rc);
  if( rc ) return rc;

  /* If the cell deleted was not located on a leaf page, then the cursor
  ** is currently pointing to the largest entry in the sub-tree headed
  ** by the child-page of the cell that was just deleted from an internal
  ** node. The cell from the leaf node needs to be moved to the internal
  ** node to replace the deleted cell.  */
  if( !pPage->leaf ){
    MemPage *pLeaf = pCur->apPage[pCur->iPage];
    int nCell;
    Pgno n = pCur->apPage[iCellDepth+1]->pgno;
    unsigned char *pTmp;

    pCell = findCell(pLeaf, pLeaf->nCell-1);
    /* The insertCell() call below is handed pCell-4.  Refuse to continue
    ** if that address would fall before the start of the page image. */
    if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
    nCell = pLeaf->xCellSize(pLeaf, pCell);
    assert( MX_CELL_SIZE(pBt) >= nCell );
    pTmp = pBt->pTmpSpace;
    assert( pTmp!=0 );
    /* rc is threaded through insertCell() and dropCell() by pointer and
    ** is tested once, after both calls. */
    rc = sqlite3PagerWrite(pLeaf->pDbPage);
    insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
    dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
    if( rc ) return rc;
  }

  /* Balance the tree. If the entry deleted was located on a leaf page,
  ** then the cursor still points to that page. In this case the first
  ** call to balance() repairs the tree, and the if(...) condition is
  ** never true.
  **
  ** Otherwise, if the entry deleted was on an internal node page, then
  ** pCur is pointing to the leaf page from which a cell was removed to
  ** replace the cell deleted from the internal node. This is slightly
  ** tricky as the leaf node may be underfull, and the internal node may
  ** be either under or overfull. In this case run the balancing algorithm
  ** on the leaf node first. If the balance proceeds far enough up the
  ** tree that we can be sure that any problem in the internal node has
  ** been corrected, so be it. Otherwise, after balancing the leaf node,
  ** walk the cursor up the tree to the internal node and balance it as
  ** well.  */
  rc = balance(pCur);
  if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
    /* Release every page below the interior node and pop it off the
    ** cursor's page stack. */
    while( pCur->iPage>iCellDepth ){
      releasePage(pCur->apPage[pCur->iPage--]);
    }
    rc = balance(pCur);
  }

  if( rc==SQLITE_OK ){
    moveToRoot(pCur);
  }
  return rc;
}
8142 
8143 /*
8144 ** Create a new BTree table.  Write into *piTable the page
8145 ** number for the root page of the new table.
8146 **
8147 ** The type of type is determined by the flags parameter.  Only the
8148 ** following values of flags are currently in use.  Other values for
8149 ** flags might not work:
8150 **
8151 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
8152 **     BTREE_ZERODATA                  Used for SQL indices
8153 */
8154 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
8155   BtShared *pBt = p->pBt;
8156   MemPage *pRoot;
8157   Pgno pgnoRoot;
8158   int rc;
8159   int ptfFlags;          /* Page-type flage for the root page of new table */
8160 
8161   assert( sqlite3BtreeHoldsMutex(p) );
8162   assert( pBt->inTransaction==TRANS_WRITE );
8163   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
8164 
8165 #ifdef SQLITE_OMIT_AUTOVACUUM
8166   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
8167   if( rc ){
8168     return rc;
8169   }
8170 #else
8171   if( pBt->autoVacuum ){
8172     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
8173     MemPage *pPageMove; /* The page to move to. */
8174 
8175     /* Creating a new table may probably require moving an existing database
8176     ** to make room for the new tables root page. In case this page turns
8177     ** out to be an overflow page, delete all overflow page-map caches
8178     ** held by open cursors.
8179     */
8180     invalidateAllOverflowCache(pBt);
8181 
8182     /* Read the value of meta[3] from the database to determine where the
8183     ** root page of the new table should go. meta[3] is the largest root-page
8184     ** created so far, so the new root-page is (meta[3]+1).
8185     */
8186     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
8187     pgnoRoot++;
8188 
8189     /* The new root-page may not be allocated on a pointer-map page, or the
8190     ** PENDING_BYTE page.
8191     */
8192     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
8193         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
8194       pgnoRoot++;
8195     }
8196     assert( pgnoRoot>=3 || CORRUPT_DB );
8197     testcase( pgnoRoot<3 );
8198 
8199     /* Allocate a page. The page that currently resides at pgnoRoot will
8200     ** be moved to the allocated page (unless the allocated page happens
8201     ** to reside at pgnoRoot).
8202     */
8203     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
8204     if( rc!=SQLITE_OK ){
8205       return rc;
8206     }
8207 
8208     if( pgnoMove!=pgnoRoot ){
8209       /* pgnoRoot is the page that will be used for the root-page of
8210       ** the new table (assuming an error did not occur). But we were
8211       ** allocated pgnoMove. If required (i.e. if it was not allocated
8212       ** by extending the file), the current page at position pgnoMove
8213       ** is already journaled.
8214       */
8215       u8 eType = 0;
8216       Pgno iPtrPage = 0;
8217 
8218       /* Save the positions of any open cursors. This is required in
8219       ** case they are holding a reference to an xFetch reference
8220       ** corresponding to page pgnoRoot.  */
8221       rc = saveAllCursors(pBt, 0, 0);
8222       releasePage(pPageMove);
8223       if( rc!=SQLITE_OK ){
8224         return rc;
8225       }
8226 
8227       /* Move the page currently at pgnoRoot to pgnoMove. */
8228       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
8229       if( rc!=SQLITE_OK ){
8230         return rc;
8231       }
8232       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
8233       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
8234         rc = SQLITE_CORRUPT_BKPT;
8235       }
8236       if( rc!=SQLITE_OK ){
8237         releasePage(pRoot);
8238         return rc;
8239       }
8240       assert( eType!=PTRMAP_ROOTPAGE );
8241       assert( eType!=PTRMAP_FREEPAGE );
8242       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
8243       releasePage(pRoot);
8244 
8245       /* Obtain the page at pgnoRoot */
8246       if( rc!=SQLITE_OK ){
8247         return rc;
8248       }
8249       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
8250       if( rc!=SQLITE_OK ){
8251         return rc;
8252       }
8253       rc = sqlite3PagerWrite(pRoot->pDbPage);
8254       if( rc!=SQLITE_OK ){
8255         releasePage(pRoot);
8256         return rc;
8257       }
8258     }else{
8259       pRoot = pPageMove;
8260     }
8261 
8262     /* Update the pointer-map and meta-data with the new root-page number. */
8263     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
8264     if( rc ){
8265       releasePage(pRoot);
8266       return rc;
8267     }
8268 
8269     /* When the new root page was allocated, page 1 was made writable in
8270     ** order either to increase the database filesize, or to decrement the
8271     ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
8272     */
8273     assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
8274     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
8275     if( NEVER(rc) ){
8276       releasePage(pRoot);
8277       return rc;
8278     }
8279 
8280   }else{
8281     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
8282     if( rc ) return rc;
8283   }
8284 #endif
8285   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
8286   if( createTabFlags & BTREE_INTKEY ){
8287     ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
8288   }else{
8289     ptfFlags = PTF_ZERODATA | PTF_LEAF;
8290   }
8291   zeroPage(pRoot, ptfFlags);
8292   sqlite3PagerUnref(pRoot->pDbPage);
8293   assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
8294   *piTable = (int)pgnoRoot;
8295   return SQLITE_OK;
8296 }
8297 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
8298   int rc;
8299   sqlite3BtreeEnter(p);
8300   rc = btreeCreateTable(p, piTable, flags);
8301   sqlite3BtreeLeave(p);
8302   return rc;
8303 }
8304 
8305 /*
8306 ** Erase the given database page and all its children.  Return
8307 ** the page to the freelist.
8308 */
8309 static int clearDatabasePage(
8310   BtShared *pBt,           /* The BTree that contains the table */
8311   Pgno pgno,               /* Page number to clear */
8312   int freePageFlag,        /* Deallocate page if true */
8313   int *pnChange            /* Add number of Cells freed to this counter */
8314 ){
8315   MemPage *pPage;
8316   int rc;
8317   unsigned char *pCell;
8318   int i;
8319   int hdr;
8320   u16 szCell;
8321 
8322   assert( sqlite3_mutex_held(pBt->mutex) );
8323   if( pgno>btreePagecount(pBt) ){
8324     return SQLITE_CORRUPT_BKPT;
8325   }
8326   rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
8327   if( rc ) return rc;
8328   if( pPage->bBusy ){
8329     rc = SQLITE_CORRUPT_BKPT;
8330     goto cleardatabasepage_out;
8331   }
8332   pPage->bBusy = 1;
8333   hdr = pPage->hdrOffset;
8334   for(i=0; i<pPage->nCell; i++){
8335     pCell = findCell(pPage, i);
8336     if( !pPage->leaf ){
8337       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
8338       if( rc ) goto cleardatabasepage_out;
8339     }
8340     rc = clearCell(pPage, pCell, &szCell);
8341     if( rc ) goto cleardatabasepage_out;
8342   }
8343   if( !pPage->leaf ){
8344     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
8345     if( rc ) goto cleardatabasepage_out;
8346   }else if( pnChange ){
8347     assert( pPage->intKey || CORRUPT_DB );
8348     testcase( !pPage->intKey );
8349     *pnChange += pPage->nCell;
8350   }
8351   if( freePageFlag ){
8352     freePage(pPage, &rc);
8353   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
8354     zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
8355   }
8356 
8357 cleardatabasepage_out:
8358   pPage->bBusy = 0;
8359   releasePage(pPage);
8360   return rc;
8361 }
8362 
8363 /*
8364 ** Delete all information from a single table in the database.  iTable is
8365 ** the page number of the root of the table.  After this routine returns,
8366 ** the root page is empty, but still exists.
8367 **
8368 ** This routine will fail with SQLITE_LOCKED if there are any open
8369 ** read cursors on the table.  Open write cursors are moved to the
8370 ** root of the table.
8371 **
8372 ** If pnChange is not NULL, then table iTable must be an intkey table. The
8373 ** integer value pointed to by pnChange is incremented by the number of
8374 ** entries in the table.
8375 */
8376 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
8377   int rc;
8378   BtShared *pBt = p->pBt;
8379   sqlite3BtreeEnter(p);
8380   assert( p->inTrans==TRANS_WRITE );
8381 
8382   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
8383 
8384   if( SQLITE_OK==rc ){
8385     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
8386     ** is the root of a table b-tree - if it is not, the following call is
8387     ** a no-op).  */
8388     invalidateIncrblobCursors(p, 0, 1);
8389     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
8390   }
8391   sqlite3BtreeLeave(p);
8392   return rc;
8393 }
8394 
8395 /*
8396 ** Delete all information from the single table that pCur is open on.
8397 **
8398 ** This routine only work for pCur on an ephemeral table.
8399 */
8400 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
8401   return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
8402 }
8403 
8404 /*
8405 ** Erase all information in a table and add the root of the table to
8406 ** the freelist.  Except, the root of the principle table (the one on
8407 ** page 1) is never added to the freelist.
8408 **
8409 ** This routine will fail with SQLITE_LOCKED if there are any open
8410 ** cursors on the table.
8411 **
8412 ** If AUTOVACUUM is enabled and the page at iTable is not the last
8413 ** root page in the database file, then the last root page
8414 ** in the database file is moved into the slot formerly occupied by
8415 ** iTable and that last slot formerly occupied by the last root page
8416 ** is added to the freelist instead of iTable.  In this say, all
8417 ** root pages are kept at the beginning of the database file, which
8418 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
8419 ** page number that used to be the last root page in the file before
8420 ** the move.  If no page gets moved, *piMoved is set to 0.
8421 ** The last root page is recorded in meta[3] and the value of
8422 ** meta[3] is updated by this procedure.
8423 */
8424 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
8425   int rc;
8426   MemPage *pPage = 0;
8427   BtShared *pBt = p->pBt;
8428 
8429   assert( sqlite3BtreeHoldsMutex(p) );
8430   assert( p->inTrans==TRANS_WRITE );
8431 
8432   /* It is illegal to drop a table if any cursors are open on the
8433   ** database. This is because in auto-vacuum mode the backend may
8434   ** need to move another root-page to fill a gap left by the deleted
8435   ** root page. If an open cursor was using this page a problem would
8436   ** occur.
8437   **
8438   ** This error is caught long before control reaches this point.
8439   */
8440   if( NEVER(pBt->pCursor) ){
8441     sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
8442     return SQLITE_LOCKED_SHAREDCACHE;
8443   }
8444 
8445   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
8446   if( rc ) return rc;
8447   rc = sqlite3BtreeClearTable(p, iTable, 0);
8448   if( rc ){
8449     releasePage(pPage);
8450     return rc;
8451   }
8452 
8453   *piMoved = 0;
8454 
8455   if( iTable>1 ){
8456 #ifdef SQLITE_OMIT_AUTOVACUUM
8457     freePage(pPage, &rc);
8458     releasePage(pPage);
8459 #else
8460     if( pBt->autoVacuum ){
8461       Pgno maxRootPgno;
8462       sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
8463 
8464       if( iTable==maxRootPgno ){
8465         /* If the table being dropped is the table with the largest root-page
8466         ** number in the database, put the root page on the free list.
8467         */
8468         freePage(pPage, &rc);
8469         releasePage(pPage);
8470         if( rc!=SQLITE_OK ){
8471           return rc;
8472         }
8473       }else{
8474         /* The table being dropped does not have the largest root-page
8475         ** number in the database. So move the page that does into the
8476         ** gap left by the deleted root-page.
8477         */
8478         MemPage *pMove;
8479         releasePage(pPage);
8480         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
8481         if( rc!=SQLITE_OK ){
8482           return rc;
8483         }
8484         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
8485         releasePage(pMove);
8486         if( rc!=SQLITE_OK ){
8487           return rc;
8488         }
8489         pMove = 0;
8490         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
8491         freePage(pMove, &rc);
8492         releasePage(pMove);
8493         if( rc!=SQLITE_OK ){
8494           return rc;
8495         }
8496         *piMoved = maxRootPgno;
8497       }
8498 
8499       /* Set the new 'max-root-page' value in the database header. This
8500       ** is the old value less one, less one more if that happens to
8501       ** be a root-page number, less one again if that is the
8502       ** PENDING_BYTE_PAGE.
8503       */
8504       maxRootPgno--;
8505       while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
8506              || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
8507         maxRootPgno--;
8508       }
8509       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
8510 
8511       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
8512     }else{
8513       freePage(pPage, &rc);
8514       releasePage(pPage);
8515     }
8516 #endif
8517   }else{
8518     /* If sqlite3BtreeDropTable was called on page 1.
8519     ** This really never should happen except in a corrupt
8520     ** database.
8521     */
8522     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
8523     releasePage(pPage);
8524   }
8525   return rc;
8526 }
8527 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
8528   int rc;
8529   sqlite3BtreeEnter(p);
8530   rc = btreeDropTable(p, iTable, piMoved);
8531   sqlite3BtreeLeave(p);
8532   return rc;
8533 }
8534 
8535 
8536 /*
8537 ** This function may only be called if the b-tree connection already
8538 ** has a read or write transaction open on the database.
8539 **
8540 ** Read the meta-information out of a database file.  Meta[0]
8541 ** is the number of free pages currently in the database.  Meta[1]
8542 ** through meta[15] are available for use by higher layers.  Meta[0]
8543 ** is read-only, the others are read/write.
8544 **
8545 ** The schema layer numbers meta values differently.  At the schema
8546 ** layer (and the SetCookie and ReadCookie opcodes) the number of
8547 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
8548 **
8549 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
8550 ** of reading the value out of the header, it instead loads the "DataVersion"
8551 ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
8552 ** database file.  It is a number computed by the pager.  But its access
8553 ** pattern is the same as header meta values, and so it is convenient to
8554 ** read it from this routine.
8555 */
8556 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
8557   BtShared *pBt = p->pBt;
8558 
8559   sqlite3BtreeEnter(p);
8560   assert( p->inTrans>TRANS_NONE );
8561   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
8562   assert( pBt->pPage1 );
8563   assert( idx>=0 && idx<=15 );
8564 
8565   if( idx==BTREE_DATA_VERSION ){
8566     *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
8567   }else{
8568     *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
8569   }
8570 
8571   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
8572   ** database, mark the database as read-only.  */
8573 #ifdef SQLITE_OMIT_AUTOVACUUM
8574   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
8575     pBt->btsFlags |= BTS_READ_ONLY;
8576   }
8577 #endif
8578 
8579   sqlite3BtreeLeave(p);
8580 }
8581 
8582 /*
8583 ** Write meta-information back into the database.  Meta[0] is
8584 ** read-only and may not be written.
8585 */
8586 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
8587   BtShared *pBt = p->pBt;
8588   unsigned char *pP1;
8589   int rc;
8590   assert( idx>=1 && idx<=15 );
8591   sqlite3BtreeEnter(p);
8592   assert( p->inTrans==TRANS_WRITE );
8593   assert( pBt->pPage1!=0 );
8594   pP1 = pBt->pPage1->aData;
8595   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
8596   if( rc==SQLITE_OK ){
8597     put4byte(&pP1[36 + idx*4], iMeta);
8598 #ifndef SQLITE_OMIT_AUTOVACUUM
8599     if( idx==BTREE_INCR_VACUUM ){
8600       assert( pBt->autoVacuum || iMeta==0 );
8601       assert( iMeta==0 || iMeta==1 );
8602       pBt->incrVacuum = (u8)iMeta;
8603     }
8604 #endif
8605   }
8606   sqlite3BtreeLeave(p);
8607   return rc;
8608 }
8609 
8610 #ifndef SQLITE_OMIT_BTREECOUNT
8611 /*
8612 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
8613 ** number of entries in the b-tree and write the result to *pnEntry.
8614 **
8615 ** SQLITE_OK is returned if the operation is successfully executed.
8616 ** Otherwise, if an error is encountered (i.e. an IO error or database
8617 ** corruption) an SQLite error code is returned.
8618 */
8619 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
8620   i64 nEntry = 0;                      /* Value to return in *pnEntry */
8621   int rc;                              /* Return code */
8622 
8623   if( pCur->pgnoRoot==0 ){
8624     *pnEntry = 0;
8625     return SQLITE_OK;
8626   }
8627   rc = moveToRoot(pCur);
8628 
8629   /* Unless an error occurs, the following loop runs one iteration for each
8630   ** page in the B-Tree structure (not including overflow pages).
8631   */
8632   while( rc==SQLITE_OK ){
8633     int iIdx;                          /* Index of child node in parent */
8634     MemPage *pPage;                    /* Current page of the b-tree */
8635 
8636     /* If this is a leaf page or the tree is not an int-key tree, then
8637     ** this page contains countable entries. Increment the entry counter
8638     ** accordingly.
8639     */
8640     pPage = pCur->apPage[pCur->iPage];
8641     if( pPage->leaf || !pPage->intKey ){
8642       nEntry += pPage->nCell;
8643     }
8644 
8645     /* pPage is a leaf node. This loop navigates the cursor so that it
8646     ** points to the first interior cell that it points to the parent of
8647     ** the next page in the tree that has not yet been visited. The
8648     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
8649     ** of the page, or to the number of cells in the page if the next page
8650     ** to visit is the right-child of its parent.
8651     **
8652     ** If all pages in the tree have been visited, return SQLITE_OK to the
8653     ** caller.
8654     */
8655     if( pPage->leaf ){
8656       do {
8657         if( pCur->iPage==0 ){
8658           /* All pages of the b-tree have been visited. Return successfully. */
8659           *pnEntry = nEntry;
8660           return moveToRoot(pCur);
8661         }
8662         moveToParent(pCur);
8663       }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
8664 
8665       pCur->aiIdx[pCur->iPage]++;
8666       pPage = pCur->apPage[pCur->iPage];
8667     }
8668 
8669     /* Descend to the child node of the cell that the cursor currently
8670     ** points at. This is the right-child if (iIdx==pPage->nCell).
8671     */
8672     iIdx = pCur->aiIdx[pCur->iPage];
8673     if( iIdx==pPage->nCell ){
8674       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
8675     }else{
8676       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
8677     }
8678   }
8679 
8680   /* An error has occurred. Return an error code. */
8681   return rc;
8682 }
8683 #endif
8684 
8685 /*
8686 ** Return the pager associated with a BTree.  This routine is used for
8687 ** testing and debugging only.
8688 */
8689 Pager *sqlite3BtreePager(Btree *p){
8690   return p->pBt->pPager;
8691 }
8692 
8693 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
8694 /*
8695 ** Append a message to the error message string.
8696 */
8697 static void checkAppendMsg(
8698   IntegrityCk *pCheck,
8699   const char *zFormat,
8700   ...
8701 ){
8702   va_list ap;
8703   char zBuf[200];
8704   if( !pCheck->mxErr ) return;
8705   pCheck->mxErr--;
8706   pCheck->nErr++;
8707   va_start(ap, zFormat);
8708   if( pCheck->errMsg.nChar ){
8709     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
8710   }
8711   if( pCheck->zPfx ){
8712     sqlite3_snprintf(sizeof(zBuf), zBuf, pCheck->zPfx, pCheck->v1, pCheck->v2);
8713     sqlite3StrAccumAppendAll(&pCheck->errMsg, zBuf);
8714   }
8715   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
8716   va_end(ap);
8717   if( pCheck->errMsg.accError==STRACCUM_NOMEM ){
8718     pCheck->mallocFailed = 1;
8719   }
8720 }
8721 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
8722 
8723 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
8724 
8725 /*
8726 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
8727 ** corresponds to page iPg is already set.
8728 */
8729 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
8730   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
8731   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
8732 }
8733 
8734 /*
8735 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
8736 */
8737 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
8738   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
8739   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
8740 }
8741 
8742 
8743 /*
8744 ** Add 1 to the reference count for page iPage.  If this is the second
8745 ** reference to the page, add an error message to pCheck->zErrMsg.
8746 ** Return 1 if there are 2 or more references to the page and 0 if
8747 ** if this is the first reference to the page.
8748 **
8749 ** Also check that the page number is in bounds.
8750 */
8751 static int checkRef(IntegrityCk *pCheck, Pgno iPage){
8752   if( iPage==0 ) return 1;
8753   if( iPage>pCheck->nPage ){
8754     checkAppendMsg(pCheck, "invalid page number %d", iPage);
8755     return 1;
8756   }
8757   if( getPageReferenced(pCheck, iPage) ){
8758     checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
8759     return 1;
8760   }
8761   setPageReferenced(pCheck, iPage);
8762   return 0;
8763 }
8764 
8765 #ifndef SQLITE_OMIT_AUTOVACUUM
8766 /*
8767 ** Check that the entry in the pointer-map for page iChild maps to
8768 ** page iParent, pointer type ptrType. If not, append an error message
8769 ** to pCheck.
8770 */
8771 static void checkPtrmap(
8772   IntegrityCk *pCheck,   /* Integrity check context */
8773   Pgno iChild,           /* Child page number */
8774   u8 eType,              /* Expected pointer map type */
8775   Pgno iParent           /* Expected pointer map parent page number */
8776 ){
8777   int rc;
8778   u8 ePtrmapType;
8779   Pgno iPtrmapParent;
8780 
8781   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
8782   if( rc!=SQLITE_OK ){
8783     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
8784     checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
8785     return;
8786   }
8787 
8788   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
8789     checkAppendMsg(pCheck,
8790       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
8791       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
8792   }
8793 }
8794 #endif
8795 
8796 /*
8797 ** Check the integrity of the freelist or of an overflow page list.
8798 ** Verify that the number of pages on the list is N.
8799 */
8800 static void checkList(
8801   IntegrityCk *pCheck,  /* Integrity checking context */
8802   int isFreeList,       /* True for a freelist.  False for overflow page list */
8803   int iPage,            /* Page number for first page in the list */
8804   int N                 /* Expected number of pages in the list */
8805 ){
8806   int i;
8807   int expected = N;
8808   int iFirst = iPage;
8809   while( N-- > 0 && pCheck->mxErr ){
8810     DbPage *pOvflPage;
8811     unsigned char *pOvflData;
8812     if( iPage<1 ){
8813       checkAppendMsg(pCheck,
8814          "%d of %d pages missing from overflow list starting at %d",
8815           N+1, expected, iFirst);
8816       break;
8817     }
8818     if( checkRef(pCheck, iPage) ) break;
8819     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
8820       checkAppendMsg(pCheck, "failed to get page %d", iPage);
8821       break;
8822     }
8823     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
8824     if( isFreeList ){
8825       int n = get4byte(&pOvflData[4]);
8826 #ifndef SQLITE_OMIT_AUTOVACUUM
8827       if( pCheck->pBt->autoVacuum ){
8828         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
8829       }
8830 #endif
8831       if( n>(int)pCheck->pBt->usableSize/4-2 ){
8832         checkAppendMsg(pCheck,
8833            "freelist leaf count too big on page %d", iPage);
8834         N--;
8835       }else{
8836         for(i=0; i<n; i++){
8837           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
8838 #ifndef SQLITE_OMIT_AUTOVACUUM
8839           if( pCheck->pBt->autoVacuum ){
8840             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
8841           }
8842 #endif
8843           checkRef(pCheck, iFreePage);
8844         }
8845         N -= n;
8846       }
8847     }
8848 #ifndef SQLITE_OMIT_AUTOVACUUM
8849     else{
8850       /* If this database supports auto-vacuum and iPage is not the last
8851       ** page in this overflow list, check that the pointer-map entry for
8852       ** the following page matches iPage.
8853       */
8854       if( pCheck->pBt->autoVacuum && N>0 ){
8855         i = get4byte(pOvflData);
8856         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
8857       }
8858     }
8859 #endif
8860     iPage = get4byte(pOvflData);
8861     sqlite3PagerUnref(pOvflPage);
8862   }
8863 }
8864 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
8865 
8866 /*
8867 ** An implementation of a min-heap.
8868 **
8869 ** aHeap[0] is the number of elements on the heap.  aHeap[1] is the
8870 ** root element.  The daughter nodes of aHeap[N] are aHeap[N*2]
8871 ** and aHeap[N*2+1].
8872 **
8873 ** The heap property is this:  Every node is less than or equal to both
8874 ** of its daughter nodes.  A consequence of the heap property is that the
8875 ** root node aHeap[1] is always the minimum value currently in the heap.
8876 **
8877 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto
8878 ** the heap, preserving the heap property.  The btreeHeapPull() routine
8879 ** removes the root element from the heap (the minimum value in the heap)
8880 ** and then moves other nodes around as necessary to preserve the heap
8881 ** property.
8882 **
8883 ** This heap is used for cell overlap and coverage testing.  Each u32
8884 ** entry represents the span of a cell or freeblock on a btree page.
8885 ** The upper 16 bits are the index of the first byte of a range and the
8886 ** lower 16 bits are the index of the last byte of that range.
8887 */
/*
** Add value x to the min-heap aHeap[].  aHeap[0] holds the element
** count and is incremented.  The new value is placed in the first unused
** slot and then swapped upward for as long as its parent is larger.
** The caller must ensure that aHeap[] has room for one more entry.
*/
static void btreeHeapInsert(u32 *aHeap, u32 x){
  u32 iChild = ++aHeap[0];       /* Slot that currently holds the new value */
  aHeap[iChild] = x;
  while( iChild>1 ){
    u32 iParent = iChild>>1;     /* Parent slot of iChild */
    u32 parentVal = aHeap[iParent];
    if( parentVal<=aHeap[iChild] ) break;
    aHeap[iParent] = aHeap[iChild];
    aHeap[iChild] = parentVal;
    iChild = iParent;
  }
}
/*
** Remove the smallest value from the min-heap aHeap[] and write it into
** *pOut.  Return 1 if a value was removed, or 0 (leaving *pOut unchanged)
** if the heap was empty.
**
** The last element is moved into the root and the slot it vacated is
** overwritten with 0xffffffff.  That sentinel allows the sift-down loop
** to read the right-hand child slot even when only a left-hand child
** exists.
*/
static int btreeHeapPull(u32 *aHeap, u32 *pOut){
  u32 nOld = aHeap[0];           /* Number of elements prior to the pull */
  u32 iParent;                   /* Slot of the value being sifted down */
  u32 iChild;                    /* Slot of the smaller child of iParent */

  if( nOld==0 ) return 0;
  *pOut = aHeap[1];
  aHeap[1] = aHeap[nOld];
  aHeap[nOld] = 0xffffffff;
  aHeap[0]--;
  for(iParent=1; (iChild = iParent*2)<=aHeap[0]; iParent=iChild){
    u32 tmp;
    if( aHeap[iChild]>aHeap[iChild+1] ) iChild++;
    if( aHeap[iParent]<aHeap[iChild] ) break;
    tmp = aHeap[iParent];
    aHeap[iParent] = aHeap[iChild];
    aHeap[iChild] = tmp;
  }
  return 1;
}
8916 
8917 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
8918 /*
8919 ** Do various sanity checks on a single page of a tree.  Return
8920 ** the tree depth.  Root pages return 0.  Parents of root pages
8921 ** return 1, and so forth.
8922 **
8923 ** These checks are done:
8924 **
8925 **      1.  Make sure that cells and freeblocks do not overlap
8926 **          but combine to completely cover the page.
8927 **      2.  Make sure integer cell keys are in order.
8928 **      3.  Check the integrity of overflow pages.
8929 **      4.  Recursively call checkTreePage on all children.
8930 **      5.  Verify that the depth of all children is the same.
8931 */
8932 static int checkTreePage(
8933   IntegrityCk *pCheck,  /* Context for the sanity check */
8934   int iPage,            /* Page number of the page to check */
8935   i64 *piMinKey,        /* Write minimum integer primary key here */
8936   i64 maxKey            /* Error if integer primary key greater than this */
8937 ){
8938   MemPage *pPage = 0;      /* The page being analyzed */
8939   int i;                   /* Loop counter */
8940   int rc;                  /* Result code from subroutine call */
8941   int depth = -1, d2;      /* Depth of a subtree */
8942   int pgno;                /* Page number */
8943   int nFrag;               /* Number of fragmented bytes on the page */
8944   int hdr;                 /* Offset to the page header */
8945   int cellStart;           /* Offset to the start of the cell pointer array */
8946   int nCell;               /* Number of cells */
8947   int doCoverageCheck = 1; /* True if cell coverage checking should be done */
8948   int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
8949                            ** False if IPK must be strictly less than maxKey */
8950   u8 *data;                /* Page content */
8951   u8 *pCell;               /* Cell content */
8952   u8 *pCellIdx;            /* Next element of the cell pointer array */
8953   BtShared *pBt;           /* The BtShared object that owns pPage */
8954   u32 pc;                  /* Address of a cell */
8955   u32 usableSize;          /* Usable size of the page */
8956   u32 contentOffset;       /* Offset to the start of the cell content area */
8957   u32 *heap = 0;           /* Min-heap used for checking cell coverage */
8958   u32 x, prev = 0;         /* Next and previous entry on the min-heap */
8959   const char *saved_zPfx = pCheck->zPfx;
8960   int saved_v1 = pCheck->v1;
8961   int saved_v2 = pCheck->v2;
8962 
8963   /* Check that the page exists
8964   */
8965   pBt = pCheck->pBt;
8966   usableSize = pBt->usableSize;
8967   if( iPage==0 ) return 0;
8968   if( checkRef(pCheck, iPage) ) return 0;
8969   pCheck->zPfx = "Page %d: ";
8970   pCheck->v1 = iPage;
8971   if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
8972     checkAppendMsg(pCheck,
8973        "unable to get the page. error code=%d", rc);
8974     goto end_of_check;
8975   }
8976 
8977   /* Clear MemPage.isInit to make sure the corruption detection code in
8978   ** btreeInitPage() is executed.  */
8979   pPage->isInit = 0;
8980   if( (rc = btreeInitPage(pPage))!=0 ){
8981     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
8982     checkAppendMsg(pCheck,
8983                    "btreeInitPage() returns error code %d", rc);
8984     goto end_of_check;
8985   }
8986   data = pPage->aData;
8987   hdr = pPage->hdrOffset;
8988 
8989   /* Set up for cell analysis */
8990   pCheck->zPfx = "On tree page %d cell %d: ";
8991   contentOffset = get2byteNotZero(&data[hdr+5]);
8992   assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
8993 
8994   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
8995   ** number of cells on the page. */
8996   nCell = get2byte(&data[hdr+3]);
8997   assert( pPage->nCell==nCell );
8998 
8999   /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
9000   ** immediately follows the b-tree page header. */
9001   cellStart = hdr + 12 - 4*pPage->leaf;
9002   assert( pPage->aCellIdx==&data[cellStart] );
9003   pCellIdx = &data[cellStart + 2*(nCell-1)];
9004 
9005   if( !pPage->leaf ){
9006     /* Analyze the right-child page of internal pages */
9007     pgno = get4byte(&data[hdr+8]);
9008 #ifndef SQLITE_OMIT_AUTOVACUUM
9009     if( pBt->autoVacuum ){
9010       pCheck->zPfx = "On page %d at right child: ";
9011       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
9012     }
9013 #endif
9014     depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
9015     keyCanBeEqual = 0;
9016   }else{
9017     /* For leaf pages, the coverage check will occur in the same loop
9018     ** as the other cell checks, so initialize the heap.  */
9019     heap = pCheck->heap;
9020     heap[0] = 0;
9021   }
9022 
9023   /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
9024   ** integer offsets to the cell contents. */
9025   for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
9026     CellInfo info;
9027 
9028     /* Check cell size */
9029     pCheck->v2 = i;
9030     assert( pCellIdx==&data[cellStart + i*2] );
9031     pc = get2byteAligned(pCellIdx);
9032     pCellIdx -= 2;
9033     if( pc<contentOffset || pc>usableSize-4 ){
9034       checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
9035                              pc, contentOffset, usableSize-4);
9036       doCoverageCheck = 0;
9037       continue;
9038     }
9039     pCell = &data[pc];
9040     pPage->xParseCell(pPage, pCell, &info);
9041     if( pc+info.nSize>usableSize ){
9042       checkAppendMsg(pCheck, "Extends off end of page");
9043       doCoverageCheck = 0;
9044       continue;
9045     }
9046 
9047     /* Check for integer primary key out of range */
9048     if( pPage->intKey ){
9049       if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
9050         checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
9051       }
9052       maxKey = info.nKey;
9053     }
9054 
9055     /* Check the content overflow list */
9056     if( info.nPayload>info.nLocal ){
9057       int nPage;       /* Number of pages on the overflow chain */
9058       Pgno pgnoOvfl;   /* First page of the overflow chain */
9059       assert( pc + info.iOverflow <= usableSize );
9060       nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
9061       pgnoOvfl = get4byte(&pCell[info.iOverflow]);
9062 #ifndef SQLITE_OMIT_AUTOVACUUM
9063       if( pBt->autoVacuum ){
9064         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
9065       }
9066 #endif
9067       checkList(pCheck, 0, pgnoOvfl, nPage);
9068     }
9069 
9070     if( !pPage->leaf ){
9071       /* Check sanity of left child page for internal pages */
9072       pgno = get4byte(pCell);
9073 #ifndef SQLITE_OMIT_AUTOVACUUM
9074       if( pBt->autoVacuum ){
9075         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
9076       }
9077 #endif
9078       d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
9079       keyCanBeEqual = 0;
9080       if( d2!=depth ){
9081         checkAppendMsg(pCheck, "Child page depth differs");
9082         depth = d2;
9083       }
9084     }else{
9085       /* Populate the coverage-checking heap for leaf pages */
9086       btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
9087     }
9088   }
9089   *piMinKey = maxKey;
9090 
9091   /* Check for complete coverage of the page
9092   */
9093   pCheck->zPfx = 0;
9094   if( doCoverageCheck && pCheck->mxErr>0 ){
9095     /* For leaf pages, the min-heap has already been initialized and the
9096     ** cells have already been inserted.  But for internal pages, that has
9097     ** not yet been done, so do it now */
9098     if( !pPage->leaf ){
9099       heap = pCheck->heap;
9100       heap[0] = 0;
9101       for(i=nCell-1; i>=0; i--){
9102         u32 size;
9103         pc = get2byteAligned(&data[cellStart+i*2]);
9104         size = pPage->xCellSize(pPage, &data[pc]);
9105         btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
9106       }
9107     }
9108     /* Add the freeblocks to the min-heap
9109     **
9110     ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
9111     ** is the offset of the first freeblock, or zero if there are no
9112     ** freeblocks on the page.
9113     */
9114     i = get2byte(&data[hdr+1]);
9115     while( i>0 ){
9116       int size, j;
9117       assert( (u32)i<=usableSize-4 );     /* Enforced by btreeInitPage() */
9118       size = get2byte(&data[i+2]);
9119       assert( (u32)(i+size)<=usableSize );  /* Enforced by btreeInitPage() */
9120       btreeHeapInsert(heap, (i<<16)|(i+size-1));
9121       /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
9122       ** big-endian integer which is the offset in the b-tree page of the next
9123       ** freeblock in the chain, or zero if the freeblock is the last on the
9124       ** chain. */
9125       j = get2byte(&data[i]);
9126       /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
9127       ** increasing offset. */
9128       assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
9129       assert( (u32)j<=usableSize-4 );   /* Enforced by btreeInitPage() */
9130       i = j;
9131     }
9132     /* Analyze the min-heap looking for overlap between cells and/or
9133     ** freeblocks, and counting the number of untracked bytes in nFrag.
9134     **
9135     ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
9136     ** There is an implied first entry the covers the page header, the cell
9137     ** pointer index, and the gap between the cell pointer index and the start
9138     ** of cell content.
9139     **
9140     ** The loop below pulls entries from the min-heap in order and compares
9141     ** the start_address against the previous end_address.  If there is an
9142     ** overlap, that means bytes are used multiple times.  If there is a gap,
9143     ** that gap is added to the fragmentation count.
9144     */
9145     nFrag = 0;
9146     prev = contentOffset - 1;   /* Implied first min-heap entry */
9147     while( btreeHeapPull(heap,&x) ){
9148       if( (prev&0xffff)>=(x>>16) ){
9149         checkAppendMsg(pCheck,
9150           "Multiple uses for byte %u of page %d", x>>16, iPage);
9151         break;
9152       }else{
9153         nFrag += (x>>16) - (prev&0xffff) - 1;
9154         prev = x;
9155       }
9156     }
9157     nFrag += usableSize - (prev&0xffff) - 1;
9158     /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
9159     ** is stored in the fifth field of the b-tree page header.
9160     ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
9161     ** number of fragmented free bytes within the cell content area.
9162     */
9163     if( heap[0]==0 && nFrag!=data[hdr+7] ){
9164       checkAppendMsg(pCheck,
9165           "Fragmentation of %d bytes reported as %d on page %d",
9166           nFrag, data[hdr+7], iPage);
9167     }
9168   }
9169 
9170 end_of_check:
9171   releasePage(pPage);
9172   pCheck->zPfx = saved_zPfx;
9173   pCheck->v1 = saved_v1;
9174   pCheck->v2 = saved_v2;
9175   return depth+1;
9176 }
9177 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9178 
9179 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9180 /*
9181 ** This routine does a complete check of the given BTree file.  aRoot[] is
9182 ** an array of pages numbers were each page number is the root page of
9183 ** a table.  nRoot is the number of entries in aRoot.
9184 **
9185 ** A read-only or read-write transaction must be opened before calling
9186 ** this function.
9187 **
9188 ** Write the number of error seen in *pnErr.  Except for some memory
9189 ** allocation errors,  an error message held in memory obtained from
9190 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
9191 ** returned.  If a memory allocation error occurs, NULL is returned.
9192 */
9193 char *sqlite3BtreeIntegrityCheck(
9194   Btree *p,     /* The btree to be checked */
9195   int *aRoot,   /* An array of root pages numbers for individual trees */
9196   int nRoot,    /* Number of entries in aRoot[] */
9197   int mxErr,    /* Stop reporting errors after this many */
9198   int *pnErr    /* Write number of errors seen to this variable */
9199 ){
9200   Pgno i;
9201   IntegrityCk sCheck;
9202   BtShared *pBt = p->pBt;
9203   int savedDbFlags = pBt->db->flags;
9204   char zErr[100];
9205   VVA_ONLY( int nRef );
9206 
9207   sqlite3BtreeEnter(p);
9208   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
9209   assert( (nRef = sqlite3PagerRefcount(pBt->pPager))>=0 );
9210   sCheck.pBt = pBt;
9211   sCheck.pPager = pBt->pPager;
9212   sCheck.nPage = btreePagecount(sCheck.pBt);
9213   sCheck.mxErr = mxErr;
9214   sCheck.nErr = 0;
9215   sCheck.mallocFailed = 0;
9216   sCheck.zPfx = 0;
9217   sCheck.v1 = 0;
9218   sCheck.v2 = 0;
9219   sCheck.aPgRef = 0;
9220   sCheck.heap = 0;
9221   sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
9222   if( sCheck.nPage==0 ){
9223     goto integrity_ck_cleanup;
9224   }
9225 
9226   sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
9227   if( !sCheck.aPgRef ){
9228     sCheck.mallocFailed = 1;
9229     goto integrity_ck_cleanup;
9230   }
9231   sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
9232   if( sCheck.heap==0 ){
9233     sCheck.mallocFailed = 1;
9234     goto integrity_ck_cleanup;
9235   }
9236 
9237   i = PENDING_BYTE_PAGE(pBt);
9238   if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
9239 
9240   /* Check the integrity of the freelist
9241   */
9242   sCheck.zPfx = "Main freelist: ";
9243   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
9244             get4byte(&pBt->pPage1->aData[36]));
9245   sCheck.zPfx = 0;
9246 
9247   /* Check all the tables.
9248   */
9249   testcase( pBt->db->flags & SQLITE_CellSizeCk );
9250   pBt->db->flags &= ~SQLITE_CellSizeCk;
9251   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
9252     i64 notUsed;
9253     if( aRoot[i]==0 ) continue;
9254 #ifndef SQLITE_OMIT_AUTOVACUUM
9255     if( pBt->autoVacuum && aRoot[i]>1 ){
9256       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
9257     }
9258 #endif
9259     checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
9260   }
9261   pBt->db->flags = savedDbFlags;
9262 
9263   /* Make sure every page in the file is referenced
9264   */
9265   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
9266 #ifdef SQLITE_OMIT_AUTOVACUUM
9267     if( getPageReferenced(&sCheck, i)==0 ){
9268       checkAppendMsg(&sCheck, "Page %d is never used", i);
9269     }
9270 #else
9271     /* If the database supports auto-vacuum, make sure no tables contain
9272     ** references to pointer-map pages.
9273     */
9274     if( getPageReferenced(&sCheck, i)==0 &&
9275        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
9276       checkAppendMsg(&sCheck, "Page %d is never used", i);
9277     }
9278     if( getPageReferenced(&sCheck, i)!=0 &&
9279        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
9280       checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
9281     }
9282 #endif
9283   }
9284 
9285   /* Clean  up and report errors.
9286   */
9287 integrity_ck_cleanup:
9288   sqlite3PageFree(sCheck.heap);
9289   sqlite3_free(sCheck.aPgRef);
9290   if( sCheck.mallocFailed ){
9291     sqlite3StrAccumReset(&sCheck.errMsg);
9292     sCheck.nErr++;
9293   }
9294   *pnErr = sCheck.nErr;
9295   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
9296   /* Make sure this analysis did not leave any unref() pages. */
9297   assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
9298   sqlite3BtreeLeave(p);
9299   return sqlite3StrAccumFinish(&sCheck.errMsg);
9300 }
9301 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9302 
9303 /*
9304 ** Return the full pathname of the underlying database file.  Return
9305 ** an empty string if the database is in-memory or a TEMP database.
9306 **
9307 ** The pager filename is invariant as long as the pager is
9308 ** open so it is safe to access without the BtShared mutex.
9309 */
9310 const char *sqlite3BtreeGetFilename(Btree *p){
9311   assert( p->pBt->pPager!=0 );
9312   return sqlite3PagerFilename(p->pBt->pPager, 1);
9313 }
9314 
9315 /*
9316 ** Return the pathname of the journal file for this database. The return
9317 ** value of this routine is the same regardless of whether the journal file
9318 ** has been created or not.
9319 **
9320 ** The pager journal filename is invariant as long as the pager is
9321 ** open so it is safe to access without the BtShared mutex.
9322 */
9323 const char *sqlite3BtreeGetJournalname(Btree *p){
9324   assert( p->pBt->pPager!=0 );
9325   return sqlite3PagerJournalname(p->pBt->pPager);
9326 }
9327 
9328 /*
9329 ** Return non-zero if a transaction is active.
9330 */
9331 int sqlite3BtreeIsInTrans(Btree *p){
9332   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
9333   return (p && (p->inTrans==TRANS_WRITE));
9334 }
9335 
9336 #ifndef SQLITE_OMIT_WAL
9337 /*
9338 ** Run a checkpoint on the Btree passed as the first argument.
9339 **
9340 ** Return SQLITE_LOCKED if this or any other connection has an open
9341 ** transaction on the shared-cache the argument Btree is connected to.
9342 **
9343 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
9344 */
9345 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
9346   int rc = SQLITE_OK;
9347   if( p ){
9348     BtShared *pBt = p->pBt;
9349     sqlite3BtreeEnter(p);
9350     if( pBt->inTransaction!=TRANS_NONE ){
9351       rc = SQLITE_LOCKED;
9352     }else{
9353       rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt);
9354     }
9355     sqlite3BtreeLeave(p);
9356   }
9357   return rc;
9358 }
9359 #endif
9360 
9361 /*
9362 ** Return non-zero if a read (or write) transaction is active.
9363 */
9364 int sqlite3BtreeIsInReadTrans(Btree *p){
9365   assert( p );
9366   assert( sqlite3_mutex_held(p->db->mutex) );
9367   return p->inTrans!=TRANS_NONE;
9368 }
9369 
9370 int sqlite3BtreeIsInBackup(Btree *p){
9371   assert( p );
9372   assert( sqlite3_mutex_held(p->db->mutex) );
9373   return p->nBackup!=0;
9374 }
9375 
9376 /*
9377 ** This function returns a pointer to a blob of memory associated with
9378 ** a single shared-btree. The memory is used by client code for its own
9379 ** purposes (for example, to store a high-level schema associated with
9380 ** the shared-btree). The btree layer manages reference counting issues.
9381 **
9382 ** The first time this is called on a shared-btree, nBytes bytes of memory
9383 ** are allocated, zeroed, and returned to the caller. For each subsequent
9384 ** call the nBytes parameter is ignored and a pointer to the same blob
9385 ** of memory returned.
9386 **
9387 ** If the nBytes parameter is 0 and the blob of memory has not yet been
9388 ** allocated, a null pointer is returned. If the blob has already been
9389 ** allocated, it is returned as normal.
9390 **
9391 ** Just before the shared-btree is closed, the function passed as the
9392 ** xFree argument when the memory allocation was made is invoked on the
9393 ** blob of allocated memory. The xFree function should not call sqlite3_free()
9394 ** on the memory, the btree layer does that.
9395 */
9396 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
9397   BtShared *pBt = p->pBt;
9398   sqlite3BtreeEnter(p);
9399   if( !pBt->pSchema && nBytes ){
9400     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
9401     pBt->xFreeSchema = xFree;
9402   }
9403   sqlite3BtreeLeave(p);
9404   return pBt->pSchema;
9405 }
9406 
9407 /*
9408 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
9409 ** btree as the argument handle holds an exclusive lock on the
9410 ** sqlite_master table. Otherwise SQLITE_OK.
9411 */
9412 int sqlite3BtreeSchemaLocked(Btree *p){
9413   int rc;
9414   assert( sqlite3_mutex_held(p->db->mutex) );
9415   sqlite3BtreeEnter(p);
9416   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
9417   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
9418   sqlite3BtreeLeave(p);
9419   return rc;
9420 }
9421 
9422 
9423 #ifndef SQLITE_OMIT_SHARED_CACHE
9424 /*
9425 ** Obtain a lock on the table whose root page is iTab.  The
9426 ** lock is a write lock if isWritelock is true or a read lock
9427 ** if it is false.
9428 */
9429 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
9430   int rc = SQLITE_OK;
9431   assert( p->inTrans!=TRANS_NONE );
9432   if( p->sharable ){
9433     u8 lockType = READ_LOCK + isWriteLock;
9434     assert( READ_LOCK+1==WRITE_LOCK );
9435     assert( isWriteLock==0 || isWriteLock==1 );
9436 
9437     sqlite3BtreeEnter(p);
9438     rc = querySharedCacheTableLock(p, iTab, lockType);
9439     if( rc==SQLITE_OK ){
9440       rc = setSharedCacheTableLock(p, iTab, lockType);
9441     }
9442     sqlite3BtreeLeave(p);
9443   }
9444   return rc;
9445 }
9446 #endif
9447 
9448 #ifndef SQLITE_OMIT_INCRBLOB
9449 /*
9450 ** Argument pCsr must be a cursor opened for writing on an
9451 ** INTKEY table currently pointing at a valid table entry.
9452 ** This function modifies the data stored as part of that entry.
9453 **
9454 ** Only the data content may only be modified, it is not possible to
9455 ** change the length of the data stored. If this function is called with
9456 ** parameters that attempt to write past the end of the existing data,
9457 ** no modifications are made and SQLITE_CORRUPT is returned.
9458 */
9459 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
9460   int rc;
9461   assert( cursorHoldsMutex(pCsr) );
9462   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
9463   assert( pCsr->curFlags & BTCF_Incrblob );
9464 
9465   rc = restoreCursorPosition(pCsr);
9466   if( rc!=SQLITE_OK ){
9467     return rc;
9468   }
9469   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
9470   if( pCsr->eState!=CURSOR_VALID ){
9471     return SQLITE_ABORT;
9472   }
9473 
9474   /* Save the positions of all other cursors open on this table. This is
9475   ** required in case any of them are holding references to an xFetch
9476   ** version of the b-tree page modified by the accessPayload call below.
9477   **
9478   ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
9479   ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
9480   ** saveAllCursors can only return SQLITE_OK.
9481   */
9482   VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
9483   assert( rc==SQLITE_OK );
9484 
9485   /* Check some assumptions:
9486   **   (a) the cursor is open for writing,
9487   **   (b) there is a read/write transaction open,
9488   **   (c) the connection holds a write-lock on the table (if required),
9489   **   (d) there are no conflicting read-locks, and
9490   **   (e) the cursor points at a valid row of an intKey table.
9491   */
9492   if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
9493     return SQLITE_READONLY;
9494   }
9495   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
9496               && pCsr->pBt->inTransaction==TRANS_WRITE );
9497   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
9498   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
9499   assert( pCsr->apPage[pCsr->iPage]->intKey );
9500 
9501   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
9502 }
9503 
9504 /*
9505 ** Mark this cursor as an incremental blob cursor.
9506 */
9507 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
9508   pCur->curFlags |= BTCF_Incrblob;
9509   pCur->pBtree->hasIncrblobCur = 1;
9510 }
9511 #endif
9512 
9513 /*
9514 ** Set both the "read version" (single byte at byte offset 18) and
9515 ** "write version" (single byte at byte offset 19) fields in the database
9516 ** header to iVersion.
9517 */
9518 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
9519   BtShared *pBt = pBtree->pBt;
9520   int rc;                         /* Return code */
9521 
9522   assert( iVersion==1 || iVersion==2 );
9523 
9524   /* If setting the version fields to 1, do not automatically open the
9525   ** WAL connection, even if the version fields are currently set to 2.
9526   */
9527   pBt->btsFlags &= ~BTS_NO_WAL;
9528   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
9529 
9530   rc = sqlite3BtreeBeginTrans(pBtree, 0);
9531   if( rc==SQLITE_OK ){
9532     u8 *aData = pBt->pPage1->aData;
9533     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
9534       rc = sqlite3BtreeBeginTrans(pBtree, 2);
9535       if( rc==SQLITE_OK ){
9536         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
9537         if( rc==SQLITE_OK ){
9538           aData[18] = (u8)iVersion;
9539           aData[19] = (u8)iVersion;
9540         }
9541       }
9542     }
9543   }
9544 
9545   pBt->btsFlags &= ~BTS_NO_WAL;
9546   return rc;
9547 }
9548 
9549 /*
9550 ** set the mask of hint flags for cursor pCsr.
9551 */
9552 void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){
9553   assert( mask==BTREE_BULKLOAD || mask==BTREE_SEEK_EQ || mask==0 );
9554   pCsr->hints = mask;
9555 }
9556 
9557 #ifdef SQLITE_DEBUG
9558 /*
9559 ** Return true if the cursor has a hint specified.  This routine is
9560 ** only used from within assert() statements
9561 */
9562 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
9563   return (pCsr->hints & mask)!=0;
9564 }
9565 #endif
9566 
9567 /*
9568 ** Return true if the given Btree is read-only.
9569 */
9570 int sqlite3BtreeIsReadonly(Btree *p){
9571   return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
9572 }
9573 
9574 /*
9575 ** Return the size of the header added to each page by this module.
9576 */
9577 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
9578