xref: /sqlite-3.40.0/src/btree.c (revision 79d5bc80)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements an external (disk-based) database using BTrees.
13 ** See the header comment on "btreeInt.h" for additional information.
14 ** Including a description of file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
18 /*
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
21 */
22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
24 /*
25 ** Set this global variable to 1 to enable tracing using the TRACE
26 ** macro.
27 */
28 #if 0
29 int sqlite3BtreeTrace=1;  /* True to enable tracing */
30 # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
31 #else
32 # define TRACE(X)
33 #endif
34 
35 /*
36 ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
37 ** But if the value is zero, make it 65536.
38 **
39 ** This routine is used to extract the "offset to cell content area" value
40 ** from the header of a btree page.  If the page size is 65536 and the page
41 ** is empty, the offset should be 65536, but the 2-byte value stores zero.
42 ** This routine makes the necessary adjustment to 65536.
43 */
44 #define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)
45 
46 /*
47 ** Values passed as the 5th argument to allocateBtreePage()
48 */
49 #define BTALLOC_ANY   0           /* Allocate any page */
50 #define BTALLOC_EXACT 1           /* Allocate exact page if possible */
51 #define BTALLOC_LE    2           /* Allocate any page <= the parameter */
52 
53 /*
54 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not
55 ** defined, or 0 if it is. For example:
56 **
57 **   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
58 */
59 #ifndef SQLITE_OMIT_AUTOVACUUM
60 #define IfNotOmitAV(expr) (expr)
61 #else
62 #define IfNotOmitAV(expr) 0
63 #endif
64 
65 #ifndef SQLITE_OMIT_SHARED_CACHE
66 /*
67 ** A list of BtShared objects that are eligible for participation
68 ** in shared cache.  This variable has file scope during normal builds,
69 ** but the test harness needs to access it so we make it global for
70 ** test builds.
71 **
72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
73 */
74 #ifdef SQLITE_TEST
75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
76 #else
77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
78 #endif
79 #endif /* SQLITE_OMIT_SHARED_CACHE */
80 
81 #ifndef SQLITE_OMIT_SHARED_CACHE
82 /*
83 ** Enable or disable the shared pager and schema features.
84 **
85 ** This routine has no effect on existing database connections.
86 ** The shared cache setting effects only future calls to
87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
88 */
89 int sqlite3_enable_shared_cache(int enable){
90   sqlite3GlobalConfig.sharedCacheEnabled = enable;
91   return SQLITE_OK;
92 }
93 #endif
94 
95 
96 
97 #ifdef SQLITE_OMIT_SHARED_CACHE
98   /*
99   ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
100   ** and clearAllSharedCacheTableLocks()
101   ** manipulate entries in the BtShared.pLock linked list used to store
102   ** shared-cache table level locks. If the library is compiled with the
103   ** shared-cache feature disabled, then there is only ever one user
104   ** of each BtShared structure and so this locking is not necessary.
105   ** So define the lock related functions as no-ops.
106   */
107   #define querySharedCacheTableLock(a,b,c) SQLITE_OK
108   #define setSharedCacheTableLock(a,b,c) SQLITE_OK
109   #define clearAllSharedCacheTableLocks(a)
110   #define downgradeAllSharedCacheTableLocks(a)
111   #define hasSharedCacheTableLock(a,b,c,d) 1
112   #define hasReadConflicts(a, b) 0
113 #endif
114 
115 /*
116 ** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single
117 ** (MemPage*) as an argument. The (MemPage*) must not be NULL.
118 **
119 ** If SQLITE_DEBUG is not defined, then this macro is equivalent to
120 ** SQLITE_CORRUPT_BKPT. Or, if SQLITE_DEBUG is set, then the log message
121 ** normally produced as a side-effect of SQLITE_CORRUPT_BKPT is augmented
122 ** with the page number and filename associated with the (MemPage*).
123 */
124 #ifdef SQLITE_DEBUG
125 int corruptPageError(int lineno, MemPage *p){
126   char *zMsg;
127   sqlite3BeginBenignMalloc();
128   zMsg = sqlite3_mprintf("database corruption page %d of %s",
129       (int)p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0)
130   );
131   sqlite3EndBenignMalloc();
132   if( zMsg ){
133     sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg);
134   }
135   sqlite3_free(zMsg);
136   return SQLITE_CORRUPT_BKPT;
137 }
138 # define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, pMemPage)
139 #else
140 # define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO(pMemPage->pgno)
141 #endif
142 
143 #ifndef SQLITE_OMIT_SHARED_CACHE
144 
145 #ifdef SQLITE_DEBUG
146 /*
147 **** This function is only used as part of an assert() statement. ***
148 **
149 ** Check to see if pBtree holds the required locks to read or write to the
150 ** table with root page iRoot.   Return 1 if it does and 0 if not.
151 **
152 ** For example, when writing to a table with root-page iRoot via
153 ** Btree connection pBtree:
154 **
155 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
156 **
157 ** When writing to an index that resides in a sharable database, the
158 ** caller should have first obtained a lock specifying the root page of
159 ** the corresponding table. This makes things a bit more complicated,
160 ** as this module treats each table as a separate structure. To determine
161 ** the table corresponding to the index being written, this
162 ** function has to search through the database schema.
163 **
164 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
165 ** hold a write-lock on the schema table (root page 1). This is also
166 ** acceptable.
167 */
168 static int hasSharedCacheTableLock(
169   Btree *pBtree,         /* Handle that must hold lock */
170   Pgno iRoot,            /* Root page of b-tree */
171   int isIndex,           /* True if iRoot is the root of an index b-tree */
172   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
173 ){
174   Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
175   Pgno iTab = 0;
176   BtLock *pLock;
177 
178   /* If this database is not shareable, or if the client is reading
179   ** and has the read-uncommitted flag set, then no lock is required.
180   ** Return true immediately.
181   */
182   if( (pBtree->sharable==0)
183    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit))
184   ){
185     return 1;
186   }
187 
188   /* If the client is reading  or writing an index and the schema is
189   ** not loaded, then it is too difficult to actually check to see if
190   ** the correct locks are held.  So do not bother - just return true.
191   ** This case does not come up very often anyhow.
192   */
193   if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
194     return 1;
195   }
196 
197   /* Figure out the root-page that the lock should be held on. For table
198   ** b-trees, this is just the root page of the b-tree being read or
199   ** written. For index b-trees, it is the root page of the associated
200   ** table.  */
201   if( isIndex ){
202     HashElem *p;
203     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
204       Index *pIdx = (Index *)sqliteHashData(p);
205       if( pIdx->tnum==(int)iRoot ){
206         if( iTab ){
207           /* Two or more indexes share the same root page.  There must
208           ** be imposter tables.  So just return true.  The assert is not
209           ** useful in that case. */
210           return 1;
211         }
212         iTab = pIdx->pTable->tnum;
213       }
214     }
215   }else{
216     iTab = iRoot;
217   }
218 
219   /* Search for the required lock. Either a write-lock on root-page iTab, a
220   ** write-lock on the schema table, or (if the client is reading) a
221   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
222   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
223     if( pLock->pBtree==pBtree
224      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
225      && pLock->eLock>=eLockType
226     ){
227       return 1;
228     }
229   }
230 
231   /* Failed to find the required lock. */
232   return 0;
233 }
234 #endif /* SQLITE_DEBUG */
235 
236 #ifdef SQLITE_DEBUG
237 /*
238 **** This function may be used as part of assert() statements only. ****
239 **
240 ** Return true if it would be illegal for pBtree to write into the
241 ** table or index rooted at iRoot because other shared connections are
242 ** simultaneously reading that same table or index.
243 **
244 ** It is illegal for pBtree to write if some other Btree object that
245 ** shares the same BtShared object is currently reading or writing
246 ** the iRoot table.  Except, if the other Btree object has the
247 ** read-uncommitted flag set, then it is OK for the other object to
248 ** have a read cursor.
249 **
250 ** For example, before writing to any part of the table or index
251 ** rooted at page iRoot, one should call:
252 **
253 **    assert( !hasReadConflicts(pBtree, iRoot) );
254 */
255 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
256   BtCursor *p;
257   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
258     if( p->pgnoRoot==iRoot
259      && p->pBtree!=pBtree
260      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit)
261     ){
262       return 1;
263     }
264   }
265   return 0;
266 }
267 #endif    /* #ifdef SQLITE_DEBUG */
268 
269 /*
270 ** Query to see if Btree handle p may obtain a lock of type eLock
271 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
272 ** SQLITE_OK if the lock may be obtained (by calling
273 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
274 */
275 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
276   BtShared *pBt = p->pBt;
277   BtLock *pIter;
278 
279   assert( sqlite3BtreeHoldsMutex(p) );
280   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
281   assert( p->db!=0 );
282   assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 );
283 
284   /* If requesting a write-lock, then the Btree must have an open write
285   ** transaction on this file. And, obviously, for this to be so there
286   ** must be an open write transaction on the file itself.
287   */
288   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
289   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
290 
291   /* This routine is a no-op if the shared-cache is not enabled */
292   if( !p->sharable ){
293     return SQLITE_OK;
294   }
295 
296   /* If some other connection is holding an exclusive lock, the
297   ** requested lock may not be obtained.
298   */
299   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
300     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
301     return SQLITE_LOCKED_SHAREDCACHE;
302   }
303 
304   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
305     /* The condition (pIter->eLock!=eLock) in the following if(...)
306     ** statement is a simplification of:
307     **
308     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
309     **
310     ** since we know that if eLock==WRITE_LOCK, then no other connection
311     ** may hold a WRITE_LOCK on any table in this file (since there can
312     ** only be a single writer).
313     */
314     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
315     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
316     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
317       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
318       if( eLock==WRITE_LOCK ){
319         assert( p==pBt->pWriter );
320         pBt->btsFlags |= BTS_PENDING;
321       }
322       return SQLITE_LOCKED_SHAREDCACHE;
323     }
324   }
325   return SQLITE_OK;
326 }
327 #endif /* !SQLITE_OMIT_SHARED_CACHE */
328 
329 #ifndef SQLITE_OMIT_SHARED_CACHE
330 /*
331 ** Add a lock on the table with root-page iTable to the shared-btree used
332 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
333 ** WRITE_LOCK.
334 **
335 ** This function assumes the following:
336 **
337 **   (a) The specified Btree object p is connected to a sharable
338 **       database (one with the BtShared.sharable flag set), and
339 **
340 **   (b) No other Btree objects hold a lock that conflicts
341 **       with the requested lock (i.e. querySharedCacheTableLock() has
342 **       already been called and returned SQLITE_OK).
343 **
344 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
345 ** is returned if a malloc attempt fails.
346 */
347 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
348   BtShared *pBt = p->pBt;
349   BtLock *pLock = 0;
350   BtLock *pIter;
351 
352   assert( sqlite3BtreeHoldsMutex(p) );
353   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
354   assert( p->db!=0 );
355 
356   /* A connection with the read-uncommitted flag set will never try to
357   ** obtain a read-lock using this function. The only read-lock obtained
358   ** by a connection in read-uncommitted mode is on the sqlite_master
359   ** table, and that lock is obtained in BtreeBeginTrans().  */
360   assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK );
361 
362   /* This function should only be called on a sharable b-tree after it
363   ** has been determined that no other b-tree holds a conflicting lock.  */
364   assert( p->sharable );
365   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
366 
367   /* First search the list for an existing lock on this table. */
368   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
369     if( pIter->iTable==iTable && pIter->pBtree==p ){
370       pLock = pIter;
371       break;
372     }
373   }
374 
375   /* If the above search did not find a BtLock struct associating Btree p
376   ** with table iTable, allocate one and link it into the list.
377   */
378   if( !pLock ){
379     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
380     if( !pLock ){
381       return SQLITE_NOMEM_BKPT;
382     }
383     pLock->iTable = iTable;
384     pLock->pBtree = p;
385     pLock->pNext = pBt->pLock;
386     pBt->pLock = pLock;
387   }
388 
389   /* Set the BtLock.eLock variable to the maximum of the current lock
390   ** and the requested lock. This means if a write-lock was already held
391   ** and a read-lock requested, we don't incorrectly downgrade the lock.
392   */
393   assert( WRITE_LOCK>READ_LOCK );
394   if( eLock>pLock->eLock ){
395     pLock->eLock = eLock;
396   }
397 
398   return SQLITE_OK;
399 }
400 #endif /* !SQLITE_OMIT_SHARED_CACHE */
401 
402 #ifndef SQLITE_OMIT_SHARED_CACHE
403 /*
404 ** Release all the table locks (locks obtained via calls to
405 ** the setSharedCacheTableLock() procedure) held by Btree object p.
406 **
407 ** This function assumes that Btree p has an open read or write
408 ** transaction. If it does not, then the BTS_PENDING flag
409 ** may be incorrectly cleared.
410 */
411 static void clearAllSharedCacheTableLocks(Btree *p){
412   BtShared *pBt = p->pBt;
413   BtLock **ppIter = &pBt->pLock;
414 
415   assert( sqlite3BtreeHoldsMutex(p) );
416   assert( p->sharable || 0==*ppIter );
417   assert( p->inTrans>0 );
418 
419   while( *ppIter ){
420     BtLock *pLock = *ppIter;
421     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
422     assert( pLock->pBtree->inTrans>=pLock->eLock );
423     if( pLock->pBtree==p ){
424       *ppIter = pLock->pNext;
425       assert( pLock->iTable!=1 || pLock==&p->lock );
426       if( pLock->iTable!=1 ){
427         sqlite3_free(pLock);
428       }
429     }else{
430       ppIter = &pLock->pNext;
431     }
432   }
433 
434   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
435   if( pBt->pWriter==p ){
436     pBt->pWriter = 0;
437     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
438   }else if( pBt->nTransaction==2 ){
439     /* This function is called when Btree p is concluding its
440     ** transaction. If there currently exists a writer, and p is not
441     ** that writer, then the number of locks held by connections other
442     ** than the writer must be about to drop to zero. In this case
443     ** set the BTS_PENDING flag to 0.
444     **
445     ** If there is not currently a writer, then BTS_PENDING must
446     ** be zero already. So this next line is harmless in that case.
447     */
448     pBt->btsFlags &= ~BTS_PENDING;
449   }
450 }
451 
452 /*
453 ** This function changes all write-locks held by Btree p into read-locks.
454 */
455 static void downgradeAllSharedCacheTableLocks(Btree *p){
456   BtShared *pBt = p->pBt;
457   if( pBt->pWriter==p ){
458     BtLock *pLock;
459     pBt->pWriter = 0;
460     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
461     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
462       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
463       pLock->eLock = READ_LOCK;
464     }
465   }
466 }
467 
468 #endif /* SQLITE_OMIT_SHARED_CACHE */
469 
470 static void releasePage(MemPage *pPage);         /* Forward reference */
471 static void releasePageOne(MemPage *pPage);      /* Forward reference */
472 static void releasePageNotNull(MemPage *pPage);  /* Forward reference */
473 
474 /*
475 ***** This routine is used inside of assert() only ****
476 **
477 ** Verify that the cursor holds the mutex on its BtShared
478 */
479 #ifdef SQLITE_DEBUG
480 static int cursorHoldsMutex(BtCursor *p){
481   return sqlite3_mutex_held(p->pBt->mutex);
482 }
483 
484 /* Verify that the cursor and the BtShared agree about what is the current
485 ** database connetion. This is important in shared-cache mode. If the database
486 ** connection pointers get out-of-sync, it is possible for routines like
487 ** btreeInitPage() to reference an stale connection pointer that references a
488 ** a connection that has already closed.  This routine is used inside assert()
489 ** statements only and for the purpose of double-checking that the btree code
490 ** does keep the database connection pointers up-to-date.
491 */
492 static int cursorOwnsBtShared(BtCursor *p){
493   assert( cursorHoldsMutex(p) );
494   return (p->pBtree->db==p->pBt->db);
495 }
496 #endif
497 
498 /*
499 ** Invalidate the overflow cache of the cursor passed as the first argument.
500 ** on the shared btree structure pBt.
501 */
502 #define invalidateOverflowCache(pCur) (pCur->curFlags &= ~BTCF_ValidOvfl)
503 
504 /*
505 ** Invalidate the overflow page-list cache for all cursors opened
506 ** on the shared btree structure pBt.
507 */
508 static void invalidateAllOverflowCache(BtShared *pBt){
509   BtCursor *p;
510   assert( sqlite3_mutex_held(pBt->mutex) );
511   for(p=pBt->pCursor; p; p=p->pNext){
512     invalidateOverflowCache(p);
513   }
514 }
515 
516 #ifndef SQLITE_OMIT_INCRBLOB
517 /*
518 ** This function is called before modifying the contents of a table
519 ** to invalidate any incrblob cursors that are open on the
520 ** row or one of the rows being modified.
521 **
522 ** If argument isClearTable is true, then the entire contents of the
523 ** table is about to be deleted. In this case invalidate all incrblob
524 ** cursors open on any row within the table with root-page pgnoRoot.
525 **
526 ** Otherwise, if argument isClearTable is false, then the row with
527 ** rowid iRow is being replaced or deleted. In this case invalidate
528 ** only those incrblob cursors open on that specific row.
529 */
530 static void invalidateIncrblobCursors(
531   Btree *pBtree,          /* The database file to check */
532   Pgno pgnoRoot,          /* The table that might be changing */
533   i64 iRow,               /* The rowid that might be changing */
534   int isClearTable        /* True if all rows are being deleted */
535 ){
536   BtCursor *p;
537   if( pBtree->hasIncrblobCur==0 ) return;
538   assert( sqlite3BtreeHoldsMutex(pBtree) );
539   pBtree->hasIncrblobCur = 0;
540   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
541     if( (p->curFlags & BTCF_Incrblob)!=0 ){
542       pBtree->hasIncrblobCur = 1;
543       if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){
544         p->eState = CURSOR_INVALID;
545       }
546     }
547   }
548 }
549 
550 #else
551   /* Stub function when INCRBLOB is omitted */
552   #define invalidateIncrblobCursors(w,x,y,z)
553 #endif /* SQLITE_OMIT_INCRBLOB */
554 
555 /*
556 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
557 ** when a page that previously contained data becomes a free-list leaf
558 ** page.
559 **
560 ** The BtShared.pHasContent bitvec exists to work around an obscure
561 ** bug caused by the interaction of two useful IO optimizations surrounding
562 ** free-list leaf pages:
563 **
564 **   1) When all data is deleted from a page and the page becomes
565 **      a free-list leaf page, the page is not written to the database
566 **      (as free-list leaf pages contain no meaningful data). Sometimes
567 **      such a page is not even journalled (as it will not be modified,
568 **      why bother journalling it?).
569 **
570 **   2) When a free-list leaf page is reused, its content is not read
571 **      from the database or written to the journal file (why should it
572 **      be, if it is not at all meaningful?).
573 **
574 ** By themselves, these optimizations work fine and provide a handy
575 ** performance boost to bulk delete or insert operations. However, if
576 ** a page is moved to the free-list and then reused within the same
577 ** transaction, a problem comes up. If the page is not journalled when
578 ** it is moved to the free-list and it is also not journalled when it
579 ** is extracted from the free-list and reused, then the original data
580 ** may be lost. In the event of a rollback, it may not be possible
581 ** to restore the database to its original configuration.
582 **
583 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
584 ** moved to become a free-list leaf page, the corresponding bit is
585 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
586 ** optimization 2 above is omitted if the corresponding bit is already
587 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
588 ** at the end of every transaction.
589 */
590 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
591   int rc = SQLITE_OK;
592   if( !pBt->pHasContent ){
593     assert( pgno<=pBt->nPage );
594     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
595     if( !pBt->pHasContent ){
596       rc = SQLITE_NOMEM_BKPT;
597     }
598   }
599   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
600     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
601   }
602   return rc;
603 }
604 
605 /*
606 ** Query the BtShared.pHasContent vector.
607 **
608 ** This function is called when a free-list leaf page is removed from the
609 ** free-list for reuse. It returns false if it is safe to retrieve the
610 ** page from the pager layer with the 'no-content' flag set. True otherwise.
611 */
612 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
613   Bitvec *p = pBt->pHasContent;
614   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
615 }
616 
617 /*
618 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
619 ** invoked at the conclusion of each write-transaction.
620 */
621 static void btreeClearHasContent(BtShared *pBt){
622   sqlite3BitvecDestroy(pBt->pHasContent);
623   pBt->pHasContent = 0;
624 }
625 
626 /*
627 ** Release all of the apPage[] pages for a cursor.
628 */
629 static void btreeReleaseAllCursorPages(BtCursor *pCur){
630   int i;
631   if( pCur->iPage>=0 ){
632     for(i=0; i<pCur->iPage; i++){
633       releasePageNotNull(pCur->apPage[i]);
634     }
635     releasePageNotNull(pCur->pPage);
636     pCur->iPage = -1;
637   }
638 }
639 
640 /*
641 ** The cursor passed as the only argument must point to a valid entry
642 ** when this function is called (i.e. have eState==CURSOR_VALID). This
643 ** function saves the current cursor key in variables pCur->nKey and
644 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error
645 ** code otherwise.
646 **
647 ** If the cursor is open on an intkey table, then the integer key
648 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
649 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is
650 ** set to point to a malloced buffer pCur->nKey bytes in size containing
651 ** the key.
652 */
653 static int saveCursorKey(BtCursor *pCur){
654   int rc = SQLITE_OK;
655   assert( CURSOR_VALID==pCur->eState );
656   assert( 0==pCur->pKey );
657   assert( cursorHoldsMutex(pCur) );
658 
659   if( pCur->curIntKey ){
660     /* Only the rowid is required for a table btree */
661     pCur->nKey = sqlite3BtreeIntegerKey(pCur);
662   }else{
663     /* For an index btree, save the complete key content. It is possible
664     ** that the current key is corrupt. In that case, it is possible that
665     ** the sqlite3VdbeRecordUnpack() function may overread the buffer by
666     ** up to the size of 1 varint plus 1 8-byte value when the cursor
667     ** position is restored. Hence the 17 bytes of padding allocated
668     ** below. */
669     void *pKey;
670     pCur->nKey = sqlite3BtreePayloadSize(pCur);
671     pKey = sqlite3Malloc( pCur->nKey + 9 + 8 );
672     if( pKey ){
673       rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
674       if( rc==SQLITE_OK ){
675         memset(((u8*)pKey)+pCur->nKey, 0, 9+8);
676         pCur->pKey = pKey;
677       }else{
678         sqlite3_free(pKey);
679       }
680     }else{
681       rc = SQLITE_NOMEM_BKPT;
682     }
683   }
684   assert( !pCur->curIntKey || !pCur->pKey );
685   return rc;
686 }
687 
688 /*
689 ** Save the current cursor position in the variables BtCursor.nKey
690 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
691 **
692 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
693 ** prior to calling this routine.
694 */
695 static int saveCursorPosition(BtCursor *pCur){
696   int rc;
697 
698   assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
699   assert( 0==pCur->pKey );
700   assert( cursorHoldsMutex(pCur) );
701 
702   if( pCur->curFlags & BTCF_Pinned ){
703     return SQLITE_CONSTRAINT_PINNED;
704   }
705   if( pCur->eState==CURSOR_SKIPNEXT ){
706     pCur->eState = CURSOR_VALID;
707   }else{
708     pCur->skipNext = 0;
709   }
710 
711   rc = saveCursorKey(pCur);
712   if( rc==SQLITE_OK ){
713     btreeReleaseAllCursorPages(pCur);
714     pCur->eState = CURSOR_REQUIRESEEK;
715   }
716 
717   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
718   return rc;
719 }
720 
721 /* Forward reference */
722 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
723 
724 /*
725 ** Save the positions of all cursors (except pExcept) that are open on
726 ** the table with root-page iRoot.  "Saving the cursor position" means that
727 ** the location in the btree is remembered in such a way that it can be
728 ** moved back to the same spot after the btree has been modified.  This
729 ** routine is called just before cursor pExcept is used to modify the
730 ** table, for example in BtreeDelete() or BtreeInsert().
731 **
732 ** If there are two or more cursors on the same btree, then all such
733 ** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
734 ** routine enforces that rule.  This routine only needs to be called in
735 ** the uncommon case when pExpect has the BTCF_Multiple flag set.
736 **
737 ** If pExpect!=NULL and if no other cursors are found on the same root-page,
738 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another
739 ** pointless call to this routine.
740 **
741 ** Implementation note:  This routine merely checks to see if any cursors
742 ** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
743 ** event that cursors are in need to being saved.
744 */
745 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
746   BtCursor *p;
747   assert( sqlite3_mutex_held(pBt->mutex) );
748   assert( pExcept==0 || pExcept->pBt==pBt );
749   for(p=pBt->pCursor; p; p=p->pNext){
750     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
751   }
752   if( p ) return saveCursorsOnList(p, iRoot, pExcept);
753   if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
754   return SQLITE_OK;
755 }
756 
757 /* This helper routine to saveAllCursors does the actual work of saving
758 ** the cursors if and when a cursor is found that actually requires saving.
759 ** The common case is that no cursors need to be saved, so this routine is
760 ** broken out from its caller to avoid unnecessary stack pointer movement.
761 */
762 static int SQLITE_NOINLINE saveCursorsOnList(
763   BtCursor *p,         /* The first cursor that needs saving */
764   Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
765   BtCursor *pExcept    /* Do not save this cursor */
766 ){
767   do{
768     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
769       if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
770         int rc = saveCursorPosition(p);
771         if( SQLITE_OK!=rc ){
772           return rc;
773         }
774       }else{
775         testcase( p->iPage>=0 );
776         btreeReleaseAllCursorPages(p);
777       }
778     }
779     p = p->pNext;
780   }while( p );
781   return SQLITE_OK;
782 }
783 
784 /*
785 ** Clear the current cursor position.
786 */
787 void sqlite3BtreeClearCursor(BtCursor *pCur){
788   assert( cursorHoldsMutex(pCur) );
789   sqlite3_free(pCur->pKey);
790   pCur->pKey = 0;
791   pCur->eState = CURSOR_INVALID;
792 }
793 
794 /*
795 ** In this version of BtreeMoveto, pKey is a packed index record
796 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
797 ** record and then call BtreeMovetoUnpacked() to do the work.
798 */
799 static int btreeMoveto(
800   BtCursor *pCur,     /* Cursor open on the btree to be searched */
801   const void *pKey,   /* Packed key if the btree is an index */
802   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
803   int bias,           /* Bias search to the high end */
804   int *pRes           /* Write search results here */
805 ){
806   int rc;                    /* Status code */
807   UnpackedRecord *pIdxKey;   /* Unpacked index key */
808 
809   if( pKey ){
810     KeyInfo *pKeyInfo = pCur->pKeyInfo;
811     assert( nKey==(i64)(int)nKey );
812     pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo);
813     if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
814     sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey);
815     if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){
816       rc = SQLITE_CORRUPT_BKPT;
817       goto moveto_done;
818     }
819   }else{
820     pIdxKey = 0;
821   }
822   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
823 moveto_done:
824   if( pIdxKey ){
825     sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
826   }
827   return rc;
828 }
829 
830 /*
831 ** Restore the cursor to the position it was in (or as close to as possible)
832 ** when saveCursorPosition() was called. Note that this call deletes the
833 ** saved position info stored by saveCursorPosition(), so there can be
834 ** at most one effective restoreCursorPosition() call after each
835 ** saveCursorPosition().
836 */
837 static int btreeRestoreCursorPosition(BtCursor *pCur){
838   int rc;
839   int skipNext = 0;
840   assert( cursorOwnsBtShared(pCur) );
841   assert( pCur->eState>=CURSOR_REQUIRESEEK );
842   if( pCur->eState==CURSOR_FAULT ){
843     return pCur->skipNext;
844   }
845   pCur->eState = CURSOR_INVALID;
846   if( sqlite3FaultSim(410) ){
847     rc = SQLITE_IOERR;
848   }else{
849     rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
850   }
851   if( rc==SQLITE_OK ){
852     sqlite3_free(pCur->pKey);
853     pCur->pKey = 0;
854     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
855     if( skipNext ) pCur->skipNext = skipNext;
856     if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
857       pCur->eState = CURSOR_SKIPNEXT;
858     }
859   }
860   return rc;
861 }
862 
863 #define restoreCursorPosition(p) \
864   (p->eState>=CURSOR_REQUIRESEEK ? \
865          btreeRestoreCursorPosition(p) : \
866          SQLITE_OK)
867 
868 /*
869 ** Determine whether or not a cursor has moved from the position where
870 ** it was last placed, or has been invalidated for any other reason.
871 ** Cursors can move when the row they are pointing at is deleted out
872 ** from under them, for example.  Cursor might also move if a btree
873 ** is rebalanced.
874 **
875 ** Calling this routine with a NULL cursor pointer returns false.
876 **
877 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
878 ** back to where it ought to be if this routine returns true.
879 */
880 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
881   assert( EIGHT_BYTE_ALIGNMENT(pCur)
882        || pCur==sqlite3BtreeFakeValidCursor() );
883   assert( offsetof(BtCursor, eState)==0 );
884   assert( sizeof(pCur->eState)==1 );
885   return CURSOR_VALID != *(u8*)pCur;
886 }
887 
888 /*
889 ** Return a pointer to a fake BtCursor object that will always answer
890 ** false to the sqlite3BtreeCursorHasMoved() routine above.  The fake
891 ** cursor returned must not be used with any other Btree interface.
892 */
893 BtCursor *sqlite3BtreeFakeValidCursor(void){
894   static u8 fakeCursor = CURSOR_VALID;
895   assert( offsetof(BtCursor, eState)==0 );
896   return (BtCursor*)&fakeCursor;
897 }
898 
899 /*
900 ** This routine restores a cursor back to its original position after it
901 ** has been moved by some outside activity (such as a btree rebalance or
902 ** a row having been deleted out from under the cursor).
903 **
904 ** On success, the *pDifferentRow parameter is false if the cursor is left
905 ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
906 ** was pointing to has been deleted, forcing the cursor to point to some
907 ** nearby row.
908 **
909 ** This routine should only be called for a cursor that just returned
910 ** TRUE from sqlite3BtreeCursorHasMoved().
911 */
912 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
913   int rc;
914 
915   assert( pCur!=0 );
916   assert( pCur->eState!=CURSOR_VALID );
917   rc = restoreCursorPosition(pCur);
918   if( rc ){
919     *pDifferentRow = 1;
920     return rc;
921   }
922   if( pCur->eState!=CURSOR_VALID ){
923     *pDifferentRow = 1;
924   }else{
925     *pDifferentRow = 0;
926   }
927   return SQLITE_OK;
928 }
929 
930 #ifdef SQLITE_ENABLE_CURSOR_HINTS
931 /*
932 ** Provide hints to the cursor.  The particular hint given (and the type
933 ** and number of the varargs parameters) is determined by the eHintType
934 ** parameter.  See the definitions of the BTREE_HINT_* macros for details.
935 */
936 void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
937   /* Used only by system that substitute their own storage engine */
938 }
939 #endif
940 
941 /*
942 ** Provide flag hints to the cursor.
943 */
944 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
945   assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
946   pCur->hints = x;
947 }
948 
949 
950 #ifndef SQLITE_OMIT_AUTOVACUUM
951 /*
952 ** Given a page number of a regular database page, return the page
953 ** number for the pointer-map page that contains the entry for the
954 ** input page number.
955 **
956 ** Return 0 (not a valid page) for pgno==1 since there is
957 ** no pointer map associated with page 1.  The integrity_check logic
958 ** requires that ptrmapPageno(*,1)!=1.
959 */
960 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
961   int nPagesPerMapPage;
962   Pgno iPtrMap, ret;
963   assert( sqlite3_mutex_held(pBt->mutex) );
964   if( pgno<2 ) return 0;
965   nPagesPerMapPage = (pBt->usableSize/5)+1;
966   iPtrMap = (pgno-2)/nPagesPerMapPage;
967   ret = (iPtrMap*nPagesPerMapPage) + 2;
968   if( ret==PENDING_BYTE_PAGE(pBt) ){
969     ret++;
970   }
971   return ret;
972 }
973 
974 /*
975 ** Write an entry into the pointer map.
976 **
977 ** This routine updates the pointer map entry for page number 'key'
978 ** so that it maps to type 'eType' and parent page number 'pgno'.
979 **
980 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
981 ** a no-op.  If an error occurs, the appropriate error code is written
982 ** into *pRC.
983 */
984 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
985   DbPage *pDbPage;  /* The pointer map page */
986   u8 *pPtrmap;      /* The pointer map data */
987   Pgno iPtrmap;     /* The pointer map page number */
988   int offset;       /* Offset in pointer map page */
989   int rc;           /* Return code from subfunctions */
990 
991   if( *pRC ) return;
992 
993   assert( sqlite3_mutex_held(pBt->mutex) );
994   /* The master-journal page number must never be used as a pointer map page */
995   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
996 
997   assert( pBt->autoVacuum );
998   if( key==0 ){
999     *pRC = SQLITE_CORRUPT_BKPT;
1000     return;
1001   }
1002   iPtrmap = PTRMAP_PAGENO(pBt, key);
1003   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
1004   if( rc!=SQLITE_OK ){
1005     *pRC = rc;
1006     return;
1007   }
1008   if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){
1009     /* The first byte of the extra data is the MemPage.isInit byte.
1010     ** If that byte is set, it means this page is also being used
1011     ** as a btree page. */
1012     *pRC = SQLITE_CORRUPT_BKPT;
1013     goto ptrmap_exit;
1014   }
1015   offset = PTRMAP_PTROFFSET(iPtrmap, key);
1016   if( offset<0 ){
1017     *pRC = SQLITE_CORRUPT_BKPT;
1018     goto ptrmap_exit;
1019   }
1020   assert( offset <= (int)pBt->usableSize-5 );
1021   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1022 
1023   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
1024     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
1025     *pRC= rc = sqlite3PagerWrite(pDbPage);
1026     if( rc==SQLITE_OK ){
1027       pPtrmap[offset] = eType;
1028       put4byte(&pPtrmap[offset+1], parent);
1029     }
1030   }
1031 
1032 ptrmap_exit:
1033   sqlite3PagerUnref(pDbPage);
1034 }
1035 
1036 /*
1037 ** Read an entry from the pointer map.
1038 **
1039 ** This routine retrieves the pointer map entry for page 'key', writing
1040 ** the type and parent page number to *pEType and *pPgno respectively.
1041 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
1042 */
1043 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
1044   DbPage *pDbPage;   /* The pointer map page */
1045   int iPtrmap;       /* Pointer map page index */
1046   u8 *pPtrmap;       /* Pointer map page data */
1047   int offset;        /* Offset of entry in pointer map */
1048   int rc;
1049 
1050   assert( sqlite3_mutex_held(pBt->mutex) );
1051 
1052   iPtrmap = PTRMAP_PAGENO(pBt, key);
1053   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
1054   if( rc!=0 ){
1055     return rc;
1056   }
1057   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1058 
1059   offset = PTRMAP_PTROFFSET(iPtrmap, key);
1060   if( offset<0 ){
1061     sqlite3PagerUnref(pDbPage);
1062     return SQLITE_CORRUPT_BKPT;
1063   }
1064   assert( offset <= (int)pBt->usableSize-5 );
1065   assert( pEType!=0 );
1066   *pEType = pPtrmap[offset];
1067   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
1068 
1069   sqlite3PagerUnref(pDbPage);
1070   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap);
1071   return SQLITE_OK;
1072 }
1073 
1074 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
1075   #define ptrmapPut(w,x,y,z,rc)
1076   #define ptrmapGet(w,x,y,z) SQLITE_OK
1077   #define ptrmapPutOvflPtr(x, y, z, rc)
1078 #endif
1079 
1080 /*
1081 ** Given a btree page and a cell index (0 means the first cell on
1082 ** the page, 1 means the second cell, and so forth) return a pointer
1083 ** to the cell content.
1084 **
1085 ** findCellPastPtr() does the same except it skips past the initial
1086 ** 4-byte child pointer found on interior pages, if there is one.
1087 **
1088 ** This routine works only for pages that do not contain overflow cells.
1089 */
1090 #define findCell(P,I) \
1091   ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
1092 #define findCellPastPtr(P,I) \
1093   ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
1094 
1095 
1096 /*
1097 ** This is common tail processing for btreeParseCellPtr() and
1098 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
1099 ** on a single B-tree page.  Make necessary adjustments to the CellInfo
1100 ** structure.
1101 */
1102 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
1103   MemPage *pPage,         /* Page containing the cell */
1104   u8 *pCell,              /* Pointer to the cell text. */
1105   CellInfo *pInfo         /* Fill in this structure */
1106 ){
1107   /* If the payload will not fit completely on the local page, we have
1108   ** to decide how much to store locally and how much to spill onto
1109   ** overflow pages.  The strategy is to minimize the amount of unused
1110   ** space on overflow pages while keeping the amount of local storage
1111   ** in between minLocal and maxLocal.
1112   **
1113   ** Warning:  changing the way overflow payload is distributed in any
1114   ** way will result in an incompatible file format.
1115   */
1116   int minLocal;  /* Minimum amount of payload held locally */
1117   int maxLocal;  /* Maximum amount of payload held locally */
1118   int surplus;   /* Overflow payload available for local storage */
1119 
1120   minLocal = pPage->minLocal;
1121   maxLocal = pPage->maxLocal;
1122   surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
1123   testcase( surplus==maxLocal );
1124   testcase( surplus==maxLocal+1 );
1125   if( surplus <= maxLocal ){
1126     pInfo->nLocal = (u16)surplus;
1127   }else{
1128     pInfo->nLocal = (u16)minLocal;
1129   }
1130   pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
1131 }
1132 
1133 /*
1134 ** The following routines are implementations of the MemPage.xParseCell()
1135 ** method.
1136 **
1137 ** Parse a cell content block and fill in the CellInfo structure.
1138 **
1139 ** btreeParseCellPtr()        =>   table btree leaf nodes
1140 ** btreeParseCellNoPayload()  =>   table btree internal nodes
1141 ** btreeParseCellPtrIndex()   =>   index btree nodes
1142 **
1143 ** There is also a wrapper function btreeParseCell() that works for
1144 ** all MemPage types and that references the cell by index rather than
1145 ** by pointer.
1146 */
1147 static void btreeParseCellPtrNoPayload(
1148   MemPage *pPage,         /* Page containing the cell */
1149   u8 *pCell,              /* Pointer to the cell text. */
1150   CellInfo *pInfo         /* Fill in this structure */
1151 ){
1152   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1153   assert( pPage->leaf==0 );
1154   assert( pPage->childPtrSize==4 );
1155 #ifndef SQLITE_DEBUG
1156   UNUSED_PARAMETER(pPage);
1157 #endif
1158   pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
1159   pInfo->nPayload = 0;
1160   pInfo->nLocal = 0;
1161   pInfo->pPayload = 0;
1162   return;
1163 }
1164 static void btreeParseCellPtr(
1165   MemPage *pPage,         /* Page containing the cell */
1166   u8 *pCell,              /* Pointer to the cell text. */
1167   CellInfo *pInfo         /* Fill in this structure */
1168 ){
1169   u8 *pIter;              /* For scanning through pCell */
1170   u32 nPayload;           /* Number of bytes of cell payload */
1171   u64 iKey;               /* Extracted Key value */
1172 
1173   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1174   assert( pPage->leaf==0 || pPage->leaf==1 );
1175   assert( pPage->intKeyLeaf );
1176   assert( pPage->childPtrSize==0 );
1177   pIter = pCell;
1178 
1179   /* The next block of code is equivalent to:
1180   **
1181   **     pIter += getVarint32(pIter, nPayload);
1182   **
1183   ** The code is inlined to avoid a function call.
1184   */
1185   nPayload = *pIter;
1186   if( nPayload>=0x80 ){
1187     u8 *pEnd = &pIter[8];
1188     nPayload &= 0x7f;
1189     do{
1190       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1191     }while( (*pIter)>=0x80 && pIter<pEnd );
1192   }
1193   pIter++;
1194 
1195   /* The next block of code is equivalent to:
1196   **
1197   **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
1198   **
1199   ** The code is inlined to avoid a function call.
1200   */
1201   iKey = *pIter;
1202   if( iKey>=0x80 ){
1203     u8 *pEnd = &pIter[7];
1204     iKey &= 0x7f;
1205     while(1){
1206       iKey = (iKey<<7) | (*++pIter & 0x7f);
1207       if( (*pIter)<0x80 ) break;
1208       if( pIter>=pEnd ){
1209         iKey = (iKey<<8) | *++pIter;
1210         break;
1211       }
1212     }
1213   }
1214   pIter++;
1215 
1216   pInfo->nKey = *(i64*)&iKey;
1217   pInfo->nPayload = nPayload;
1218   pInfo->pPayload = pIter;
1219   testcase( nPayload==pPage->maxLocal );
1220   testcase( nPayload==pPage->maxLocal+1 );
1221   if( nPayload<=pPage->maxLocal ){
1222     /* This is the (easy) common case where the entire payload fits
1223     ** on the local page.  No overflow is required.
1224     */
1225     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1226     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1227     pInfo->nLocal = (u16)nPayload;
1228   }else{
1229     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1230   }
1231 }
1232 static void btreeParseCellPtrIndex(
1233   MemPage *pPage,         /* Page containing the cell */
1234   u8 *pCell,              /* Pointer to the cell text. */
1235   CellInfo *pInfo         /* Fill in this structure */
1236 ){
1237   u8 *pIter;              /* For scanning through pCell */
1238   u32 nPayload;           /* Number of bytes of cell payload */
1239 
1240   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1241   assert( pPage->leaf==0 || pPage->leaf==1 );
1242   assert( pPage->intKeyLeaf==0 );
1243   pIter = pCell + pPage->childPtrSize;
1244   nPayload = *pIter;
1245   if( nPayload>=0x80 ){
1246     u8 *pEnd = &pIter[8];
1247     nPayload &= 0x7f;
1248     do{
1249       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1250     }while( *(pIter)>=0x80 && pIter<pEnd );
1251   }
1252   pIter++;
1253   pInfo->nKey = nPayload;
1254   pInfo->nPayload = nPayload;
1255   pInfo->pPayload = pIter;
1256   testcase( nPayload==pPage->maxLocal );
1257   testcase( nPayload==pPage->maxLocal+1 );
1258   if( nPayload<=pPage->maxLocal ){
1259     /* This is the (easy) common case where the entire payload fits
1260     ** on the local page.  No overflow is required.
1261     */
1262     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1263     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1264     pInfo->nLocal = (u16)nPayload;
1265   }else{
1266     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1267   }
1268 }
1269 static void btreeParseCell(
1270   MemPage *pPage,         /* Page containing the cell */
1271   int iCell,              /* The cell index.  First cell is 0 */
1272   CellInfo *pInfo         /* Fill in this structure */
1273 ){
1274   pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
1275 }
1276 
1277 /*
1278 ** The following routines are implementations of the MemPage.xCellSize
1279 ** method.
1280 **
1281 ** Compute the total number of bytes that a Cell needs in the cell
1282 ** data area of the btree-page.  The return number includes the cell
1283 ** data header and the local payload, but not any overflow page or
1284 ** the space used by the cell pointer.
1285 **
1286 ** cellSizePtrNoPayload()    =>   table internal nodes
1287 ** cellSizePtr()             =>   all index nodes & table leaf nodes
1288 */
1289 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
1290   u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
1291   u8 *pEnd;                                /* End mark for a varint */
1292   u32 nSize;                               /* Size value to return */
1293 
1294 #ifdef SQLITE_DEBUG
1295   /* The value returned by this function should always be the same as
1296   ** the (CellInfo.nSize) value found by doing a full parse of the
1297   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1298   ** this function verifies that this invariant is not violated. */
1299   CellInfo debuginfo;
1300   pPage->xParseCell(pPage, pCell, &debuginfo);
1301 #endif
1302 
1303   nSize = *pIter;
1304   if( nSize>=0x80 ){
1305     pEnd = &pIter[8];
1306     nSize &= 0x7f;
1307     do{
1308       nSize = (nSize<<7) | (*++pIter & 0x7f);
1309     }while( *(pIter)>=0x80 && pIter<pEnd );
1310   }
1311   pIter++;
1312   if( pPage->intKey ){
1313     /* pIter now points at the 64-bit integer key value, a variable length
1314     ** integer. The following block moves pIter to point at the first byte
1315     ** past the end of the key value. */
1316     pEnd = &pIter[9];
1317     while( (*pIter++)&0x80 && pIter<pEnd );
1318   }
1319   testcase( nSize==pPage->maxLocal );
1320   testcase( nSize==pPage->maxLocal+1 );
1321   if( nSize<=pPage->maxLocal ){
1322     nSize += (u32)(pIter - pCell);
1323     if( nSize<4 ) nSize = 4;
1324   }else{
1325     int minLocal = pPage->minLocal;
1326     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1327     testcase( nSize==pPage->maxLocal );
1328     testcase( nSize==pPage->maxLocal+1 );
1329     if( nSize>pPage->maxLocal ){
1330       nSize = minLocal;
1331     }
1332     nSize += 4 + (u16)(pIter - pCell);
1333   }
1334   assert( nSize==debuginfo.nSize || CORRUPT_DB );
1335   return (u16)nSize;
1336 }
1337 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
1338   u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
1339   u8 *pEnd;              /* End mark for a varint */
1340 
1341 #ifdef SQLITE_DEBUG
1342   /* The value returned by this function should always be the same as
1343   ** the (CellInfo.nSize) value found by doing a full parse of the
1344   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1345   ** this function verifies that this invariant is not violated. */
1346   CellInfo debuginfo;
1347   pPage->xParseCell(pPage, pCell, &debuginfo);
1348 #else
1349   UNUSED_PARAMETER(pPage);
1350 #endif
1351 
1352   assert( pPage->childPtrSize==4 );
1353   pEnd = pIter + 9;
1354   while( (*pIter++)&0x80 && pIter<pEnd );
1355   assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
1356   return (u16)(pIter - pCell);
1357 }
1358 
1359 
1360 #ifdef SQLITE_DEBUG
1361 /* This variation on cellSizePtr() is used inside of assert() statements
1362 ** only. */
1363 static u16 cellSize(MemPage *pPage, int iCell){
1364   return pPage->xCellSize(pPage, findCell(pPage, iCell));
1365 }
1366 #endif
1367 
1368 #ifndef SQLITE_OMIT_AUTOVACUUM
1369 /*
1370 ** The cell pCell is currently part of page pSrc but will ultimately be part
1371 ** of pPage.  (pSrc and pPager are often the same.)  If pCell contains a
1372 ** pointer to an overflow page, insert an entry into the pointer-map for
1373 ** the overflow page that will be valid after pCell has been moved to pPage.
1374 */
1375 static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){
1376   CellInfo info;
1377   if( *pRC ) return;
1378   assert( pCell!=0 );
1379   pPage->xParseCell(pPage, pCell, &info);
1380   if( info.nLocal<info.nPayload ){
1381     Pgno ovfl;
1382     if( SQLITE_WITHIN(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){
1383       testcase( pSrc!=pPage );
1384       *pRC = SQLITE_CORRUPT_BKPT;
1385       return;
1386     }
1387     ovfl = get4byte(&pCell[info.nSize-4]);
1388     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1389   }
1390 }
1391 #endif
1392 
1393 
1394 /*
1395 ** Defragment the page given. This routine reorganizes cells within the
1396 ** page so that there are no free-blocks on the free-block list.
1397 **
1398 ** Parameter nMaxFrag is the maximum amount of fragmented space that may be
1399 ** present in the page after this routine returns.
1400 **
1401 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
1402 ** b-tree page so that there are no freeblocks or fragment bytes, all
1403 ** unused bytes are contained in the unallocated space region, and all
1404 ** cells are packed tightly at the end of the page.
1405 */
1406 static int defragmentPage(MemPage *pPage, int nMaxFrag){
1407   int i;                     /* Loop counter */
1408   int pc;                    /* Address of the i-th cell */
1409   int hdr;                   /* Offset to the page header */
1410   int size;                  /* Size of a cell */
1411   int usableSize;            /* Number of usable bytes on a page */
1412   int cellOffset;            /* Offset to the cell pointer array */
1413   int cbrk;                  /* Offset to the cell content area */
1414   int nCell;                 /* Number of cells on the page */
1415   unsigned char *data;       /* The page data */
1416   unsigned char *temp;       /* Temp area for cell content */
1417   unsigned char *src;        /* Source of content */
1418   int iCellFirst;            /* First allowable cell index */
1419   int iCellLast;             /* Last possible cell index */
1420 
1421   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1422   assert( pPage->pBt!=0 );
1423   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1424   assert( pPage->nOverflow==0 );
1425   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1426   temp = 0;
1427   src = data = pPage->aData;
1428   hdr = pPage->hdrOffset;
1429   cellOffset = pPage->cellOffset;
1430   nCell = pPage->nCell;
1431   assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB );
1432   iCellFirst = cellOffset + 2*nCell;
1433   usableSize = pPage->pBt->usableSize;
1434 
1435   /* This block handles pages with two or fewer free blocks and nMaxFrag
1436   ** or fewer fragmented bytes. In this case it is faster to move the
1437   ** two (or one) blocks of cells using memmove() and add the required
1438   ** offsets to each pointer in the cell-pointer array than it is to
1439   ** reconstruct the entire page.  */
1440   if( (int)data[hdr+7]<=nMaxFrag ){
1441     int iFree = get2byte(&data[hdr+1]);
1442     if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
1443     if( iFree ){
1444       int iFree2 = get2byte(&data[iFree]);
1445       if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
1446       if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){
1447         u8 *pEnd = &data[cellOffset + nCell*2];
1448         u8 *pAddr;
1449         int sz2 = 0;
1450         int sz = get2byte(&data[iFree+2]);
1451         int top = get2byte(&data[hdr+5]);
1452         if( top>=iFree ){
1453           return SQLITE_CORRUPT_PAGE(pPage);
1454         }
1455         if( iFree2 ){
1456           if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage);
1457           sz2 = get2byte(&data[iFree2+2]);
1458           if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage);
1459           memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz));
1460           sz += sz2;
1461         }else if( iFree+sz>usableSize ){
1462           return SQLITE_CORRUPT_PAGE(pPage);
1463         }
1464 
1465         cbrk = top+sz;
1466         assert( cbrk+(iFree-top) <= usableSize );
1467         memmove(&data[cbrk], &data[top], iFree-top);
1468         for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){
1469           pc = get2byte(pAddr);
1470           if( pc<iFree ){ put2byte(pAddr, pc+sz); }
1471           else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); }
1472         }
1473         goto defragment_out;
1474       }
1475     }
1476   }
1477 
1478   cbrk = usableSize;
1479   iCellLast = usableSize - 4;
1480   for(i=0; i<nCell; i++){
1481     u8 *pAddr;     /* The i-th cell pointer */
1482     pAddr = &data[cellOffset + i*2];
1483     pc = get2byte(pAddr);
1484     testcase( pc==iCellFirst );
1485     testcase( pc==iCellLast );
1486     /* These conditions have already been verified in btreeInitPage()
1487     ** if PRAGMA cell_size_check=ON.
1488     */
1489     if( pc<iCellFirst || pc>iCellLast ){
1490       return SQLITE_CORRUPT_PAGE(pPage);
1491     }
1492     assert( pc>=iCellFirst && pc<=iCellLast );
1493     size = pPage->xCellSize(pPage, &src[pc]);
1494     cbrk -= size;
1495     if( cbrk<iCellFirst || pc+size>usableSize ){
1496       return SQLITE_CORRUPT_PAGE(pPage);
1497     }
1498     assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
1499     testcase( cbrk+size==usableSize );
1500     testcase( pc+size==usableSize );
1501     put2byte(pAddr, cbrk);
1502     if( temp==0 ){
1503       int x;
1504       if( cbrk==pc ) continue;
1505       temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1506       x = get2byte(&data[hdr+5]);
1507       memcpy(&temp[x], &data[x], (cbrk+size) - x);
1508       src = temp;
1509     }
1510     memcpy(&data[cbrk], &src[pc], size);
1511   }
1512   data[hdr+7] = 0;
1513 
1514  defragment_out:
1515   assert( pPage->nFree>=0 );
1516   if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
1517     return SQLITE_CORRUPT_PAGE(pPage);
1518   }
1519   assert( cbrk>=iCellFirst );
1520   put2byte(&data[hdr+5], cbrk);
1521   data[hdr+1] = 0;
1522   data[hdr+2] = 0;
1523   memset(&data[iCellFirst], 0, cbrk-iCellFirst);
1524   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1525   return SQLITE_OK;
1526 }
1527 
1528 /*
1529 ** Search the free-list on page pPg for space to store a cell nByte bytes in
1530 ** size. If one can be found, return a pointer to the space and remove it
1531 ** from the free-list.
1532 **
1533 ** If no suitable space can be found on the free-list, return NULL.
1534 **
1535 ** This function may detect corruption within pPg.  If corruption is
1536 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
1537 **
1538 ** Slots on the free list that are between 1 and 3 bytes larger than nByte
1539 ** will be ignored if adding the extra space to the fragmentation count
1540 ** causes the fragmentation count to exceed 60.
1541 */
1542 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
1543   const int hdr = pPg->hdrOffset;            /* Offset to page header */
1544   u8 * const aData = pPg->aData;             /* Page data */
1545   int iAddr = hdr + 1;                       /* Address of ptr to pc */
1546   int pc = get2byte(&aData[iAddr]);          /* Address of a free slot */
1547   int x;                                     /* Excess size of the slot */
1548   int maxPC = pPg->pBt->usableSize - nByte;  /* Max address for a usable slot */
1549   int size;                                  /* Size of the free slot */
1550 
1551   assert( pc>0 );
1552   while( pc<=maxPC ){
1553     /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
1554     ** freeblock form a big-endian integer which is the size of the freeblock
1555     ** in bytes, including the 4-byte header. */
1556     size = get2byte(&aData[pc+2]);
1557     if( (x = size - nByte)>=0 ){
1558       testcase( x==4 );
1559       testcase( x==3 );
1560       if( x<4 ){
1561         /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
1562         ** number of bytes in fragments may not exceed 60. */
1563         if( aData[hdr+7]>57 ) return 0;
1564 
1565         /* Remove the slot from the free-list. Update the number of
1566         ** fragmented bytes within the page. */
1567         memcpy(&aData[iAddr], &aData[pc], 2);
1568         aData[hdr+7] += (u8)x;
1569       }else if( x+pc > maxPC ){
1570         /* This slot extends off the end of the usable part of the page */
1571         *pRc = SQLITE_CORRUPT_PAGE(pPg);
1572         return 0;
1573       }else{
1574         /* The slot remains on the free-list. Reduce its size to account
1575         ** for the portion used by the new allocation. */
1576         put2byte(&aData[pc+2], x);
1577       }
1578       return &aData[pc + x];
1579     }
1580     iAddr = pc;
1581     pc = get2byte(&aData[pc]);
1582     if( pc<=iAddr+size ){
1583       if( pc ){
1584         /* The next slot in the chain is not past the end of the current slot */
1585         *pRc = SQLITE_CORRUPT_PAGE(pPg);
1586       }
1587       return 0;
1588     }
1589   }
1590   if( pc>maxPC+nByte-4 ){
1591     /* The free slot chain extends off the end of the page */
1592     *pRc = SQLITE_CORRUPT_PAGE(pPg);
1593   }
1594   return 0;
1595 }
1596 
1597 /*
1598 ** Allocate nByte bytes of space from within the B-Tree page passed
1599 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1600 ** of the first byte of allocated space. Return either SQLITE_OK or
1601 ** an error code (usually SQLITE_CORRUPT).
1602 **
1603 ** The caller guarantees that there is sufficient space to make the
1604 ** allocation.  This routine might need to defragment in order to bring
1605 ** all the space together, however.  This routine will avoid using
1606 ** the first two bytes past the cell pointer area since presumably this
1607 ** allocation is being made in order to insert a new cell, so we will
1608 ** also end up needing a new cell pointer.
1609 */
1610 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1611   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
1612   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
1613   int top;                             /* First byte of cell content area */
1614   int rc = SQLITE_OK;                  /* Integer return code */
1615   int gap;        /* First byte of gap between cell pointers and cell content */
1616 
1617   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1618   assert( pPage->pBt );
1619   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1620   assert( nByte>=0 );  /* Minimum cell size is 4 */
1621   assert( pPage->nFree>=nByte );
1622   assert( pPage->nOverflow==0 );
1623   assert( nByte < (int)(pPage->pBt->usableSize-8) );
1624 
1625   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1626   gap = pPage->cellOffset + 2*pPage->nCell;
1627   assert( gap<=65536 );
1628   /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
1629   ** and the reserved space is zero (the usual value for reserved space)
1630   ** then the cell content offset of an empty page wants to be 65536.
1631   ** However, that integer is too large to be stored in a 2-byte unsigned
1632   ** integer, so a value of 0 is used in its place. */
1633   top = get2byte(&data[hdr+5]);
1634   assert( top<=(int)pPage->pBt->usableSize ); /* by btreeComputeFreeSpace() */
1635   if( gap>top ){
1636     if( top==0 && pPage->pBt->usableSize==65536 ){
1637       top = 65536;
1638     }else{
1639       return SQLITE_CORRUPT_PAGE(pPage);
1640     }
1641   }
1642 
1643   /* If there is enough space between gap and top for one more cell pointer,
1644   ** and if the freelist is not empty, then search the
1645   ** freelist looking for a slot big enough to satisfy the request.
1646   */
1647   testcase( gap+2==top );
1648   testcase( gap+1==top );
1649   testcase( gap==top );
1650   if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
1651     u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
1652     if( pSpace ){
1653       assert( pSpace+nByte<=data+pPage->pBt->usableSize );
1654       if( (*pIdx = (int)(pSpace-data))<=gap ){
1655         return SQLITE_CORRUPT_PAGE(pPage);
1656       }else{
1657         return SQLITE_OK;
1658       }
1659     }else if( rc ){
1660       return rc;
1661     }
1662   }
1663 
1664   /* The request could not be fulfilled using a freelist slot.  Check
1665   ** to see if defragmentation is necessary.
1666   */
1667   testcase( gap+2+nByte==top );
1668   if( gap+2+nByte>top ){
1669     assert( pPage->nCell>0 || CORRUPT_DB );
1670     assert( pPage->nFree>=0 );
1671     rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
1672     if( rc ) return rc;
1673     top = get2byteNotZero(&data[hdr+5]);
1674     assert( gap+2+nByte<=top );
1675   }
1676 
1677 
1678   /* Allocate memory from the gap in between the cell pointer array
1679   ** and the cell content area.  The btreeComputeFreeSpace() call has already
1680   ** validated the freelist.  Given that the freelist is valid, there
1681   ** is no way that the allocation can extend off the end of the page.
1682   ** The assert() below verifies the previous sentence.
1683   */
1684   top -= nByte;
1685   put2byte(&data[hdr+5], top);
1686   assert( top+nByte <= (int)pPage->pBt->usableSize );
1687   *pIdx = top;
1688   return SQLITE_OK;
1689 }
1690 
1691 /*
1692 ** Return a section of the pPage->aData to the freelist.
1693 ** The first byte of the new free block is pPage->aData[iStart]
1694 ** and the size of the block is iSize bytes.
1695 **
1696 ** Adjacent freeblocks are coalesced.
1697 **
1698 ** Even though the freeblock list was checked by btreeComputeFreeSpace(),
1699 ** that routine will not detect overlap between cells or freeblocks.  Nor
1700 ** does it detect cells or freeblocks that encrouch into the reserved bytes
1701 ** at the end of the page.  So do additional corruption checks inside this
1702 ** routine and return SQLITE_CORRUPT if any problems are found.
1703 */
1704 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
1705   u16 iPtr;                             /* Address of ptr to next freeblock */
1706   u16 iFreeBlk;                         /* Address of the next freeblock */
1707   u8 hdr;                               /* Page header size.  0 or 100 */
1708   u8 nFrag = 0;                         /* Reduction in fragmentation */
1709   u16 iOrigSize = iSize;                /* Original value of iSize */
1710   u16 x;                                /* Offset to cell content area */
1711   u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
1712   unsigned char *data = pPage->aData;   /* Page content */
1713 
1714   assert( pPage->pBt!=0 );
1715   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1716   assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
1717   assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
1718   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1719   assert( iSize>=4 );   /* Minimum cell size is 4 */
1720   assert( iStart<=pPage->pBt->usableSize-4 );
1721 
1722   /* The list of freeblocks must be in ascending order.  Find the
1723   ** spot on the list where iStart should be inserted.
1724   */
1725   hdr = pPage->hdrOffset;
1726   iPtr = hdr + 1;
1727   if( data[iPtr+1]==0 && data[iPtr]==0 ){
1728     iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
1729   }else{
1730     while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
1731       if( iFreeBlk<iPtr+4 ){
1732         if( iFreeBlk==0 ) break;
1733         return SQLITE_CORRUPT_PAGE(pPage);
1734       }
1735       iPtr = iFreeBlk;
1736     }
1737     if( iFreeBlk>pPage->pBt->usableSize-4 ){
1738       return SQLITE_CORRUPT_PAGE(pPage);
1739     }
1740     assert( iFreeBlk>iPtr || iFreeBlk==0 );
1741 
1742     /* At this point:
1743     **    iFreeBlk:   First freeblock after iStart, or zero if none
1744     **    iPtr:       The address of a pointer to iFreeBlk
1745     **
1746     ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
1747     */
1748     if( iFreeBlk && iEnd+3>=iFreeBlk ){
1749       nFrag = iFreeBlk - iEnd;
1750       if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage);
1751       iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
1752       if( iEnd > pPage->pBt->usableSize ){
1753         return SQLITE_CORRUPT_PAGE(pPage);
1754       }
1755       iSize = iEnd - iStart;
1756       iFreeBlk = get2byte(&data[iFreeBlk]);
1757     }
1758 
1759     /* If iPtr is another freeblock (that is, if iPtr is not the freelist
1760     ** pointer in the page header) then check to see if iStart should be
1761     ** coalesced onto the end of iPtr.
1762     */
1763     if( iPtr>hdr+1 ){
1764       int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
1765       if( iPtrEnd+3>=iStart ){
1766         if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage);
1767         nFrag += iStart - iPtrEnd;
1768         iSize = iEnd - iPtr;
1769         iStart = iPtr;
1770       }
1771     }
1772     if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage);
1773     data[hdr+7] -= nFrag;
1774   }
1775   x = get2byte(&data[hdr+5]);
1776   if( iStart<=x ){
1777     /* The new freeblock is at the beginning of the cell content area,
1778     ** so just extend the cell content area rather than create another
1779     ** freelist entry */
1780     if( iStart<x || iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage);
1781     put2byte(&data[hdr+1], iFreeBlk);
1782     put2byte(&data[hdr+5], iEnd);
1783   }else{
1784     /* Insert the new freeblock into the freelist */
1785     put2byte(&data[iPtr], iStart);
1786   }
1787   if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){
1788     /* Overwrite deleted information with zeros when the secure_delete
1789     ** option is enabled */
1790     memset(&data[iStart], 0, iSize);
1791   }
1792   put2byte(&data[iStart], iFreeBlk);
1793   put2byte(&data[iStart+2], iSize);
1794   pPage->nFree += iOrigSize;
1795   return SQLITE_OK;
1796 }
1797 
1798 /*
1799 ** Decode the flags byte (the first byte of the header) for a page
1800 ** and initialize fields of the MemPage structure accordingly.
1801 **
1802 ** Only the following combinations are supported.  Anything different
1803 ** indicates a corrupt database files:
1804 **
1805 **         PTF_ZERODATA
1806 **         PTF_ZERODATA | PTF_LEAF
1807 **         PTF_LEAFDATA | PTF_INTKEY
1808 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1809 */
1810 static int decodeFlags(MemPage *pPage, int flagByte){
1811   BtShared *pBt;     /* A copy of pPage->pBt */
1812 
1813   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1814   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1815   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1816   flagByte &= ~PTF_LEAF;
1817   pPage->childPtrSize = 4-4*pPage->leaf;
1818   pPage->xCellSize = cellSizePtr;
1819   pBt = pPage->pBt;
1820   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1821     /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an
1822     ** interior table b-tree page. */
1823     assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
1824     /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a
1825     ** leaf table b-tree page. */
1826     assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
1827     pPage->intKey = 1;
1828     if( pPage->leaf ){
1829       pPage->intKeyLeaf = 1;
1830       pPage->xParseCell = btreeParseCellPtr;
1831     }else{
1832       pPage->intKeyLeaf = 0;
1833       pPage->xCellSize = cellSizePtrNoPayload;
1834       pPage->xParseCell = btreeParseCellPtrNoPayload;
1835     }
1836     pPage->maxLocal = pBt->maxLeaf;
1837     pPage->minLocal = pBt->minLeaf;
1838   }else if( flagByte==PTF_ZERODATA ){
1839     /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an
1840     ** interior index b-tree page. */
1841     assert( (PTF_ZERODATA)==2 );
1842     /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a
1843     ** leaf index b-tree page. */
1844     assert( (PTF_ZERODATA|PTF_LEAF)==10 );
1845     pPage->intKey = 0;
1846     pPage->intKeyLeaf = 0;
1847     pPage->xParseCell = btreeParseCellPtrIndex;
1848     pPage->maxLocal = pBt->maxLocal;
1849     pPage->minLocal = pBt->minLocal;
1850   }else{
1851     /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
1852     ** an error. */
1853     return SQLITE_CORRUPT_PAGE(pPage);
1854   }
1855   pPage->max1bytePayload = pBt->max1bytePayload;
1856   return SQLITE_OK;
1857 }
1858 
1859 /*
1860 ** Compute the amount of freespace on the page.  In other words, fill
1861 ** in the pPage->nFree field.
1862 */
1863 static int btreeComputeFreeSpace(MemPage *pPage){
1864   int pc;            /* Address of a freeblock within pPage->aData[] */
1865   u8 hdr;            /* Offset to beginning of page header */
1866   u8 *data;          /* Equal to pPage->aData */
1867   int usableSize;    /* Amount of usable space on each page */
1868   int nFree;         /* Number of unused bytes on the page */
1869   int top;           /* First byte of the cell content area */
1870   int iCellFirst;    /* First allowable cell or freeblock offset */
1871   int iCellLast;     /* Last possible cell or freeblock offset */
1872 
1873   assert( pPage->pBt!=0 );
1874   assert( pPage->pBt->db!=0 );
1875   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1876   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1877   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1878   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1879   assert( pPage->isInit==1 );
1880   assert( pPage->nFree<0 );
1881 
1882   usableSize = pPage->pBt->usableSize;
1883   hdr = pPage->hdrOffset;
1884   data = pPage->aData;
1885   /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
1886   ** the start of the cell content area. A zero value for this integer is
1887   ** interpreted as 65536. */
1888   top = get2byteNotZero(&data[hdr+5]);
1889   iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell;
1890   iCellLast = usableSize - 4;
1891 
1892   /* Compute the total free space on the page
1893   ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
1894   ** start of the first freeblock on the page, or is zero if there are no
1895   ** freeblocks. */
1896   pc = get2byte(&data[hdr+1]);
1897   nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
1898   if( pc>0 ){
1899     u32 next, size;
1900     if( pc<iCellFirst ){
1901       /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
1902       ** always be at least one cell before the first freeblock.
1903       */
1904       return SQLITE_CORRUPT_PAGE(pPage);
1905     }
1906     while( 1 ){
1907       if( pc>iCellLast ){
1908         /* Freeblock off the end of the page */
1909         return SQLITE_CORRUPT_PAGE(pPage);
1910       }
1911       next = get2byte(&data[pc]);
1912       size = get2byte(&data[pc+2]);
1913       nFree = nFree + size;
1914       if( next<=pc+size+3 ) break;
1915       pc = next;
1916     }
1917     if( next>0 ){
1918       /* Freeblock not in ascending order */
1919       return SQLITE_CORRUPT_PAGE(pPage);
1920     }
1921     if( pc+size>(unsigned int)usableSize ){
1922       /* Last freeblock extends past page end */
1923       return SQLITE_CORRUPT_PAGE(pPage);
1924     }
1925   }
1926 
1927   /* At this point, nFree contains the sum of the offset to the start
1928   ** of the cell-content area plus the number of free bytes within
1929   ** the cell-content area. If this is greater than the usable-size
1930   ** of the page, then the page must be corrupted. This check also
1931   ** serves to verify that the offset to the start of the cell-content
1932   ** area, according to the page header, lies within the page.
1933   */
1934   if( nFree>usableSize || nFree<iCellFirst ){
1935     return SQLITE_CORRUPT_PAGE(pPage);
1936   }
1937   pPage->nFree = (u16)(nFree - iCellFirst);
1938   return SQLITE_OK;
1939 }
1940 
1941 /*
1942 ** Do additional sanity check after btreeInitPage() if
1943 ** PRAGMA cell_size_check=ON
1944 */
1945 static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){
1946   int iCellFirst;    /* First allowable cell or freeblock offset */
1947   int iCellLast;     /* Last possible cell or freeblock offset */
1948   int i;             /* Index into the cell pointer array */
1949   int sz;            /* Size of a cell */
1950   int pc;            /* Address of a freeblock within pPage->aData[] */
1951   u8 *data;          /* Equal to pPage->aData */
1952   int usableSize;    /* Maximum usable space on the page */
1953   int cellOffset;    /* Start of cell content area */
1954 
1955   iCellFirst = pPage->cellOffset + 2*pPage->nCell;
1956   usableSize = pPage->pBt->usableSize;
1957   iCellLast = usableSize - 4;
1958   data = pPage->aData;
1959   cellOffset = pPage->cellOffset;
1960   if( !pPage->leaf ) iCellLast--;
1961   for(i=0; i<pPage->nCell; i++){
1962     pc = get2byteAligned(&data[cellOffset+i*2]);
1963     testcase( pc==iCellFirst );
1964     testcase( pc==iCellLast );
1965     if( pc<iCellFirst || pc>iCellLast ){
1966       return SQLITE_CORRUPT_PAGE(pPage);
1967     }
1968     sz = pPage->xCellSize(pPage, &data[pc]);
1969     testcase( pc+sz==usableSize );
1970     if( pc+sz>usableSize ){
1971       return SQLITE_CORRUPT_PAGE(pPage);
1972     }
1973   }
1974   return SQLITE_OK;
1975 }
1976 
1977 /*
1978 ** Initialize the auxiliary information for a disk block.
1979 **
1980 ** Return SQLITE_OK on success.  If we see that the page does
1981 ** not contain a well-formed database page, then return
1982 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1983 ** guarantee that the page is well-formed.  It only shows that
1984 ** we failed to detect any corruption.
1985 */
1986 static int btreeInitPage(MemPage *pPage){
1987   u8 *data;          /* Equal to pPage->aData */
1988   BtShared *pBt;        /* The main btree structure */
1989 
1990   assert( pPage->pBt!=0 );
1991   assert( pPage->pBt->db!=0 );
1992   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1993   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1994   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1995   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1996   assert( pPage->isInit==0 );
1997 
1998   pBt = pPage->pBt;
1999   data = pPage->aData + pPage->hdrOffset;
2000   /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
2001   ** the b-tree page type. */
2002   if( decodeFlags(pPage, data[0]) ){
2003     return SQLITE_CORRUPT_PAGE(pPage);
2004   }
2005   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
2006   pPage->maskPage = (u16)(pBt->pageSize - 1);
2007   pPage->nOverflow = 0;
2008   pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize;
2009   pPage->aCellIdx = data + pPage->childPtrSize + 8;
2010   pPage->aDataEnd = pPage->aData + pBt->usableSize;
2011   pPage->aDataOfst = pPage->aData + pPage->childPtrSize;
2012   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
2013   ** number of cells on the page. */
2014   pPage->nCell = get2byte(&data[3]);
2015   if( pPage->nCell>MX_CELL(pBt) ){
2016     /* To many cells for a single page.  The page must be corrupt */
2017     return SQLITE_CORRUPT_PAGE(pPage);
2018   }
2019   testcase( pPage->nCell==MX_CELL(pBt) );
2020   /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
2021   ** possible for a root page of a table that contains no rows) then the
2022   ** offset to the cell content area will equal the page size minus the
2023   ** bytes of reserved space. */
2024   assert( pPage->nCell>0
2025        || get2byteNotZero(&data[5])==(int)pBt->usableSize
2026        || CORRUPT_DB );
2027   pPage->nFree = -1;  /* Indicate that this value is yet uncomputed */
2028   pPage->isInit = 1;
2029   if( pBt->db->flags & SQLITE_CellSizeCk ){
2030     return btreeCellSizeCheck(pPage);
2031   }
2032   return SQLITE_OK;
2033 }
2034 
2035 /*
2036 ** Set up a raw page so that it looks like a database page holding
2037 ** no entries.
2038 */
2039 static void zeroPage(MemPage *pPage, int flags){
2040   unsigned char *data = pPage->aData;
2041   BtShared *pBt = pPage->pBt;
2042   u8 hdr = pPage->hdrOffset;
2043   u16 first;
2044 
2045   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
2046   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2047   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
2048   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2049   assert( sqlite3_mutex_held(pBt->mutex) );
2050   if( pBt->btsFlags & BTS_FAST_SECURE ){
2051     memset(&data[hdr], 0, pBt->usableSize - hdr);
2052   }
2053   data[hdr] = (char)flags;
2054   first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
2055   memset(&data[hdr+1], 0, 4);
2056   data[hdr+7] = 0;
2057   put2byte(&data[hdr+5], pBt->usableSize);
2058   pPage->nFree = (u16)(pBt->usableSize - first);
2059   decodeFlags(pPage, flags);
2060   pPage->cellOffset = first;
2061   pPage->aDataEnd = &data[pBt->usableSize];
2062   pPage->aCellIdx = &data[first];
2063   pPage->aDataOfst = &data[pPage->childPtrSize];
2064   pPage->nOverflow = 0;
2065   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
2066   pPage->maskPage = (u16)(pBt->pageSize - 1);
2067   pPage->nCell = 0;
2068   pPage->isInit = 1;
2069 }
2070 
2071 
2072 /*
2073 ** Convert a DbPage obtained from the pager into a MemPage used by
2074 ** the btree layer.
2075 */
2076 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
2077   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2078   if( pgno!=pPage->pgno ){
2079     pPage->aData = sqlite3PagerGetData(pDbPage);
2080     pPage->pDbPage = pDbPage;
2081     pPage->pBt = pBt;
2082     pPage->pgno = pgno;
2083     pPage->hdrOffset = pgno==1 ? 100 : 0;
2084   }
2085   assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
2086   return pPage;
2087 }
2088 
2089 /*
2090 ** Get a page from the pager.  Initialize the MemPage.pBt and
2091 ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
2092 **
2093 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
2094 ** about the content of the page at this time.  So do not go to the disk
2095 ** to fetch the content.  Just fill in the content with zeros for now.
2096 ** If in the future we call sqlite3PagerWrite() on this page, that
2097 ** means we have started to be concerned about content and the disk
2098 ** read should occur at that point.
2099 */
2100 static int btreeGetPage(
2101   BtShared *pBt,       /* The btree */
2102   Pgno pgno,           /* Number of the page to fetch */
2103   MemPage **ppPage,    /* Return the page in this parameter */
2104   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2105 ){
2106   int rc;
2107   DbPage *pDbPage;
2108 
2109   assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
2110   assert( sqlite3_mutex_held(pBt->mutex) );
2111   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
2112   if( rc ) return rc;
2113   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
2114   return SQLITE_OK;
2115 }
2116 
2117 /*
2118 ** Retrieve a page from the pager cache. If the requested page is not
2119 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
2120 ** MemPage.aData elements if needed.
2121 */
2122 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
2123   DbPage *pDbPage;
2124   assert( sqlite3_mutex_held(pBt->mutex) );
2125   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
2126   if( pDbPage ){
2127     return btreePageFromDbPage(pDbPage, pgno, pBt);
2128   }
2129   return 0;
2130 }
2131 
2132 /*
2133 ** Return the size of the database file in pages. If there is any kind of
2134 ** error, return ((unsigned int)-1).
2135 */
2136 static Pgno btreePagecount(BtShared *pBt){
2137   return pBt->nPage;
2138 }
2139 u32 sqlite3BtreeLastPage(Btree *p){
2140   assert( sqlite3BtreeHoldsMutex(p) );
2141   assert( ((p->pBt->nPage)&0x80000000)==0 );
2142   return btreePagecount(p->pBt);
2143 }
2144 
2145 /*
2146 ** Get a page from the pager and initialize it.
2147 **
2148 ** If pCur!=0 then the page is being fetched as part of a moveToChild()
2149 ** call.  Do additional sanity checking on the page in this case.
2150 ** And if the fetch fails, this routine must decrement pCur->iPage.
2151 **
2152 ** The page is fetched as read-write unless pCur is not NULL and is
2153 ** a read-only cursor.
2154 **
2155 ** If an error occurs, then *ppPage is undefined. It
2156 ** may remain unchanged, or it may be set to an invalid value.
2157 */
2158 static int getAndInitPage(
2159   BtShared *pBt,                  /* The database file */
2160   Pgno pgno,                      /* Number of the page to get */
2161   MemPage **ppPage,               /* Write the page pointer here */
2162   BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
2163   int bReadOnly                   /* True for a read-only page */
2164 ){
2165   int rc;
2166   DbPage *pDbPage;
2167   assert( sqlite3_mutex_held(pBt->mutex) );
2168   assert( pCur==0 || ppPage==&pCur->pPage );
2169   assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
2170   assert( pCur==0 || pCur->iPage>0 );
2171 
2172   if( pgno>btreePagecount(pBt) ){
2173     rc = SQLITE_CORRUPT_BKPT;
2174     goto getAndInitPage_error1;
2175   }
2176   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
2177   if( rc ){
2178     goto getAndInitPage_error1;
2179   }
2180   *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2181   if( (*ppPage)->isInit==0 ){
2182     btreePageFromDbPage(pDbPage, pgno, pBt);
2183     rc = btreeInitPage(*ppPage);
2184     if( rc!=SQLITE_OK ){
2185       goto getAndInitPage_error2;
2186     }
2187   }
2188   assert( (*ppPage)->pgno==pgno );
2189   assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
2190 
2191   /* If obtaining a child page for a cursor, we must verify that the page is
2192   ** compatible with the root page. */
2193   if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
2194     rc = SQLITE_CORRUPT_PGNO(pgno);
2195     goto getAndInitPage_error2;
2196   }
2197   return SQLITE_OK;
2198 
2199 getAndInitPage_error2:
2200   releasePage(*ppPage);
2201 getAndInitPage_error1:
2202   if( pCur ){
2203     pCur->iPage--;
2204     pCur->pPage = pCur->apPage[pCur->iPage];
2205   }
2206   testcase( pgno==0 );
2207   assert( pgno!=0 || rc==SQLITE_CORRUPT );
2208   return rc;
2209 }
2210 
2211 /*
2212 ** Release a MemPage.  This should be called once for each prior
2213 ** call to btreeGetPage.
2214 **
2215 ** Page1 is a special case and must be released using releasePageOne().
2216 */
2217 static void releasePageNotNull(MemPage *pPage){
2218   assert( pPage->aData );
2219   assert( pPage->pBt );
2220   assert( pPage->pDbPage!=0 );
2221   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2222   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2223   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2224   sqlite3PagerUnrefNotNull(pPage->pDbPage);
2225 }
2226 static void releasePage(MemPage *pPage){
2227   if( pPage ) releasePageNotNull(pPage);
2228 }
2229 static void releasePageOne(MemPage *pPage){
2230   assert( pPage!=0 );
2231   assert( pPage->aData );
2232   assert( pPage->pBt );
2233   assert( pPage->pDbPage!=0 );
2234   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2235   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2236   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2237   sqlite3PagerUnrefPageOne(pPage->pDbPage);
2238 }
2239 
2240 /*
2241 ** Get an unused page.
2242 **
2243 ** This works just like btreeGetPage() with the addition:
2244 **
2245 **   *  If the page is already in use for some other purpose, immediately
2246 **      release it and return an SQLITE_CURRUPT error.
2247 **   *  Make sure the isInit flag is clear
2248 */
2249 static int btreeGetUnusedPage(
2250   BtShared *pBt,       /* The btree */
2251   Pgno pgno,           /* Number of the page to fetch */
2252   MemPage **ppPage,    /* Return the page in this parameter */
2253   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2254 ){
2255   int rc = btreeGetPage(pBt, pgno, ppPage, flags);
2256   if( rc==SQLITE_OK ){
2257     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
2258       releasePage(*ppPage);
2259       *ppPage = 0;
2260       return SQLITE_CORRUPT_BKPT;
2261     }
2262     (*ppPage)->isInit = 0;
2263   }else{
2264     *ppPage = 0;
2265   }
2266   return rc;
2267 }
2268 
2269 
2270 /*
2271 ** During a rollback, when the pager reloads information into the cache
2272 ** so that the cache is restored to its original state at the start of
2273 ** the transaction, for each page restored this routine is called.
2274 **
2275 ** This routine needs to reset the extra data section at the end of the
2276 ** page to agree with the restored data.
2277 */
2278 static void pageReinit(DbPage *pData){
2279   MemPage *pPage;
2280   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
2281   assert( sqlite3PagerPageRefcount(pData)>0 );
2282   if( pPage->isInit ){
2283     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2284     pPage->isInit = 0;
2285     if( sqlite3PagerPageRefcount(pData)>1 ){
2286       /* pPage might not be a btree page;  it might be an overflow page
2287       ** or ptrmap page or a free page.  In those cases, the following
2288       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
2289       ** But no harm is done by this.  And it is very important that
2290       ** btreeInitPage() be called on every btree page so we make
2291       ** the call for every page that comes in for re-initing. */
2292       btreeInitPage(pPage);
2293     }
2294   }
2295 }
2296 
2297 /*
2298 ** Invoke the busy handler for a btree.
2299 */
2300 static int btreeInvokeBusyHandler(void *pArg){
2301   BtShared *pBt = (BtShared*)pArg;
2302   assert( pBt->db );
2303   assert( sqlite3_mutex_held(pBt->db->mutex) );
2304   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler,
2305                                   sqlite3PagerFile(pBt->pPager));
2306 }
2307 
2308 /*
2309 ** Open a database file.
2310 **
2311 ** zFilename is the name of the database file.  If zFilename is NULL
2312 ** then an ephemeral database is created.  The ephemeral database might
2313 ** be exclusively in memory, or it might use a disk-based memory cache.
2314 ** Either way, the ephemeral database will be automatically deleted
2315 ** when sqlite3BtreeClose() is called.
2316 **
2317 ** If zFilename is ":memory:" then an in-memory database is created
2318 ** that is automatically destroyed when it is closed.
2319 **
2320 ** The "flags" parameter is a bitmask that might contain bits like
2321 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
2322 **
2323 ** If the database is already opened in the same database connection
2324 ** and we are in shared cache mode, then the open will fail with an
2325 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
2326 ** objects in the same database connection since doing so will lead
2327 ** to problems with locking.
2328 */
2329 int sqlite3BtreeOpen(
2330   sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
2331   const char *zFilename,  /* Name of the file containing the BTree database */
2332   sqlite3 *db,            /* Associated database handle */
2333   Btree **ppBtree,        /* Pointer to new Btree object written here */
2334   int flags,              /* Options */
2335   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
2336 ){
2337   BtShared *pBt = 0;             /* Shared part of btree structure */
2338   Btree *p;                      /* Handle to return */
2339   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
2340   int rc = SQLITE_OK;            /* Result code from this function */
2341   u8 nReserve;                   /* Byte of unused space on each page */
2342   unsigned char zDbHeader[100];  /* Database header content */
2343 
2344   /* True if opening an ephemeral, temporary database */
2345   const int isTempDb = zFilename==0 || zFilename[0]==0;
2346 
2347   /* Set the variable isMemdb to true for an in-memory database, or
2348   ** false for a file-based database.
2349   */
2350 #ifdef SQLITE_OMIT_MEMORYDB
2351   const int isMemdb = 0;
2352 #else
2353   const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
2354                        || (isTempDb && sqlite3TempInMemory(db))
2355                        || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
2356 #endif
2357 
2358   assert( db!=0 );
2359   assert( pVfs!=0 );
2360   assert( sqlite3_mutex_held(db->mutex) );
2361   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
2362 
2363   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
2364   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
2365 
2366   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
2367   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
2368 
2369   if( isMemdb ){
2370     flags |= BTREE_MEMORY;
2371   }
2372   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
2373     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
2374   }
2375   p = sqlite3MallocZero(sizeof(Btree));
2376   if( !p ){
2377     return SQLITE_NOMEM_BKPT;
2378   }
2379   p->inTrans = TRANS_NONE;
2380   p->db = db;
2381 #ifndef SQLITE_OMIT_SHARED_CACHE
2382   p->lock.pBtree = p;
2383   p->lock.iTable = 1;
2384 #endif
2385 
2386 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2387   /*
2388   ** If this Btree is a candidate for shared cache, try to find an
2389   ** existing BtShared object that we can share with
2390   */
2391   if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
2392     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
2393       int nFilename = sqlite3Strlen30(zFilename)+1;
2394       int nFullPathname = pVfs->mxPathname+1;
2395       char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
2396       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2397 
2398       p->sharable = 1;
2399       if( !zFullPathname ){
2400         sqlite3_free(p);
2401         return SQLITE_NOMEM_BKPT;
2402       }
2403       if( isMemdb ){
2404         memcpy(zFullPathname, zFilename, nFilename);
2405       }else{
2406         rc = sqlite3OsFullPathname(pVfs, zFilename,
2407                                    nFullPathname, zFullPathname);
2408         if( rc ){
2409           if( rc==SQLITE_OK_SYMLINK ){
2410             rc = SQLITE_OK;
2411           }else{
2412             sqlite3_free(zFullPathname);
2413             sqlite3_free(p);
2414             return rc;
2415           }
2416         }
2417       }
2418 #if SQLITE_THREADSAFE
2419       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
2420       sqlite3_mutex_enter(mutexOpen);
2421       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
2422       sqlite3_mutex_enter(mutexShared);
2423 #endif
2424       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
2425         assert( pBt->nRef>0 );
2426         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
2427                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
2428           int iDb;
2429           for(iDb=db->nDb-1; iDb>=0; iDb--){
2430             Btree *pExisting = db->aDb[iDb].pBt;
2431             if( pExisting && pExisting->pBt==pBt ){
2432               sqlite3_mutex_leave(mutexShared);
2433               sqlite3_mutex_leave(mutexOpen);
2434               sqlite3_free(zFullPathname);
2435               sqlite3_free(p);
2436               return SQLITE_CONSTRAINT;
2437             }
2438           }
2439           p->pBt = pBt;
2440           pBt->nRef++;
2441           break;
2442         }
2443       }
2444       sqlite3_mutex_leave(mutexShared);
2445       sqlite3_free(zFullPathname);
2446     }
2447 #ifdef SQLITE_DEBUG
2448     else{
2449       /* In debug mode, we mark all persistent databases as sharable
2450       ** even when they are not.  This exercises the locking code and
2451       ** gives more opportunity for asserts(sqlite3_mutex_held())
2452       ** statements to find locking problems.
2453       */
2454       p->sharable = 1;
2455     }
2456 #endif
2457   }
2458 #endif
2459   if( pBt==0 ){
2460     /*
2461     ** The following asserts make sure that structures used by the btree are
2462     ** the right size.  This is to guard against size changes that result
2463     ** when compiling on a different architecture.
2464     */
2465     assert( sizeof(i64)==8 );
2466     assert( sizeof(u64)==8 );
2467     assert( sizeof(u32)==4 );
2468     assert( sizeof(u16)==2 );
2469     assert( sizeof(Pgno)==4 );
2470 
2471     pBt = sqlite3MallocZero( sizeof(*pBt) );
2472     if( pBt==0 ){
2473       rc = SQLITE_NOMEM_BKPT;
2474       goto btree_open_out;
2475     }
2476     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
2477                           sizeof(MemPage), flags, vfsFlags, pageReinit);
2478     if( rc==SQLITE_OK ){
2479       sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
2480       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
2481     }
2482     if( rc!=SQLITE_OK ){
2483       goto btree_open_out;
2484     }
2485     pBt->openFlags = (u8)flags;
2486     pBt->db = db;
2487     sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
2488     p->pBt = pBt;
2489 
2490     pBt->pCursor = 0;
2491     pBt->pPage1 = 0;
2492     if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
2493 #if defined(SQLITE_SECURE_DELETE)
2494     pBt->btsFlags |= BTS_SECURE_DELETE;
2495 #elif defined(SQLITE_FAST_SECURE_DELETE)
2496     pBt->btsFlags |= BTS_OVERWRITE;
2497 #endif
2498     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2499     ** determined by the 2-byte integer located at an offset of 16 bytes from
2500     ** the beginning of the database file. */
2501     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
2502     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
2503          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
2504       pBt->pageSize = 0;
2505 #ifndef SQLITE_OMIT_AUTOVACUUM
2506       /* If the magic name ":memory:" will create an in-memory database, then
2507       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
2508       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
2509       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
2510       ** regular file-name. In this case the auto-vacuum applies as per normal.
2511       */
2512       if( zFilename && !isMemdb ){
2513         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
2514         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
2515       }
2516 #endif
2517       nReserve = 0;
2518     }else{
2519       /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2520       ** determined by the one-byte unsigned integer found at an offset of 20
2521       ** into the database file header. */
2522       nReserve = zDbHeader[20];
2523       pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2524 #ifndef SQLITE_OMIT_AUTOVACUUM
2525       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
2526       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
2527 #endif
2528     }
2529     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2530     if( rc ) goto btree_open_out;
2531     pBt->usableSize = pBt->pageSize - nReserve;
2532     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
2533 
2534 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2535     /* Add the new BtShared object to the linked list sharable BtShareds.
2536     */
2537     pBt->nRef = 1;
2538     if( p->sharable ){
2539       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2540       MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
2541       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
2542         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
2543         if( pBt->mutex==0 ){
2544           rc = SQLITE_NOMEM_BKPT;
2545           goto btree_open_out;
2546         }
2547       }
2548       sqlite3_mutex_enter(mutexShared);
2549       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
2550       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
2551       sqlite3_mutex_leave(mutexShared);
2552     }
2553 #endif
2554   }
2555 
2556 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2557   /* If the new Btree uses a sharable pBtShared, then link the new
2558   ** Btree into the list of all sharable Btrees for the same connection.
2559   ** The list is kept in ascending order by pBt address.
2560   */
2561   if( p->sharable ){
2562     int i;
2563     Btree *pSib;
2564     for(i=0; i<db->nDb; i++){
2565       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
2566         while( pSib->pPrev ){ pSib = pSib->pPrev; }
2567         if( (uptr)p->pBt<(uptr)pSib->pBt ){
2568           p->pNext = pSib;
2569           p->pPrev = 0;
2570           pSib->pPrev = p;
2571         }else{
2572           while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
2573             pSib = pSib->pNext;
2574           }
2575           p->pNext = pSib->pNext;
2576           p->pPrev = pSib;
2577           if( p->pNext ){
2578             p->pNext->pPrev = p;
2579           }
2580           pSib->pNext = p;
2581         }
2582         break;
2583       }
2584     }
2585   }
2586 #endif
2587   *ppBtree = p;
2588 
2589 btree_open_out:
2590   if( rc!=SQLITE_OK ){
2591     if( pBt && pBt->pPager ){
2592       sqlite3PagerClose(pBt->pPager, 0);
2593     }
2594     sqlite3_free(pBt);
2595     sqlite3_free(p);
2596     *ppBtree = 0;
2597   }else{
2598     sqlite3_file *pFile;
2599 
2600     /* If the B-Tree was successfully opened, set the pager-cache size to the
2601     ** default value. Except, when opening on an existing shared pager-cache,
2602     ** do not change the pager-cache size.
2603     */
2604     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
2605       sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
2606     }
2607 
2608     pFile = sqlite3PagerFile(pBt->pPager);
2609     if( pFile->pMethods ){
2610       sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
2611     }
2612   }
2613   if( mutexOpen ){
2614     assert( sqlite3_mutex_held(mutexOpen) );
2615     sqlite3_mutex_leave(mutexOpen);
2616   }
2617   assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
2618   return rc;
2619 }
2620 
2621 /*
2622 ** Decrement the BtShared.nRef counter.  When it reaches zero,
2623 ** remove the BtShared structure from the sharing list.  Return
2624 ** true if the BtShared.nRef counter reaches zero and return
2625 ** false if it is still positive.
2626 */
2627 static int removeFromSharingList(BtShared *pBt){
2628 #ifndef SQLITE_OMIT_SHARED_CACHE
2629   MUTEX_LOGIC( sqlite3_mutex *pMaster; )
2630   BtShared *pList;
2631   int removed = 0;
2632 
2633   assert( sqlite3_mutex_notheld(pBt->mutex) );
2634   MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
2635   sqlite3_mutex_enter(pMaster);
2636   pBt->nRef--;
2637   if( pBt->nRef<=0 ){
2638     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
2639       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
2640     }else{
2641       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
2642       while( ALWAYS(pList) && pList->pNext!=pBt ){
2643         pList=pList->pNext;
2644       }
2645       if( ALWAYS(pList) ){
2646         pList->pNext = pBt->pNext;
2647       }
2648     }
2649     if( SQLITE_THREADSAFE ){
2650       sqlite3_mutex_free(pBt->mutex);
2651     }
2652     removed = 1;
2653   }
2654   sqlite3_mutex_leave(pMaster);
2655   return removed;
2656 #else
2657   return 1;
2658 #endif
2659 }
2660 
2661 /*
2662 ** Make sure pBt->pTmpSpace points to an allocation of
2663 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
2664 ** pointer.
2665 */
2666 static void allocateTempSpace(BtShared *pBt){
2667   if( !pBt->pTmpSpace ){
2668     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2669 
2670     /* One of the uses of pBt->pTmpSpace is to format cells before
2671     ** inserting them into a leaf page (function fillInCell()). If
2672     ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
2673     ** by the various routines that manipulate binary cells. Which
2674     ** can mean that fillInCell() only initializes the first 2 or 3
2675     ** bytes of pTmpSpace, but that the first 4 bytes are copied from
2676     ** it into a database page. This is not actually a problem, but it
2677     ** does cause a valgrind error when the 1 or 2 bytes of unitialized
2678     ** data is passed to system call write(). So to avoid this error,
2679     ** zero the first 4 bytes of temp space here.
2680     **
2681     ** Also:  Provide four bytes of initialized space before the
2682     ** beginning of pTmpSpace as an area available to prepend the
2683     ** left-child pointer to the beginning of a cell.
2684     */
2685     if( pBt->pTmpSpace ){
2686       memset(pBt->pTmpSpace, 0, 8);
2687       pBt->pTmpSpace += 4;
2688     }
2689   }
2690 }
2691 
2692 /*
2693 ** Free the pBt->pTmpSpace allocation
2694 */
2695 static void freeTempSpace(BtShared *pBt){
2696   if( pBt->pTmpSpace ){
2697     pBt->pTmpSpace -= 4;
2698     sqlite3PageFree(pBt->pTmpSpace);
2699     pBt->pTmpSpace = 0;
2700   }
2701 }
2702 
2703 /*
2704 ** Close an open database and invalidate all cursors.
2705 */
2706 int sqlite3BtreeClose(Btree *p){
2707   BtShared *pBt = p->pBt;
2708   BtCursor *pCur;
2709 
2710   /* Close all cursors opened via this handle.  */
2711   assert( sqlite3_mutex_held(p->db->mutex) );
2712   sqlite3BtreeEnter(p);
2713   pCur = pBt->pCursor;
2714   while( pCur ){
2715     BtCursor *pTmp = pCur;
2716     pCur = pCur->pNext;
2717     if( pTmp->pBtree==p ){
2718       sqlite3BtreeCloseCursor(pTmp);
2719     }
2720   }
2721 
2722   /* Rollback any active transaction and free the handle structure.
2723   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2724   ** this handle.
2725   */
2726   sqlite3BtreeRollback(p, SQLITE_OK, 0);
2727   sqlite3BtreeLeave(p);
2728 
2729   /* If there are still other outstanding references to the shared-btree
2730   ** structure, return now. The remainder of this procedure cleans
2731   ** up the shared-btree.
2732   */
2733   assert( p->wantToLock==0 && p->locked==0 );
2734   if( !p->sharable || removeFromSharingList(pBt) ){
2735     /* The pBt is no longer on the sharing list, so we can access
2736     ** it without having to hold the mutex.
2737     **
2738     ** Clean out and delete the BtShared object.
2739     */
2740     assert( !pBt->pCursor );
2741     sqlite3PagerClose(pBt->pPager, p->db);
2742     if( pBt->xFreeSchema && pBt->pSchema ){
2743       pBt->xFreeSchema(pBt->pSchema);
2744     }
2745     sqlite3DbFree(0, pBt->pSchema);
2746     freeTempSpace(pBt);
2747     sqlite3_free(pBt);
2748   }
2749 
2750 #ifndef SQLITE_OMIT_SHARED_CACHE
2751   assert( p->wantToLock==0 );
2752   assert( p->locked==0 );
2753   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2754   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2755 #endif
2756 
2757   sqlite3_free(p);
2758   return SQLITE_OK;
2759 }
2760 
2761 /*
2762 ** Change the "soft" limit on the number of pages in the cache.
2763 ** Unused and unmodified pages will be recycled when the number of
2764 ** pages in the cache exceeds this soft limit.  But the size of the
2765 ** cache is allowed to grow larger than this limit if it contains
2766 ** dirty pages or pages still in active use.
2767 */
2768 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2769   BtShared *pBt = p->pBt;
2770   assert( sqlite3_mutex_held(p->db->mutex) );
2771   sqlite3BtreeEnter(p);
2772   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2773   sqlite3BtreeLeave(p);
2774   return SQLITE_OK;
2775 }
2776 
2777 /*
2778 ** Change the "spill" limit on the number of pages in the cache.
2779 ** If the number of pages exceeds this limit during a write transaction,
2780 ** the pager might attempt to "spill" pages to the journal early in
2781 ** order to free up memory.
2782 **
2783 ** The value returned is the current spill size.  If zero is passed
2784 ** as an argument, no changes are made to the spill size setting, so
2785 ** using mxPage of 0 is a way to query the current spill size.
2786 */
2787 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
2788   BtShared *pBt = p->pBt;
2789   int res;
2790   assert( sqlite3_mutex_held(p->db->mutex) );
2791   sqlite3BtreeEnter(p);
2792   res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
2793   sqlite3BtreeLeave(p);
2794   return res;
2795 }
2796 
2797 #if SQLITE_MAX_MMAP_SIZE>0
2798 /*
2799 ** Change the limit on the amount of the database file that may be
2800 ** memory mapped.
2801 */
2802 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
2803   BtShared *pBt = p->pBt;
2804   assert( sqlite3_mutex_held(p->db->mutex) );
2805   sqlite3BtreeEnter(p);
2806   sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);
2807   sqlite3BtreeLeave(p);
2808   return SQLITE_OK;
2809 }
2810 #endif /* SQLITE_MAX_MMAP_SIZE>0 */
2811 
2812 /*
2813 ** Change the way data is synced to disk in order to increase or decrease
2814 ** how well the database resists damage due to OS crashes and power
2815 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2816 ** there is a high probability of damage)  Level 2 is the default.  There
2817 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2818 ** probability of damage to near zero but with a write performance reduction.
2819 */
2820 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2821 int sqlite3BtreeSetPagerFlags(
2822   Btree *p,              /* The btree to set the safety level on */
2823   unsigned pgFlags       /* Various PAGER_* flags */
2824 ){
2825   BtShared *pBt = p->pBt;
2826   assert( sqlite3_mutex_held(p->db->mutex) );
2827   sqlite3BtreeEnter(p);
2828   sqlite3PagerSetFlags(pBt->pPager, pgFlags);
2829   sqlite3BtreeLeave(p);
2830   return SQLITE_OK;
2831 }
2832 #endif
2833 
2834 /*
2835 ** Change the default pages size and the number of reserved bytes per page.
2836 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2837 ** without changing anything.
2838 **
2839 ** The page size must be a power of 2 between 512 and 65536.  If the page
2840 ** size supplied does not meet this constraint then the page size is not
2841 ** changed.
2842 **
2843 ** Page sizes are constrained to be a power of two so that the region
2844 ** of the database file used for locking (beginning at PENDING_BYTE,
2845 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2846 ** at the beginning of a page.
2847 **
2848 ** If parameter nReserve is less than zero, then the number of reserved
2849 ** bytes per page is left unchanged.
2850 **
2851 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2852 ** and autovacuum mode can no longer be changed.
2853 */
2854 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2855   int rc = SQLITE_OK;
2856   BtShared *pBt = p->pBt;
2857   assert( nReserve>=-1 && nReserve<=255 );
2858   sqlite3BtreeEnter(p);
2859 #if SQLITE_HAS_CODEC
2860   if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;
2861 #endif
2862   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2863     sqlite3BtreeLeave(p);
2864     return SQLITE_READONLY;
2865   }
2866   if( nReserve<0 ){
2867     nReserve = pBt->pageSize - pBt->usableSize;
2868   }
2869   assert( nReserve>=0 && nReserve<=255 );
2870   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2871         ((pageSize-1)&pageSize)==0 ){
2872     assert( (pageSize & 7)==0 );
2873     assert( !pBt->pCursor );
2874     pBt->pageSize = (u32)pageSize;
2875     freeTempSpace(pBt);
2876   }
2877   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2878   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2879   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2880   sqlite3BtreeLeave(p);
2881   return rc;
2882 }
2883 
2884 /*
2885 ** Return the currently defined page size
2886 */
2887 int sqlite3BtreeGetPageSize(Btree *p){
2888   return p->pBt->pageSize;
2889 }
2890 
2891 /*
2892 ** This function is similar to sqlite3BtreeGetReserve(), except that it
2893 ** may only be called if it is guaranteed that the b-tree mutex is already
2894 ** held.
2895 **
2896 ** This is useful in one special case in the backup API code where it is
2897 ** known that the shared b-tree mutex is held, but the mutex on the
2898 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
2899 ** were to be called, it might collide with some other operation on the
2900 ** database handle that owns *p, causing undefined behavior.
2901 */
2902 int sqlite3BtreeGetReserveNoMutex(Btree *p){
2903   int n;
2904   assert( sqlite3_mutex_held(p->pBt->mutex) );
2905   n = p->pBt->pageSize - p->pBt->usableSize;
2906   return n;
2907 }
2908 
2909 /*
2910 ** Return the number of bytes of space at the end of every page that
2911 ** are intentually left unused.  This is the "reserved" space that is
2912 ** sometimes used by extensions.
2913 **
2914 ** If SQLITE_HAS_MUTEX is defined then the number returned is the
2915 ** greater of the current reserved space and the maximum requested
2916 ** reserve space.
2917 */
2918 int sqlite3BtreeGetOptimalReserve(Btree *p){
2919   int n;
2920   sqlite3BtreeEnter(p);
2921   n = sqlite3BtreeGetReserveNoMutex(p);
2922 #ifdef SQLITE_HAS_CODEC
2923   if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;
2924 #endif
2925   sqlite3BtreeLeave(p);
2926   return n;
2927 }
2928 
2929 
2930 /*
2931 ** Set the maximum page count for a database if mxPage is positive.
2932 ** No changes are made if mxPage is 0 or negative.
2933 ** Regardless of the value of mxPage, return the maximum page count.
2934 */
2935 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2936   int n;
2937   sqlite3BtreeEnter(p);
2938   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2939   sqlite3BtreeLeave(p);
2940   return n;
2941 }
2942 
2943 /*
2944 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags:
2945 **
2946 **    newFlag==0       Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared
2947 **    newFlag==1       BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared
2948 **    newFlag==2       BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set
2949 **    newFlag==(-1)    No changes
2950 **
2951 ** This routine acts as a query if newFlag is less than zero
2952 **
2953 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but
2954 ** freelist leaf pages are not written back to the database.  Thus in-page
2955 ** deleted content is cleared, but freelist deleted content is not.
2956 **
2957 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition
2958 ** that freelist leaf pages are written back into the database, increasing
2959 ** the amount of disk I/O.
2960 */
2961 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2962   int b;
2963   if( p==0 ) return 0;
2964   sqlite3BtreeEnter(p);
2965   assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 );
2966   assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) );
2967   if( newFlag>=0 ){
2968     p->pBt->btsFlags &= ~BTS_FAST_SECURE;
2969     p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag;
2970   }
2971   b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE;
2972   sqlite3BtreeLeave(p);
2973   return b;
2974 }
2975 
2976 /*
2977 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2978 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2979 ** is disabled. The default value for the auto-vacuum property is
2980 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2981 */
2982 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2983 #ifdef SQLITE_OMIT_AUTOVACUUM
2984   return SQLITE_READONLY;
2985 #else
2986   BtShared *pBt = p->pBt;
2987   int rc = SQLITE_OK;
2988   u8 av = (u8)autoVacuum;
2989 
2990   sqlite3BtreeEnter(p);
2991   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
2992     rc = SQLITE_READONLY;
2993   }else{
2994     pBt->autoVacuum = av ?1:0;
2995     pBt->incrVacuum = av==2 ?1:0;
2996   }
2997   sqlite3BtreeLeave(p);
2998   return rc;
2999 #endif
3000 }
3001 
3002 /*
3003 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
3004 ** enabled 1 is returned. Otherwise 0.
3005 */
3006 int sqlite3BtreeGetAutoVacuum(Btree *p){
3007 #ifdef SQLITE_OMIT_AUTOVACUUM
3008   return BTREE_AUTOVACUUM_NONE;
3009 #else
3010   int rc;
3011   sqlite3BtreeEnter(p);
3012   rc = (
3013     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
3014     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
3015     BTREE_AUTOVACUUM_INCR
3016   );
3017   sqlite3BtreeLeave(p);
3018   return rc;
3019 #endif
3020 }
3021 
3022 /*
3023 ** If the user has not set the safety-level for this database connection
3024 ** using "PRAGMA synchronous", and if the safety-level is not already
3025 ** set to the value passed to this function as the second parameter,
3026 ** set it so.
3027 */
3028 #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \
3029     && !defined(SQLITE_OMIT_WAL)
3030 static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){
3031   sqlite3 *db;
3032   Db *pDb;
3033   if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){
3034     while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; }
3035     if( pDb->bSyncSet==0
3036      && pDb->safety_level!=safety_level
3037      && pDb!=&db->aDb[1]
3038     ){
3039       pDb->safety_level = safety_level;
3040       sqlite3PagerSetFlags(pBt->pPager,
3041           pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
3042     }
3043   }
3044 }
3045 #else
3046 # define setDefaultSyncFlag(pBt,safety_level)
3047 #endif
3048 
3049 /* Forward declaration */
3050 static int newDatabase(BtShared*);
3051 
3052 
3053 /*
3054 ** Get a reference to pPage1 of the database file.  This will
3055 ** also acquire a readlock on that file.
3056 **
3057 ** SQLITE_OK is returned on success.  If the file is not a
3058 ** well-formed database file, then SQLITE_CORRUPT is returned.
3059 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
3060 ** is returned if we run out of memory.
3061 */
3062 static int lockBtree(BtShared *pBt){
3063   int rc;              /* Result code from subfunctions */
3064   MemPage *pPage1;     /* Page 1 of the database file */
3065   u32 nPage;           /* Number of pages in the database */
3066   u32 nPageFile = 0;   /* Number of pages in the database file */
3067   u32 nPageHeader;     /* Number of pages in the database according to hdr */
3068 
3069   assert( sqlite3_mutex_held(pBt->mutex) );
3070   assert( pBt->pPage1==0 );
3071   rc = sqlite3PagerSharedLock(pBt->pPager);
3072   if( rc!=SQLITE_OK ) return rc;
3073   rc = btreeGetPage(pBt, 1, &pPage1, 0);
3074   if( rc!=SQLITE_OK ) return rc;
3075 
3076   /* Do some checking to help insure the file we opened really is
3077   ** a valid database file.
3078   */
3079   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
3080   sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile);
3081   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
3082     nPage = nPageFile;
3083   }
3084   if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){
3085     nPage = 0;
3086   }
3087   if( nPage>0 ){
3088     u32 pageSize;
3089     u32 usableSize;
3090     u8 *page1 = pPage1->aData;
3091     rc = SQLITE_NOTADB;
3092     /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
3093     ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
3094     ** 61 74 20 33 00. */
3095     if( memcmp(page1, zMagicHeader, 16)!=0 ){
3096       goto page1_init_failed;
3097     }
3098 
3099 #ifdef SQLITE_OMIT_WAL
3100     if( page1[18]>1 ){
3101       pBt->btsFlags |= BTS_READ_ONLY;
3102     }
3103     if( page1[19]>1 ){
3104       goto page1_init_failed;
3105     }
3106 #else
3107     if( page1[18]>2 ){
3108       pBt->btsFlags |= BTS_READ_ONLY;
3109     }
3110     if( page1[19]>2 ){
3111       goto page1_init_failed;
3112     }
3113 
3114     /* If the write version is set to 2, this database should be accessed
3115     ** in WAL mode. If the log is not already open, open it now. Then
3116     ** return SQLITE_OK and return without populating BtShared.pPage1.
3117     ** The caller detects this and calls this function again. This is
3118     ** required as the version of page 1 currently in the page1 buffer
3119     ** may not be the latest version - there may be a newer one in the log
3120     ** file.
3121     */
3122     if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
3123       int isOpen = 0;
3124       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
3125       if( rc!=SQLITE_OK ){
3126         goto page1_init_failed;
3127       }else{
3128         setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1);
3129         if( isOpen==0 ){
3130           releasePageOne(pPage1);
3131           return SQLITE_OK;
3132         }
3133       }
3134       rc = SQLITE_NOTADB;
3135     }else{
3136       setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1);
3137     }
3138 #endif
3139 
3140     /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
3141     ** fractions and the leaf payload fraction values must be 64, 32, and 32.
3142     **
3143     ** The original design allowed these amounts to vary, but as of
3144     ** version 3.6.0, we require them to be fixed.
3145     */
3146     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
3147       goto page1_init_failed;
3148     }
3149     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
3150     ** determined by the 2-byte integer located at an offset of 16 bytes from
3151     ** the beginning of the database file. */
3152     pageSize = (page1[16]<<8) | (page1[17]<<16);
3153     /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
3154     ** between 512 and 65536 inclusive. */
3155     if( ((pageSize-1)&pageSize)!=0
3156      || pageSize>SQLITE_MAX_PAGE_SIZE
3157      || pageSize<=256
3158     ){
3159       goto page1_init_failed;
3160     }
3161     pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3162     assert( (pageSize & 7)==0 );
3163     /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
3164     ** integer at offset 20 is the number of bytes of space at the end of
3165     ** each page to reserve for extensions.
3166     **
3167     ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
3168     ** determined by the one-byte unsigned integer found at an offset of 20
3169     ** into the database file header. */
3170     usableSize = pageSize - page1[20];
3171     if( (u32)pageSize!=pBt->pageSize ){
3172       /* After reading the first page of the database assuming a page size
3173       ** of BtShared.pageSize, we have discovered that the page-size is
3174       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
3175       ** zero and return SQLITE_OK. The caller will call this function
3176       ** again with the correct page-size.
3177       */
3178       releasePageOne(pPage1);
3179       pBt->usableSize = usableSize;
3180       pBt->pageSize = pageSize;
3181       freeTempSpace(pBt);
3182       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
3183                                    pageSize-usableSize);
3184       return rc;
3185     }
3186     if( sqlite3WritableSchema(pBt->db)==0 && nPage>nPageFile ){
3187       rc = SQLITE_CORRUPT_BKPT;
3188       goto page1_init_failed;
3189     }
3190     /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
3191     ** be less than 480. In other words, if the page size is 512, then the
3192     ** reserved space size cannot exceed 32. */
3193     if( usableSize<480 ){
3194       goto page1_init_failed;
3195     }
3196     pBt->pageSize = pageSize;
3197     pBt->usableSize = usableSize;
3198 #ifndef SQLITE_OMIT_AUTOVACUUM
3199     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
3200     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
3201 #endif
3202   }
3203 
3204   /* maxLocal is the maximum amount of payload to store locally for
3205   ** a cell.  Make sure it is small enough so that at least minFanout
3206   ** cells can will fit on one page.  We assume a 10-byte page header.
3207   ** Besides the payload, the cell must store:
3208   **     2-byte pointer to the cell
3209   **     4-byte child pointer
3210   **     9-byte nKey value
3211   **     4-byte nData value
3212   **     4-byte overflow page pointer
3213   ** So a cell consists of a 2-byte pointer, a header which is as much as
3214   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
3215   ** page pointer.
3216   */
3217   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
3218   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
3219   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
3220   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
3221   if( pBt->maxLocal>127 ){
3222     pBt->max1bytePayload = 127;
3223   }else{
3224     pBt->max1bytePayload = (u8)pBt->maxLocal;
3225   }
3226   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
3227   pBt->pPage1 = pPage1;
3228   pBt->nPage = nPage;
3229   return SQLITE_OK;
3230 
3231 page1_init_failed:
3232   releasePageOne(pPage1);
3233   pBt->pPage1 = 0;
3234   return rc;
3235 }
3236 
3237 #ifndef NDEBUG
3238 /*
3239 ** Return the number of cursors open on pBt. This is for use
3240 ** in assert() expressions, so it is only compiled if NDEBUG is not
3241 ** defined.
3242 **
3243 ** Only write cursors are counted if wrOnly is true.  If wrOnly is
3244 ** false then all cursors are counted.
3245 **
3246 ** For the purposes of this routine, a cursor is any cursor that
3247 ** is capable of reading or writing to the database.  Cursors that
3248 ** have been tripped into the CURSOR_FAULT state are not counted.
3249 */
3250 static int countValidCursors(BtShared *pBt, int wrOnly){
3251   BtCursor *pCur;
3252   int r = 0;
3253   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3254     if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
3255      && pCur->eState!=CURSOR_FAULT ) r++;
3256   }
3257   return r;
3258 }
3259 #endif
3260 
3261 /*
3262 ** If there are no outstanding cursors and we are not in the middle
3263 ** of a transaction but there is a read lock on the database, then
3264 ** this routine unrefs the first page of the database file which
3265 ** has the effect of releasing the read lock.
3266 **
3267 ** If there is a transaction in progress, this routine is a no-op.
3268 */
3269 static void unlockBtreeIfUnused(BtShared *pBt){
3270   assert( sqlite3_mutex_held(pBt->mutex) );
3271   assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
3272   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
3273     MemPage *pPage1 = pBt->pPage1;
3274     assert( pPage1->aData );
3275     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
3276     pBt->pPage1 = 0;
3277     releasePageOne(pPage1);
3278   }
3279 }
3280 
3281 /*
3282 ** If pBt points to an empty file then convert that empty file
3283 ** into a new empty database by initializing the first page of
3284 ** the database.
3285 */
3286 static int newDatabase(BtShared *pBt){
3287   MemPage *pP1;
3288   unsigned char *data;
3289   int rc;
3290 
3291   assert( sqlite3_mutex_held(pBt->mutex) );
3292   if( pBt->nPage>0 ){
3293     return SQLITE_OK;
3294   }
3295   pP1 = pBt->pPage1;
3296   assert( pP1!=0 );
3297   data = pP1->aData;
3298   rc = sqlite3PagerWrite(pP1->pDbPage);
3299   if( rc ) return rc;
3300   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
3301   assert( sizeof(zMagicHeader)==16 );
3302   data[16] = (u8)((pBt->pageSize>>8)&0xff);
3303   data[17] = (u8)((pBt->pageSize>>16)&0xff);
3304   data[18] = 1;
3305   data[19] = 1;
3306   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
3307   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
3308   data[21] = 64;
3309   data[22] = 32;
3310   data[23] = 32;
3311   memset(&data[24], 0, 100-24);
3312   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
3313   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3314 #ifndef SQLITE_OMIT_AUTOVACUUM
3315   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
3316   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
3317   put4byte(&data[36 + 4*4], pBt->autoVacuum);
3318   put4byte(&data[36 + 7*4], pBt->incrVacuum);
3319 #endif
3320   pBt->nPage = 1;
3321   data[31] = 1;
3322   return SQLITE_OK;
3323 }
3324 
3325 /*
3326 ** Initialize the first page of the database file (creating a database
3327 ** consisting of a single page and no schema objects). Return SQLITE_OK
3328 ** if successful, or an SQLite error code otherwise.
3329 */
3330 int sqlite3BtreeNewDb(Btree *p){
3331   int rc;
3332   sqlite3BtreeEnter(p);
3333   p->pBt->nPage = 0;
3334   rc = newDatabase(p->pBt);
3335   sqlite3BtreeLeave(p);
3336   return rc;
3337 }
3338 
3339 /*
3340 ** Attempt to start a new transaction. A write-transaction
3341 ** is started if the second argument is nonzero, otherwise a read-
3342 ** transaction.  If the second argument is 2 or more and exclusive
3343 ** transaction is started, meaning that no other process is allowed
3344 ** to access the database.  A preexisting transaction may not be
3345 ** upgraded to exclusive by calling this routine a second time - the
3346 ** exclusivity flag only works for a new transaction.
3347 **
3348 ** A write-transaction must be started before attempting any
3349 ** changes to the database.  None of the following routines
3350 ** will work unless a transaction is started first:
3351 **
3352 **      sqlite3BtreeCreateTable()
3353 **      sqlite3BtreeCreateIndex()
3354 **      sqlite3BtreeClearTable()
3355 **      sqlite3BtreeDropTable()
3356 **      sqlite3BtreeInsert()
3357 **      sqlite3BtreeDelete()
3358 **      sqlite3BtreeUpdateMeta()
3359 **
3360 ** If an initial attempt to acquire the lock fails because of lock contention
3361 ** and the database was previously unlocked, then invoke the busy handler
3362 ** if there is one.  But if there was previously a read-lock, do not
3363 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
3364 ** returned when there is already a read-lock in order to avoid a deadlock.
3365 **
3366 ** Suppose there are two processes A and B.  A has a read lock and B has
3367 ** a reserved lock.  B tries to promote to exclusive but is blocked because
3368 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
3369 ** One or the other of the two processes must give way or there can be
3370 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
3371 ** when A already has a read lock, we encourage A to give up and let B
3372 ** proceed.
3373 */
3374 int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){
3375   BtShared *pBt = p->pBt;
3376   int rc = SQLITE_OK;
3377 
3378   sqlite3BtreeEnter(p);
3379   btreeIntegrity(p);
3380 
3381   /* If the btree is already in a write-transaction, or it
3382   ** is already in a read-transaction and a read-transaction
3383   ** is requested, this is a no-op.
3384   */
3385   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
3386     goto trans_begun;
3387   }
3388   assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );
3389 
3390   if( (p->db->flags & SQLITE_ResetDatabase)
3391    && sqlite3PagerIsreadonly(pBt->pPager)==0
3392   ){
3393     pBt->btsFlags &= ~BTS_READ_ONLY;
3394   }
3395 
3396   /* Write transactions are not possible on a read-only database */
3397   if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
3398     rc = SQLITE_READONLY;
3399     goto trans_begun;
3400   }
3401 
3402 #ifndef SQLITE_OMIT_SHARED_CACHE
3403   {
3404     sqlite3 *pBlock = 0;
3405     /* If another database handle has already opened a write transaction
3406     ** on this shared-btree structure and a second write transaction is
3407     ** requested, return SQLITE_LOCKED.
3408     */
3409     if( (wrflag && pBt->inTransaction==TRANS_WRITE)
3410      || (pBt->btsFlags & BTS_PENDING)!=0
3411     ){
3412       pBlock = pBt->pWriter->db;
3413     }else if( wrflag>1 ){
3414       BtLock *pIter;
3415       for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
3416         if( pIter->pBtree!=p ){
3417           pBlock = pIter->pBtree->db;
3418           break;
3419         }
3420       }
3421     }
3422     if( pBlock ){
3423       sqlite3ConnectionBlocked(p->db, pBlock);
3424       rc = SQLITE_LOCKED_SHAREDCACHE;
3425       goto trans_begun;
3426     }
3427   }
3428 #endif
3429 
3430   /* Any read-only or read-write transaction implies a read-lock on
3431   ** page 1. So if some other shared-cache client already has a write-lock
3432   ** on page 1, the transaction cannot be opened. */
3433   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
3434   if( SQLITE_OK!=rc ) goto trans_begun;
3435 
3436   pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
3437   if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
3438   do {
3439     /* Call lockBtree() until either pBt->pPage1 is populated or
3440     ** lockBtree() returns something other than SQLITE_OK. lockBtree()
3441     ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
3442     ** reading page 1 it discovers that the page-size of the database
3443     ** file is not pBt->pageSize. In this case lockBtree() will update
3444     ** pBt->pageSize to the page-size of the file on disk.
3445     */
3446     while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
3447 
3448     if( rc==SQLITE_OK && wrflag ){
3449       if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
3450         rc = SQLITE_READONLY;
3451       }else{
3452         rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
3453         if( rc==SQLITE_OK ){
3454           rc = newDatabase(pBt);
3455         }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){
3456           /* if there was no transaction opened when this function was
3457           ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error
3458           ** code to SQLITE_BUSY. */
3459           rc = SQLITE_BUSY;
3460         }
3461       }
3462     }
3463 
3464     if( rc!=SQLITE_OK ){
3465       unlockBtreeIfUnused(pBt);
3466     }
3467   }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
3468           btreeInvokeBusyHandler(pBt) );
3469   sqlite3PagerResetLockTimeout(pBt->pPager);
3470 
3471   if( rc==SQLITE_OK ){
3472     if( p->inTrans==TRANS_NONE ){
3473       pBt->nTransaction++;
3474 #ifndef SQLITE_OMIT_SHARED_CACHE
3475       if( p->sharable ){
3476         assert( p->lock.pBtree==p && p->lock.iTable==1 );
3477         p->lock.eLock = READ_LOCK;
3478         p->lock.pNext = pBt->pLock;
3479         pBt->pLock = &p->lock;
3480       }
3481 #endif
3482     }
3483     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
3484     if( p->inTrans>pBt->inTransaction ){
3485       pBt->inTransaction = p->inTrans;
3486     }
3487     if( wrflag ){
3488       MemPage *pPage1 = pBt->pPage1;
3489 #ifndef SQLITE_OMIT_SHARED_CACHE
3490       assert( !pBt->pWriter );
3491       pBt->pWriter = p;
3492       pBt->btsFlags &= ~BTS_EXCLUSIVE;
3493       if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
3494 #endif
3495 
3496       /* If the db-size header field is incorrect (as it may be if an old
3497       ** client has been writing the database file), update it now. Doing
3498       ** this sooner rather than later means the database size can safely
3499       ** re-read the database size from page 1 if a savepoint or transaction
3500       ** rollback occurs within the transaction.
3501       */
3502       if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
3503         rc = sqlite3PagerWrite(pPage1->pDbPage);
3504         if( rc==SQLITE_OK ){
3505           put4byte(&pPage1->aData[28], pBt->nPage);
3506         }
3507       }
3508     }
3509   }
3510 
3511 trans_begun:
3512   if( rc==SQLITE_OK ){
3513     if( pSchemaVersion ){
3514       *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]);
3515     }
3516     if( wrflag ){
3517       /* This call makes sure that the pager has the correct number of
3518       ** open savepoints. If the second parameter is greater than 0 and
3519       ** the sub-journal is not already open, then it will be opened here.
3520       */
3521       rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
3522     }
3523   }
3524 
3525   btreeIntegrity(p);
3526   sqlite3BtreeLeave(p);
3527   return rc;
3528 }
3529 
3530 #ifndef SQLITE_OMIT_AUTOVACUUM
3531 
3532 /*
3533 ** Set the pointer-map entries for all children of page pPage. Also, if
3534 ** pPage contains cells that point to overflow pages, set the pointer
3535 ** map entries for the overflow pages as well.
3536 */
3537 static int setChildPtrmaps(MemPage *pPage){
3538   int i;                             /* Counter variable */
3539   int nCell;                         /* Number of cells in page pPage */
3540   int rc;                            /* Return code */
3541   BtShared *pBt = pPage->pBt;
3542   Pgno pgno = pPage->pgno;
3543 
3544   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3545   rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3546   if( rc!=SQLITE_OK ) return rc;
3547   nCell = pPage->nCell;
3548 
3549   for(i=0; i<nCell; i++){
3550     u8 *pCell = findCell(pPage, i);
3551 
3552     ptrmapPutOvflPtr(pPage, pPage, pCell, &rc);
3553 
3554     if( !pPage->leaf ){
3555       Pgno childPgno = get4byte(pCell);
3556       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3557     }
3558   }
3559 
3560   if( !pPage->leaf ){
3561     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3562     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3563   }
3564 
3565   return rc;
3566 }
3567 
3568 /*
3569 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
3570 ** that it points to iTo. Parameter eType describes the type of pointer to
3571 ** be modified, as  follows:
3572 **
3573 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
3574 **                   page of pPage.
3575 **
3576 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
3577 **                   page pointed to by one of the cells on pPage.
3578 **
3579 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
3580 **                   overflow page in the list.
3581 */
3582 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
3583   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3584   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
3585   if( eType==PTRMAP_OVERFLOW2 ){
3586     /* The pointer is always the first 4 bytes of the page in this case.  */
3587     if( get4byte(pPage->aData)!=iFrom ){
3588       return SQLITE_CORRUPT_PAGE(pPage);
3589     }
3590     put4byte(pPage->aData, iTo);
3591   }else{
3592     int i;
3593     int nCell;
3594     int rc;
3595 
3596     rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3597     if( rc ) return rc;
3598     nCell = pPage->nCell;
3599 
3600     for(i=0; i<nCell; i++){
3601       u8 *pCell = findCell(pPage, i);
3602       if( eType==PTRMAP_OVERFLOW1 ){
3603         CellInfo info;
3604         pPage->xParseCell(pPage, pCell, &info);
3605         if( info.nLocal<info.nPayload ){
3606           if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
3607             return SQLITE_CORRUPT_PAGE(pPage);
3608           }
3609           if( iFrom==get4byte(pCell+info.nSize-4) ){
3610             put4byte(pCell+info.nSize-4, iTo);
3611             break;
3612           }
3613         }
3614       }else{
3615         if( get4byte(pCell)==iFrom ){
3616           put4byte(pCell, iTo);
3617           break;
3618         }
3619       }
3620     }
3621 
3622     if( i==nCell ){
3623       if( eType!=PTRMAP_BTREE ||
3624           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
3625         return SQLITE_CORRUPT_PAGE(pPage);
3626       }
3627       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
3628     }
3629   }
3630   return SQLITE_OK;
3631 }
3632 
3633 
3634 /*
3635 ** Move the open database page pDbPage to location iFreePage in the
3636 ** database. The pDbPage reference remains valid.
3637 **
3638 ** The isCommit flag indicates that there is no need to remember that
3639 ** the journal needs to be sync()ed before database page pDbPage->pgno
3640 ** can be written to. The caller has already promised not to write to that
3641 ** page.
3642 */
3643 static int relocatePage(
3644   BtShared *pBt,           /* Btree */
3645   MemPage *pDbPage,        /* Open page to move */
3646   u8 eType,                /* Pointer map 'type' entry for pDbPage */
3647   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
3648   Pgno iFreePage,          /* The location to move pDbPage to */
3649   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
3650 ){
3651   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
3652   Pgno iDbPage = pDbPage->pgno;
3653   Pager *pPager = pBt->pPager;
3654   int rc;
3655 
3656   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
3657       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
3658   assert( sqlite3_mutex_held(pBt->mutex) );
3659   assert( pDbPage->pBt==pBt );
3660   if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT;
3661 
3662   /* Move page iDbPage from its current location to page number iFreePage */
3663   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
3664       iDbPage, iFreePage, iPtrPage, eType));
3665   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
3666   if( rc!=SQLITE_OK ){
3667     return rc;
3668   }
3669   pDbPage->pgno = iFreePage;
3670 
3671   /* If pDbPage was a btree-page, then it may have child pages and/or cells
3672   ** that point to overflow pages. The pointer map entries for all these
3673   ** pages need to be changed.
3674   **
3675   ** If pDbPage is an overflow page, then the first 4 bytes may store a
3676   ** pointer to a subsequent overflow page. If this is the case, then
3677   ** the pointer map needs to be updated for the subsequent overflow page.
3678   */
3679   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
3680     rc = setChildPtrmaps(pDbPage);
3681     if( rc!=SQLITE_OK ){
3682       return rc;
3683     }
3684   }else{
3685     Pgno nextOvfl = get4byte(pDbPage->aData);
3686     if( nextOvfl!=0 ){
3687       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
3688       if( rc!=SQLITE_OK ){
3689         return rc;
3690       }
3691     }
3692   }
3693 
3694   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
3695   ** that it points at iFreePage. Also fix the pointer map entry for
3696   ** iPtrPage.
3697   */
3698   if( eType!=PTRMAP_ROOTPAGE ){
3699     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
3700     if( rc!=SQLITE_OK ){
3701       return rc;
3702     }
3703     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
3704     if( rc!=SQLITE_OK ){
3705       releasePage(pPtrPage);
3706       return rc;
3707     }
3708     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
3709     releasePage(pPtrPage);
3710     if( rc==SQLITE_OK ){
3711       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
3712     }
3713   }
3714   return rc;
3715 }
3716 
3717 /* Forward declaration required by incrVacuumStep(). */
3718 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
3719 
3720 /*
3721 ** Perform a single step of an incremental-vacuum. If successful, return
3722 ** SQLITE_OK. If there is no work to do (and therefore no point in
3723 ** calling this function again), return SQLITE_DONE. Or, if an error
3724 ** occurs, return some other error code.
3725 **
3726 ** More specifically, this function attempts to re-organize the database so
3727 ** that the last page of the file currently in use is no longer in use.
3728 **
3729 ** Parameter nFin is the number of pages that this database would contain
3730 ** were this function called until it returns SQLITE_DONE.
3731 **
3732 ** If the bCommit parameter is non-zero, this function assumes that the
3733 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
3734 ** or an error. bCommit is passed true for an auto-vacuum-on-commit
3735 ** operation, or false for an incremental vacuum.
3736 */
3737 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
3738   Pgno nFreeList;           /* Number of pages still on the free-list */
3739   int rc;
3740 
3741   assert( sqlite3_mutex_held(pBt->mutex) );
3742   assert( iLastPg>nFin );
3743 
3744   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
3745     u8 eType;
3746     Pgno iPtrPage;
3747 
3748     nFreeList = get4byte(&pBt->pPage1->aData[36]);
3749     if( nFreeList==0 ){
3750       return SQLITE_DONE;
3751     }
3752 
3753     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
3754     if( rc!=SQLITE_OK ){
3755       return rc;
3756     }
3757     if( eType==PTRMAP_ROOTPAGE ){
3758       return SQLITE_CORRUPT_BKPT;
3759     }
3760 
3761     if( eType==PTRMAP_FREEPAGE ){
3762       if( bCommit==0 ){
3763         /* Remove the page from the files free-list. This is not required
3764         ** if bCommit is non-zero. In that case, the free-list will be
3765         ** truncated to zero after this function returns, so it doesn't
3766         ** matter if it still contains some garbage entries.
3767         */
3768         Pgno iFreePg;
3769         MemPage *pFreePg;
3770         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
3771         if( rc!=SQLITE_OK ){
3772           return rc;
3773         }
3774         assert( iFreePg==iLastPg );
3775         releasePage(pFreePg);
3776       }
3777     } else {
3778       Pgno iFreePg;             /* Index of free page to move pLastPg to */
3779       MemPage *pLastPg;
3780       u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
3781       Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
3782 
3783       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
3784       if( rc!=SQLITE_OK ){
3785         return rc;
3786       }
3787 
3788       /* If bCommit is zero, this loop runs exactly once and page pLastPg
3789       ** is swapped with the first free page pulled off the free list.
3790       **
3791       ** On the other hand, if bCommit is greater than zero, then keep
3792       ** looping until a free-page located within the first nFin pages
3793       ** of the file is found.
3794       */
3795       if( bCommit==0 ){
3796         eMode = BTALLOC_LE;
3797         iNear = nFin;
3798       }
3799       do {
3800         MemPage *pFreePg;
3801         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
3802         if( rc!=SQLITE_OK ){
3803           releasePage(pLastPg);
3804           return rc;
3805         }
3806         releasePage(pFreePg);
3807       }while( bCommit && iFreePg>nFin );
3808       assert( iFreePg<iLastPg );
3809 
3810       rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
3811       releasePage(pLastPg);
3812       if( rc!=SQLITE_OK ){
3813         return rc;
3814       }
3815     }
3816   }
3817 
3818   if( bCommit==0 ){
3819     do {
3820       iLastPg--;
3821     }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
3822     pBt->bDoTruncate = 1;
3823     pBt->nPage = iLastPg;
3824   }
3825   return SQLITE_OK;
3826 }
3827 
3828 /*
3829 ** The database opened by the first argument is an auto-vacuum database
3830 ** nOrig pages in size containing nFree free pages. Return the expected
3831 ** size of the database in pages following an auto-vacuum operation.
3832 */
3833 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
3834   int nEntry;                     /* Number of entries on one ptrmap page */
3835   Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
3836   Pgno nFin;                      /* Return value */
3837 
3838   nEntry = pBt->usableSize/5;
3839   nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3840   nFin = nOrig - nFree - nPtrmap;
3841   if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3842     nFin--;
3843   }
3844   while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3845     nFin--;
3846   }
3847 
3848   return nFin;
3849 }
3850 
3851 /*
3852 ** A write-transaction must be opened before calling this function.
3853 ** It performs a single unit of work towards an incremental vacuum.
3854 **
3855 ** If the incremental vacuum is finished after this function has run,
3856 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3857 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3858 */
3859 int sqlite3BtreeIncrVacuum(Btree *p){
3860   int rc;
3861   BtShared *pBt = p->pBt;
3862 
3863   sqlite3BtreeEnter(p);
3864   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3865   if( !pBt->autoVacuum ){
3866     rc = SQLITE_DONE;
3867   }else{
3868     Pgno nOrig = btreePagecount(pBt);
3869     Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
3870     Pgno nFin = finalDbSize(pBt, nOrig, nFree);
3871 
3872     if( nOrig<nFin ){
3873       rc = SQLITE_CORRUPT_BKPT;
3874     }else if( nFree>0 ){
3875       rc = saveAllCursors(pBt, 0, 0);
3876       if( rc==SQLITE_OK ){
3877         invalidateAllOverflowCache(pBt);
3878         rc = incrVacuumStep(pBt, nFin, nOrig, 0);
3879       }
3880       if( rc==SQLITE_OK ){
3881         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3882         put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3883       }
3884     }else{
3885       rc = SQLITE_DONE;
3886     }
3887   }
3888   sqlite3BtreeLeave(p);
3889   return rc;
3890 }
3891 
3892 /*
3893 ** This routine is called prior to sqlite3PagerCommit when a transaction
3894 ** is committed for an auto-vacuum database.
3895 **
3896 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3897 ** the database file should be truncated to during the commit process.
3898 ** i.e. the database has been reorganized so that only the first *pnTrunc
3899 ** pages are in use.
3900 */
3901 static int autoVacuumCommit(BtShared *pBt){
3902   int rc = SQLITE_OK;
3903   Pager *pPager = pBt->pPager;
3904   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
3905 
3906   assert( sqlite3_mutex_held(pBt->mutex) );
3907   invalidateAllOverflowCache(pBt);
3908   assert(pBt->autoVacuum);
3909   if( !pBt->incrVacuum ){
3910     Pgno nFin;         /* Number of pages in database after autovacuuming */
3911     Pgno nFree;        /* Number of pages on the freelist initially */
3912     Pgno iFree;        /* The next page to be freed */
3913     Pgno nOrig;        /* Database size before freeing */
3914 
3915     nOrig = btreePagecount(pBt);
3916     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3917       /* It is not possible to create a database for which the final page
3918       ** is either a pointer-map page or the pending-byte page. If one
3919       ** is encountered, this indicates corruption.
3920       */
3921       return SQLITE_CORRUPT_BKPT;
3922     }
3923 
3924     nFree = get4byte(&pBt->pPage1->aData[36]);
3925     nFin = finalDbSize(pBt, nOrig, nFree);
3926     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3927     if( nFin<nOrig ){
3928       rc = saveAllCursors(pBt, 0, 0);
3929     }
3930     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3931       rc = incrVacuumStep(pBt, nFin, iFree, 1);
3932     }
3933     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3934       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3935       put4byte(&pBt->pPage1->aData[32], 0);
3936       put4byte(&pBt->pPage1->aData[36], 0);
3937       put4byte(&pBt->pPage1->aData[28], nFin);
3938       pBt->bDoTruncate = 1;
3939       pBt->nPage = nFin;
3940     }
3941     if( rc!=SQLITE_OK ){
3942       sqlite3PagerRollback(pPager);
3943     }
3944   }
3945 
3946   assert( nRef>=sqlite3PagerRefcount(pPager) );
3947   return rc;
3948 }
3949 
3950 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3951 # define setChildPtrmaps(x) SQLITE_OK
3952 #endif
3953 
3954 /*
3955 ** This routine does the first phase of a two-phase commit.  This routine
3956 ** causes a rollback journal to be created (if it does not already exist)
3957 ** and populated with enough information so that if a power loss occurs
3958 ** the database can be restored to its original state by playing back
3959 ** the journal.  Then the contents of the journal are flushed out to
3960 ** the disk.  After the journal is safely on oxide, the changes to the
3961 ** database are written into the database file and flushed to oxide.
3962 ** At the end of this call, the rollback journal still exists on the
3963 ** disk and we are still holding all locks, so the transaction has not
3964 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3965 ** commit process.
3966 **
3967 ** This call is a no-op if no write-transaction is currently active on pBt.
3968 **
3969 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3970 ** the name of a master journal file that should be written into the
3971 ** individual journal file, or is NULL, indicating no master journal file
3972 ** (single database transaction).
3973 **
3974 ** When this is called, the master journal should already have been
3975 ** created, populated with this journal pointer and synced to disk.
3976 **
3977 ** Once this is routine has returned, the only thing required to commit
3978 ** the write-transaction for this database file is to delete the journal.
3979 */
3980 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3981   int rc = SQLITE_OK;
3982   if( p->inTrans==TRANS_WRITE ){
3983     BtShared *pBt = p->pBt;
3984     sqlite3BtreeEnter(p);
3985 #ifndef SQLITE_OMIT_AUTOVACUUM
3986     if( pBt->autoVacuum ){
3987       rc = autoVacuumCommit(pBt);
3988       if( rc!=SQLITE_OK ){
3989         sqlite3BtreeLeave(p);
3990         return rc;
3991       }
3992     }
3993     if( pBt->bDoTruncate ){
3994       sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
3995     }
3996 #endif
3997     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3998     sqlite3BtreeLeave(p);
3999   }
4000   return rc;
4001 }
4002 
4003 /*
4004 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
4005 ** at the conclusion of a transaction.
4006 */
4007 static void btreeEndTransaction(Btree *p){
4008   BtShared *pBt = p->pBt;
4009   sqlite3 *db = p->db;
4010   assert( sqlite3BtreeHoldsMutex(p) );
4011 
4012 #ifndef SQLITE_OMIT_AUTOVACUUM
4013   pBt->bDoTruncate = 0;
4014 #endif
4015   if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
4016     /* If there are other active statements that belong to this database
4017     ** handle, downgrade to a read-only transaction. The other statements
4018     ** may still be reading from the database.  */
4019     downgradeAllSharedCacheTableLocks(p);
4020     p->inTrans = TRANS_READ;
4021   }else{
4022     /* If the handle had any kind of transaction open, decrement the
4023     ** transaction count of the shared btree. If the transaction count
4024     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
4025     ** call below will unlock the pager.  */
4026     if( p->inTrans!=TRANS_NONE ){
4027       clearAllSharedCacheTableLocks(p);
4028       pBt->nTransaction--;
4029       if( 0==pBt->nTransaction ){
4030         pBt->inTransaction = TRANS_NONE;
4031       }
4032     }
4033 
4034     /* Set the current transaction state to TRANS_NONE and unlock the
4035     ** pager if this call closed the only read or write transaction.  */
4036     p->inTrans = TRANS_NONE;
4037     unlockBtreeIfUnused(pBt);
4038   }
4039 
4040   btreeIntegrity(p);
4041 }
4042 
4043 /*
4044 ** Commit the transaction currently in progress.
4045 **
4046 ** This routine implements the second phase of a 2-phase commit.  The
4047 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
4048 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
4049 ** routine did all the work of writing information out to disk and flushing the
4050 ** contents so that they are written onto the disk platter.  All this
4051 ** routine has to do is delete or truncate or zero the header in the
4052 ** the rollback journal (which causes the transaction to commit) and
4053 ** drop locks.
4054 **
4055 ** Normally, if an error occurs while the pager layer is attempting to
4056 ** finalize the underlying journal file, this function returns an error and
4057 ** the upper layer will attempt a rollback. However, if the second argument
4058 ** is non-zero then this b-tree transaction is part of a multi-file
4059 ** transaction. In this case, the transaction has already been committed
4060 ** (by deleting a master journal file) and the caller will ignore this
4061 ** functions return code. So, even if an error occurs in the pager layer,
4062 ** reset the b-tree objects internal state to indicate that the write
4063 ** transaction has been closed. This is quite safe, as the pager will have
4064 ** transitioned to the error state.
4065 **
4066 ** This will release the write lock on the database file.  If there
4067 ** are no active cursors, it also releases the read lock.
4068 */
4069 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
4070 
4071   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
4072   sqlite3BtreeEnter(p);
4073   btreeIntegrity(p);
4074 
4075   /* If the handle has a write-transaction open, commit the shared-btrees
4076   ** transaction and set the shared state to TRANS_READ.
4077   */
4078   if( p->inTrans==TRANS_WRITE ){
4079     int rc;
4080     BtShared *pBt = p->pBt;
4081     assert( pBt->inTransaction==TRANS_WRITE );
4082     assert( pBt->nTransaction>0 );
4083     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
4084     if( rc!=SQLITE_OK && bCleanup==0 ){
4085       sqlite3BtreeLeave(p);
4086       return rc;
4087     }
4088     p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
4089     pBt->inTransaction = TRANS_READ;
4090     btreeClearHasContent(pBt);
4091   }
4092 
4093   btreeEndTransaction(p);
4094   sqlite3BtreeLeave(p);
4095   return SQLITE_OK;
4096 }
4097 
4098 /*
4099 ** Do both phases of a commit.
4100 */
4101 int sqlite3BtreeCommit(Btree *p){
4102   int rc;
4103   sqlite3BtreeEnter(p);
4104   rc = sqlite3BtreeCommitPhaseOne(p, 0);
4105   if( rc==SQLITE_OK ){
4106     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
4107   }
4108   sqlite3BtreeLeave(p);
4109   return rc;
4110 }
4111 
4112 /*
4113 ** This routine sets the state to CURSOR_FAULT and the error
4114 ** code to errCode for every cursor on any BtShared that pBtree
4115 ** references.  Or if the writeOnly flag is set to 1, then only
4116 ** trip write cursors and leave read cursors unchanged.
4117 **
4118 ** Every cursor is a candidate to be tripped, including cursors
4119 ** that belong to other database connections that happen to be
4120 ** sharing the cache with pBtree.
4121 **
4122 ** This routine gets called when a rollback occurs. If the writeOnly
4123 ** flag is true, then only write-cursors need be tripped - read-only
4124 ** cursors save their current positions so that they may continue
4125 ** following the rollback. Or, if writeOnly is false, all cursors are
4126 ** tripped. In general, writeOnly is false if the transaction being
4127 ** rolled back modified the database schema. In this case b-tree root
4128 ** pages may be moved or deleted from the database altogether, making
4129 ** it unsafe for read cursors to continue.
4130 **
4131 ** If the writeOnly flag is true and an error is encountered while
4132 ** saving the current position of a read-only cursor, all cursors,
4133 ** including all read-cursors are tripped.
4134 **
4135 ** SQLITE_OK is returned if successful, or if an error occurs while
4136 ** saving a cursor position, an SQLite error code.
4137 */
4138 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
4139   BtCursor *p;
4140   int rc = SQLITE_OK;
4141 
4142   assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
4143   if( pBtree ){
4144     sqlite3BtreeEnter(pBtree);
4145     for(p=pBtree->pBt->pCursor; p; p=p->pNext){
4146       if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
4147         if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
4148           rc = saveCursorPosition(p);
4149           if( rc!=SQLITE_OK ){
4150             (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
4151             break;
4152           }
4153         }
4154       }else{
4155         sqlite3BtreeClearCursor(p);
4156         p->eState = CURSOR_FAULT;
4157         p->skipNext = errCode;
4158       }
4159       btreeReleaseAllCursorPages(p);
4160     }
4161     sqlite3BtreeLeave(pBtree);
4162   }
4163   return rc;
4164 }
4165 
4166 /*
4167 ** Set the pBt->nPage field correctly, according to the current
4168 ** state of the database.  Assume pBt->pPage1 is valid.
4169 */
4170 static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){
4171   int nPage = get4byte(&pPage1->aData[28]);
4172   testcase( nPage==0 );
4173   if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
4174   testcase( pBt->nPage!=nPage );
4175   pBt->nPage = nPage;
4176 }
4177 
4178 /*
4179 ** Rollback the transaction in progress.
4180 **
4181 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
4182 ** Only write cursors are tripped if writeOnly is true but all cursors are
4183 ** tripped if writeOnly is false.  Any attempt to use
4184 ** a tripped cursor will result in an error.
4185 **
4186 ** This will release the write lock on the database file.  If there
4187 ** are no active cursors, it also releases the read lock.
4188 */
4189 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
4190   int rc;
4191   BtShared *pBt = p->pBt;
4192   MemPage *pPage1;
4193 
4194   assert( writeOnly==1 || writeOnly==0 );
4195   assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
4196   sqlite3BtreeEnter(p);
4197   if( tripCode==SQLITE_OK ){
4198     rc = tripCode = saveAllCursors(pBt, 0, 0);
4199     if( rc ) writeOnly = 0;
4200   }else{
4201     rc = SQLITE_OK;
4202   }
4203   if( tripCode ){
4204     int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
4205     assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
4206     if( rc2!=SQLITE_OK ) rc = rc2;
4207   }
4208   btreeIntegrity(p);
4209 
4210   if( p->inTrans==TRANS_WRITE ){
4211     int rc2;
4212 
4213     assert( TRANS_WRITE==pBt->inTransaction );
4214     rc2 = sqlite3PagerRollback(pBt->pPager);
4215     if( rc2!=SQLITE_OK ){
4216       rc = rc2;
4217     }
4218 
4219     /* The rollback may have destroyed the pPage1->aData value.  So
4220     ** call btreeGetPage() on page 1 again to make
4221     ** sure pPage1->aData is set correctly. */
4222     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
4223       btreeSetNPage(pBt, pPage1);
4224       releasePageOne(pPage1);
4225     }
4226     assert( countValidCursors(pBt, 1)==0 );
4227     pBt->inTransaction = TRANS_READ;
4228     btreeClearHasContent(pBt);
4229   }
4230 
4231   btreeEndTransaction(p);
4232   sqlite3BtreeLeave(p);
4233   return rc;
4234 }
4235 
4236 /*
4237 ** Start a statement subtransaction. The subtransaction can be rolled
4238 ** back independently of the main transaction. You must start a transaction
4239 ** before starting a subtransaction. The subtransaction is ended automatically
4240 ** if the main transaction commits or rolls back.
4241 **
4242 ** Statement subtransactions are used around individual SQL statements
4243 ** that are contained within a BEGIN...COMMIT block.  If a constraint
4244 ** error occurs within the statement, the effect of that one statement
4245 ** can be rolled back without having to rollback the entire transaction.
4246 **
4247 ** A statement sub-transaction is implemented as an anonymous savepoint. The
4248 ** value passed as the second parameter is the total number of savepoints,
4249 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
4250 ** are no active savepoints and no other statement-transactions open,
4251 ** iStatement is 1. This anonymous savepoint can be released or rolled back
4252 ** using the sqlite3BtreeSavepoint() function.
4253 */
4254 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
4255   int rc;
4256   BtShared *pBt = p->pBt;
4257   sqlite3BtreeEnter(p);
4258   assert( p->inTrans==TRANS_WRITE );
4259   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
4260   assert( iStatement>0 );
4261   assert( iStatement>p->db->nSavepoint );
4262   assert( pBt->inTransaction==TRANS_WRITE );
4263   /* At the pager level, a statement transaction is a savepoint with
4264   ** an index greater than all savepoints created explicitly using
4265   ** SQL statements. It is illegal to open, release or rollback any
4266   ** such savepoints while the statement transaction savepoint is active.
4267   */
4268   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
4269   sqlite3BtreeLeave(p);
4270   return rc;
4271 }
4272 
4273 /*
4274 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
4275 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
4276 ** savepoint identified by parameter iSavepoint, depending on the value
4277 ** of op.
4278 **
4279 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
4280 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
4281 ** contents of the entire transaction are rolled back. This is different
4282 ** from a normal transaction rollback, as no locks are released and the
4283 ** transaction remains open.
4284 */
4285 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
4286   int rc = SQLITE_OK;
4287   if( p && p->inTrans==TRANS_WRITE ){
4288     BtShared *pBt = p->pBt;
4289     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
4290     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
4291     sqlite3BtreeEnter(p);
4292     if( op==SAVEPOINT_ROLLBACK ){
4293       rc = saveAllCursors(pBt, 0, 0);
4294     }
4295     if( rc==SQLITE_OK ){
4296       rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
4297     }
4298     if( rc==SQLITE_OK ){
4299       if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
4300         pBt->nPage = 0;
4301       }
4302       rc = newDatabase(pBt);
4303       btreeSetNPage(pBt, pBt->pPage1);
4304 
4305       /* pBt->nPage might be zero if the database was corrupt when
4306       ** the transaction was started. Otherwise, it must be at least 1.  */
4307       assert( CORRUPT_DB || pBt->nPage>0 );
4308     }
4309     sqlite3BtreeLeave(p);
4310   }
4311   return rc;
4312 }
4313 
4314 /*
4315 ** Create a new cursor for the BTree whose root is on the page
4316 ** iTable. If a read-only cursor is requested, it is assumed that
4317 ** the caller already has at least a read-only transaction open
4318 ** on the database already. If a write-cursor is requested, then
4319 ** the caller is assumed to have an open write transaction.
4320 **
4321 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
4322 ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
4323 ** can be used for reading or for writing if other conditions for writing
4324 ** are also met.  These are the conditions that must be met in order
4325 ** for writing to be allowed:
4326 **
4327 ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
4328 **
4329 ** 2:  Other database connections that share the same pager cache
4330 **     but which are not in the READ_UNCOMMITTED state may not have
4331 **     cursors open with wrFlag==0 on the same table.  Otherwise
4332 **     the changes made by this write cursor would be visible to
4333 **     the read cursors in the other database connection.
4334 **
4335 ** 3:  The database must be writable (not on read-only media)
4336 **
4337 ** 4:  There must be an active transaction.
4338 **
4339 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
4340 ** is set.  If FORDELETE is set, that is a hint to the implementation that
4341 ** this cursor will only be used to seek to and delete entries of an index
4342 ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
4343 ** this implementation.  But in a hypothetical alternative storage engine
4344 ** in which index entries are automatically deleted when corresponding table
4345 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
4346 ** operations on this cursor can be no-ops and all READ operations can
4347 ** return a null row (2-bytes: 0x01 0x00).
4348 **
4349 ** No checking is done to make sure that page iTable really is the
4350 ** root page of a b-tree.  If it is not, then the cursor acquired
4351 ** will not work correctly.
4352 **
4353 ** It is assumed that the sqlite3BtreeCursorZero() has been called
4354 ** on pCur to initialize the memory space prior to invoking this routine.
4355 */
4356 static int btreeCursor(
4357   Btree *p,                              /* The btree */
4358   int iTable,                            /* Root page of table to open */
4359   int wrFlag,                            /* 1 to write. 0 read-only */
4360   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
4361   BtCursor *pCur                         /* Space for new cursor */
4362 ){
4363   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
4364   BtCursor *pX;                          /* Looping over other all cursors */
4365 
4366   assert( sqlite3BtreeHoldsMutex(p) );
4367   assert( wrFlag==0
4368        || wrFlag==BTREE_WRCSR
4369        || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE)
4370   );
4371 
4372   /* The following assert statements verify that if this is a sharable
4373   ** b-tree database, the connection is holding the required table locks,
4374   ** and that no other connection has any open cursor that conflicts with
4375   ** this lock.  The iTable<1 term disables the check for corrupt schemas. */
4376   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1))
4377           || iTable<1 );
4378   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
4379 
4380   /* Assert that the caller has opened the required transaction. */
4381   assert( p->inTrans>TRANS_NONE );
4382   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
4383   assert( pBt->pPage1 && pBt->pPage1->aData );
4384   assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
4385 
4386   if( wrFlag ){
4387     allocateTempSpace(pBt);
4388     if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
4389   }
4390   if( iTable<=1 ){
4391     if( iTable<1 ){
4392       return SQLITE_CORRUPT_BKPT;
4393     }else if( btreePagecount(pBt)==0 ){
4394       assert( wrFlag==0 );
4395       iTable = 0;
4396     }
4397   }
4398 
4399   /* Now that no other errors can occur, finish filling in the BtCursor
4400   ** variables and link the cursor into the BtShared list.  */
4401   pCur->pgnoRoot = (Pgno)iTable;
4402   pCur->iPage = -1;
4403   pCur->pKeyInfo = pKeyInfo;
4404   pCur->pBtree = p;
4405   pCur->pBt = pBt;
4406   pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;
4407   pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
4408   /* If there are two or more cursors on the same btree, then all such
4409   ** cursors *must* have the BTCF_Multiple flag set. */
4410   for(pX=pBt->pCursor; pX; pX=pX->pNext){
4411     if( pX->pgnoRoot==(Pgno)iTable ){
4412       pX->curFlags |= BTCF_Multiple;
4413       pCur->curFlags |= BTCF_Multiple;
4414     }
4415   }
4416   pCur->pNext = pBt->pCursor;
4417   pBt->pCursor = pCur;
4418   pCur->eState = CURSOR_INVALID;
4419   return SQLITE_OK;
4420 }
4421 static int btreeCursorWithLock(
4422   Btree *p,                              /* The btree */
4423   int iTable,                            /* Root page of table to open */
4424   int wrFlag,                            /* 1 to write. 0 read-only */
4425   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
4426   BtCursor *pCur                         /* Space for new cursor */
4427 ){
4428   int rc;
4429   sqlite3BtreeEnter(p);
4430   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4431   sqlite3BtreeLeave(p);
4432   return rc;
4433 }
4434 int sqlite3BtreeCursor(
4435   Btree *p,                                   /* The btree */
4436   int iTable,                                 /* Root page of table to open */
4437   int wrFlag,                                 /* 1 to write. 0 read-only */
4438   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
4439   BtCursor *pCur                              /* Write new cursor here */
4440 ){
4441   if( p->sharable ){
4442     return btreeCursorWithLock(p, iTable, wrFlag, pKeyInfo, pCur);
4443   }else{
4444     return btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4445   }
4446 }
4447 
4448 /*
4449 ** Return the size of a BtCursor object in bytes.
4450 **
4451 ** This interfaces is needed so that users of cursors can preallocate
4452 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
4453 ** to users so they cannot do the sizeof() themselves - they must call
4454 ** this routine.
4455 */
4456 int sqlite3BtreeCursorSize(void){
4457   return ROUND8(sizeof(BtCursor));
4458 }
4459 
4460 /*
4461 ** Initialize memory that will be converted into a BtCursor object.
4462 **
4463 ** The simple approach here would be to memset() the entire object
4464 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
4465 ** do not need to be zeroed and they are large, so we can save a lot
4466 ** of run-time by skipping the initialization of those elements.
4467 */
4468 void sqlite3BtreeCursorZero(BtCursor *p){
4469   memset(p, 0, offsetof(BtCursor, BTCURSOR_FIRST_UNINIT));
4470 }
4471 
4472 /*
4473 ** Close a cursor.  The read lock on the database file is released
4474 ** when the last cursor is closed.
4475 */
4476 int sqlite3BtreeCloseCursor(BtCursor *pCur){
4477   Btree *pBtree = pCur->pBtree;
4478   if( pBtree ){
4479     BtShared *pBt = pCur->pBt;
4480     sqlite3BtreeEnter(pBtree);
4481     assert( pBt->pCursor!=0 );
4482     if( pBt->pCursor==pCur ){
4483       pBt->pCursor = pCur->pNext;
4484     }else{
4485       BtCursor *pPrev = pBt->pCursor;
4486       do{
4487         if( pPrev->pNext==pCur ){
4488           pPrev->pNext = pCur->pNext;
4489           break;
4490         }
4491         pPrev = pPrev->pNext;
4492       }while( ALWAYS(pPrev) );
4493     }
4494     btreeReleaseAllCursorPages(pCur);
4495     unlockBtreeIfUnused(pBt);
4496     sqlite3_free(pCur->aOverflow);
4497     sqlite3_free(pCur->pKey);
4498     sqlite3BtreeLeave(pBtree);
4499     pCur->pBtree = 0;
4500   }
4501   return SQLITE_OK;
4502 }
4503 
4504 /*
4505 ** Make sure the BtCursor* given in the argument has a valid
4506 ** BtCursor.info structure.  If it is not already valid, call
4507 ** btreeParseCell() to fill it in.
4508 **
4509 ** BtCursor.info is a cache of the information in the current cell.
4510 ** Using this cache reduces the number of calls to btreeParseCell().
4511 */
4512 #ifndef NDEBUG
4513   static int cellInfoEqual(CellInfo *a, CellInfo *b){
4514     if( a->nKey!=b->nKey ) return 0;
4515     if( a->pPayload!=b->pPayload ) return 0;
4516     if( a->nPayload!=b->nPayload ) return 0;
4517     if( a->nLocal!=b->nLocal ) return 0;
4518     if( a->nSize!=b->nSize ) return 0;
4519     return 1;
4520   }
4521   static void assertCellInfo(BtCursor *pCur){
4522     CellInfo info;
4523     memset(&info, 0, sizeof(info));
4524     btreeParseCell(pCur->pPage, pCur->ix, &info);
4525     assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) );
4526   }
4527 #else
4528   #define assertCellInfo(x)
4529 #endif
4530 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
4531   if( pCur->info.nSize==0 ){
4532     pCur->curFlags |= BTCF_ValidNKey;
4533     btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
4534   }else{
4535     assertCellInfo(pCur);
4536   }
4537 }
4538 
4539 #ifndef NDEBUG  /* The next routine used only within assert() statements */
4540 /*
4541 ** Return true if the given BtCursor is valid.  A valid cursor is one
4542 ** that is currently pointing to a row in a (non-empty) table.
4543 ** This is a verification routine is used only within assert() statements.
4544 */
4545 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
4546   return pCur && pCur->eState==CURSOR_VALID;
4547 }
4548 #endif /* NDEBUG */
4549 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
4550   assert( pCur!=0 );
4551   return pCur->eState==CURSOR_VALID;
4552 }
4553 
4554 /*
4555 ** Return the value of the integer key or "rowid" for a table btree.
4556 ** This routine is only valid for a cursor that is pointing into a
4557 ** ordinary table btree.  If the cursor points to an index btree or
4558 ** is invalid, the result of this routine is undefined.
4559 */
4560 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
4561   assert( cursorHoldsMutex(pCur) );
4562   assert( pCur->eState==CURSOR_VALID );
4563   assert( pCur->curIntKey );
4564   getCellInfo(pCur);
4565   return pCur->info.nKey;
4566 }
4567 
4568 /*
4569 ** Pin or unpin a cursor.
4570 */
4571 void sqlite3BtreeCursorPin(BtCursor *pCur){
4572   assert( (pCur->curFlags & BTCF_Pinned)==0 );
4573   pCur->curFlags |= BTCF_Pinned;
4574 }
4575 void sqlite3BtreeCursorUnpin(BtCursor *pCur){
4576   assert( (pCur->curFlags & BTCF_Pinned)!=0 );
4577   pCur->curFlags &= ~BTCF_Pinned;
4578 }
4579 
4580 #ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC
4581 /*
4582 ** Return the offset into the database file for the start of the
4583 ** payload to which the cursor is pointing.
4584 */
4585 i64 sqlite3BtreeOffset(BtCursor *pCur){
4586   assert( cursorHoldsMutex(pCur) );
4587   assert( pCur->eState==CURSOR_VALID );
4588   getCellInfo(pCur);
4589   return (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1) +
4590          (i64)(pCur->info.pPayload - pCur->pPage->aData);
4591 }
4592 #endif /* SQLITE_ENABLE_OFFSET_SQL_FUNC */
4593 
4594 /*
4595 ** Return the number of bytes of payload for the entry that pCur is
4596 ** currently pointing to.  For table btrees, this will be the amount
4597 ** of data.  For index btrees, this will be the size of the key.
4598 **
4599 ** The caller must guarantee that the cursor is pointing to a non-NULL
4600 ** valid entry.  In other words, the calling procedure must guarantee
4601 ** that the cursor has Cursor.eState==CURSOR_VALID.
4602 */
4603 u32 sqlite3BtreePayloadSize(BtCursor *pCur){
4604   assert( cursorHoldsMutex(pCur) );
4605   assert( pCur->eState==CURSOR_VALID );
4606   getCellInfo(pCur);
4607   return pCur->info.nPayload;
4608 }
4609 
4610 /*
4611 ** Return an upper bound on the size of any record for the table
4612 ** that the cursor is pointing into.
4613 **
4614 ** This is an optimization.  Everything will still work if this
4615 ** routine always returns 2147483647 (which is the largest record
4616 ** that SQLite can handle) or more.  But returning a smaller value might
4617 ** prevent large memory allocations when trying to interpret a
4618 ** corrupt datrabase.
4619 **
4620 ** The current implementation merely returns the size of the underlying
4621 ** database file.
4622 */
4623 sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){
4624   assert( cursorHoldsMutex(pCur) );
4625   assert( pCur->eState==CURSOR_VALID );
4626   return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage;
4627 }
4628 
4629 /*
4630 ** Given the page number of an overflow page in the database (parameter
4631 ** ovfl), this function finds the page number of the next page in the
4632 ** linked list of overflow pages. If possible, it uses the auto-vacuum
4633 ** pointer-map data instead of reading the content of page ovfl to do so.
4634 **
4635 ** If an error occurs an SQLite error code is returned. Otherwise:
4636 **
4637 ** The page number of the next overflow page in the linked list is
4638 ** written to *pPgnoNext. If page ovfl is the last page in its linked
4639 ** list, *pPgnoNext is set to zero.
4640 **
4641 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
4642 ** to page number pOvfl was obtained, then *ppPage is set to point to that
4643 ** reference. It is the responsibility of the caller to call releasePage()
4644 ** on *ppPage to free the reference. In no reference was obtained (because
4645 ** the pointer-map was used to obtain the value for *pPgnoNext), then
4646 ** *ppPage is set to zero.
4647 */
4648 static int getOverflowPage(
4649   BtShared *pBt,               /* The database file */
4650   Pgno ovfl,                   /* Current overflow page number */
4651   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
4652   Pgno *pPgnoNext              /* OUT: Next overflow page number */
4653 ){
4654   Pgno next = 0;
4655   MemPage *pPage = 0;
4656   int rc = SQLITE_OK;
4657 
4658   assert( sqlite3_mutex_held(pBt->mutex) );
4659   assert(pPgnoNext);
4660 
4661 #ifndef SQLITE_OMIT_AUTOVACUUM
4662   /* Try to find the next page in the overflow list using the
4663   ** autovacuum pointer-map pages. Guess that the next page in
4664   ** the overflow list is page number (ovfl+1). If that guess turns
4665   ** out to be wrong, fall back to loading the data of page
4666   ** number ovfl to determine the next page number.
4667   */
4668   if( pBt->autoVacuum ){
4669     Pgno pgno;
4670     Pgno iGuess = ovfl+1;
4671     u8 eType;
4672 
4673     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
4674       iGuess++;
4675     }
4676 
4677     if( iGuess<=btreePagecount(pBt) ){
4678       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
4679       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
4680         next = iGuess;
4681         rc = SQLITE_DONE;
4682       }
4683     }
4684   }
4685 #endif
4686 
4687   assert( next==0 || rc==SQLITE_DONE );
4688   if( rc==SQLITE_OK ){
4689     rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
4690     assert( rc==SQLITE_OK || pPage==0 );
4691     if( rc==SQLITE_OK ){
4692       next = get4byte(pPage->aData);
4693     }
4694   }
4695 
4696   *pPgnoNext = next;
4697   if( ppPage ){
4698     *ppPage = pPage;
4699   }else{
4700     releasePage(pPage);
4701   }
4702   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
4703 }
4704 
4705 /*
4706 ** Copy data from a buffer to a page, or from a page to a buffer.
4707 **
4708 ** pPayload is a pointer to data stored on database page pDbPage.
4709 ** If argument eOp is false, then nByte bytes of data are copied
4710 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
4711 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
4712 ** of data are copied from the buffer pBuf to pPayload.
4713 **
4714 ** SQLITE_OK is returned on success, otherwise an error code.
4715 */
4716 static int copyPayload(
4717   void *pPayload,           /* Pointer to page data */
4718   void *pBuf,               /* Pointer to buffer */
4719   int nByte,                /* Number of bytes to copy */
4720   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
4721   DbPage *pDbPage           /* Page containing pPayload */
4722 ){
4723   if( eOp ){
4724     /* Copy data from buffer to page (a write operation) */
4725     int rc = sqlite3PagerWrite(pDbPage);
4726     if( rc!=SQLITE_OK ){
4727       return rc;
4728     }
4729     memcpy(pPayload, pBuf, nByte);
4730   }else{
4731     /* Copy data from page to buffer (a read operation) */
4732     memcpy(pBuf, pPayload, nByte);
4733   }
4734   return SQLITE_OK;
4735 }
4736 
4737 /*
4738 ** This function is used to read or overwrite payload information
4739 ** for the entry that the pCur cursor is pointing to. The eOp
4740 ** argument is interpreted as follows:
4741 **
4742 **   0: The operation is a read. Populate the overflow cache.
4743 **   1: The operation is a write. Populate the overflow cache.
4744 **
4745 ** A total of "amt" bytes are read or written beginning at "offset".
4746 ** Data is read to or from the buffer pBuf.
4747 **
4748 ** The content being read or written might appear on the main page
4749 ** or be scattered out on multiple overflow pages.
4750 **
4751 ** If the current cursor entry uses one or more overflow pages
4752 ** this function may allocate space for and lazily populate
4753 ** the overflow page-list cache array (BtCursor.aOverflow).
4754 ** Subsequent calls use this cache to make seeking to the supplied offset
4755 ** more efficient.
4756 **
4757 ** Once an overflow page-list cache has been allocated, it must be
4758 ** invalidated if some other cursor writes to the same table, or if
4759 ** the cursor is moved to a different row. Additionally, in auto-vacuum
4760 ** mode, the following events may invalidate an overflow page-list cache.
4761 **
4762 **   * An incremental vacuum,
4763 **   * A commit in auto_vacuum="full" mode,
4764 **   * Creating a table (may require moving an overflow page).
4765 */
4766 static int accessPayload(
4767   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4768   u32 offset,          /* Begin reading this far into payload */
4769   u32 amt,             /* Read this many bytes */
4770   unsigned char *pBuf, /* Write the bytes into this buffer */
4771   int eOp              /* zero to read. non-zero to write. */
4772 ){
4773   unsigned char *aPayload;
4774   int rc = SQLITE_OK;
4775   int iIdx = 0;
4776   MemPage *pPage = pCur->pPage;               /* Btree page of current entry */
4777   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
4778 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4779   unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
4780 #endif
4781 
4782   assert( pPage );
4783   assert( eOp==0 || eOp==1 );
4784   assert( pCur->eState==CURSOR_VALID );
4785   assert( pCur->ix<pPage->nCell );
4786   assert( cursorHoldsMutex(pCur) );
4787 
4788   getCellInfo(pCur);
4789   aPayload = pCur->info.pPayload;
4790   assert( offset+amt <= pCur->info.nPayload );
4791 
4792   assert( aPayload > pPage->aData );
4793   if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
4794     /* Trying to read or write past the end of the data is an error.  The
4795     ** conditional above is really:
4796     **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
4797     ** but is recast into its current form to avoid integer overflow problems
4798     */
4799     return SQLITE_CORRUPT_PAGE(pPage);
4800   }
4801 
4802   /* Check if data must be read/written to/from the btree page itself. */
4803   if( offset<pCur->info.nLocal ){
4804     int a = amt;
4805     if( a+offset>pCur->info.nLocal ){
4806       a = pCur->info.nLocal - offset;
4807     }
4808     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
4809     offset = 0;
4810     pBuf += a;
4811     amt -= a;
4812   }else{
4813     offset -= pCur->info.nLocal;
4814   }
4815 
4816 
4817   if( rc==SQLITE_OK && amt>0 ){
4818     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
4819     Pgno nextPage;
4820 
4821     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
4822 
4823     /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
4824     **
4825     ** The aOverflow[] array is sized at one entry for each overflow page
4826     ** in the overflow chain. The page number of the first overflow page is
4827     ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
4828     ** means "not yet known" (the cache is lazily populated).
4829     */
4830     if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
4831       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
4832       if( pCur->aOverflow==0
4833        || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow)
4834       ){
4835         Pgno *aNew = (Pgno*)sqlite3Realloc(
4836             pCur->aOverflow, nOvfl*2*sizeof(Pgno)
4837         );
4838         if( aNew==0 ){
4839           return SQLITE_NOMEM_BKPT;
4840         }else{
4841           pCur->aOverflow = aNew;
4842         }
4843       }
4844       memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
4845       pCur->curFlags |= BTCF_ValidOvfl;
4846     }else{
4847       /* If the overflow page-list cache has been allocated and the
4848       ** entry for the first required overflow page is valid, skip
4849       ** directly to it.
4850       */
4851       if( pCur->aOverflow[offset/ovflSize] ){
4852         iIdx = (offset/ovflSize);
4853         nextPage = pCur->aOverflow[iIdx];
4854         offset = (offset%ovflSize);
4855       }
4856     }
4857 
4858     assert( rc==SQLITE_OK && amt>0 );
4859     while( nextPage ){
4860       /* If required, populate the overflow page-list cache. */
4861       assert( pCur->aOverflow[iIdx]==0
4862               || pCur->aOverflow[iIdx]==nextPage
4863               || CORRUPT_DB );
4864       pCur->aOverflow[iIdx] = nextPage;
4865 
4866       if( offset>=ovflSize ){
4867         /* The only reason to read this page is to obtain the page
4868         ** number for the next page in the overflow chain. The page
4869         ** data is not required. So first try to lookup the overflow
4870         ** page-list cache, if any, then fall back to the getOverflowPage()
4871         ** function.
4872         */
4873         assert( pCur->curFlags & BTCF_ValidOvfl );
4874         assert( pCur->pBtree->db==pBt->db );
4875         if( pCur->aOverflow[iIdx+1] ){
4876           nextPage = pCur->aOverflow[iIdx+1];
4877         }else{
4878           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
4879         }
4880         offset -= ovflSize;
4881       }else{
4882         /* Need to read this page properly. It contains some of the
4883         ** range of data that is being read (eOp==0) or written (eOp!=0).
4884         */
4885         int a = amt;
4886         if( a + offset > ovflSize ){
4887           a = ovflSize - offset;
4888         }
4889 
4890 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4891         /* If all the following are true:
4892         **
4893         **   1) this is a read operation, and
4894         **   2) data is required from the start of this overflow page, and
4895         **   3) there are no dirty pages in the page-cache
4896         **   4) the database is file-backed, and
4897         **   5) the page is not in the WAL file
4898         **   6) at least 4 bytes have already been read into the output buffer
4899         **
4900         ** then data can be read directly from the database file into the
4901         ** output buffer, bypassing the page-cache altogether. This speeds
4902         ** up loading large records that span many overflow pages.
4903         */
4904         if( eOp==0                                             /* (1) */
4905          && offset==0                                          /* (2) */
4906          && sqlite3PagerDirectReadOk(pBt->pPager, nextPage)    /* (3,4,5) */
4907          && &pBuf[-4]>=pBufStart                               /* (6) */
4908         ){
4909           sqlite3_file *fd = sqlite3PagerFile(pBt->pPager);
4910           u8 aSave[4];
4911           u8 *aWrite = &pBuf[-4];
4912           assert( aWrite>=pBufStart );                         /* due to (6) */
4913           memcpy(aSave, aWrite, 4);
4914           rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
4915           if( rc && nextPage>pBt->nPage ) rc = SQLITE_CORRUPT_BKPT;
4916           nextPage = get4byte(aWrite);
4917           memcpy(aWrite, aSave, 4);
4918         }else
4919 #endif
4920 
4921         {
4922           DbPage *pDbPage;
4923           rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
4924               (eOp==0 ? PAGER_GET_READONLY : 0)
4925           );
4926           if( rc==SQLITE_OK ){
4927             aPayload = sqlite3PagerGetData(pDbPage);
4928             nextPage = get4byte(aPayload);
4929             rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
4930             sqlite3PagerUnref(pDbPage);
4931             offset = 0;
4932           }
4933         }
4934         amt -= a;
4935         if( amt==0 ) return rc;
4936         pBuf += a;
4937       }
4938       if( rc ) break;
4939       iIdx++;
4940     }
4941   }
4942 
4943   if( rc==SQLITE_OK && amt>0 ){
4944     /* Overflow chain ends prematurely */
4945     return SQLITE_CORRUPT_PAGE(pPage);
4946   }
4947   return rc;
4948 }
4949 
4950 /*
4951 ** Read part of the payload for the row at which that cursor pCur is currently
4952 ** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
4953 ** begins at "offset".
4954 **
4955 ** pCur can be pointing to either a table or an index b-tree.
4956 ** If pointing to a table btree, then the content section is read.  If
4957 ** pCur is pointing to an index b-tree then the key section is read.
4958 **
4959 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
4960 ** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
4961 ** cursor might be invalid or might need to be restored before being read.
4962 **
4963 ** Return SQLITE_OK on success or an error code if anything goes
4964 ** wrong.  An error is returned if "offset+amt" is larger than
4965 ** the available payload.
4966 */
4967 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4968   assert( cursorHoldsMutex(pCur) );
4969   assert( pCur->eState==CURSOR_VALID );
4970   assert( pCur->iPage>=0 && pCur->pPage );
4971   assert( pCur->ix<pCur->pPage->nCell );
4972   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
4973 }
4974 
4975 /*
4976 ** This variant of sqlite3BtreePayload() works even if the cursor has not
4977 ** in the CURSOR_VALID state.  It is only used by the sqlite3_blob_read()
4978 ** interface.
4979 */
4980 #ifndef SQLITE_OMIT_INCRBLOB
4981 static SQLITE_NOINLINE int accessPayloadChecked(
4982   BtCursor *pCur,
4983   u32 offset,
4984   u32 amt,
4985   void *pBuf
4986 ){
4987   int rc;
4988   if ( pCur->eState==CURSOR_INVALID ){
4989     return SQLITE_ABORT;
4990   }
4991   assert( cursorOwnsBtShared(pCur) );
4992   rc = btreeRestoreCursorPosition(pCur);
4993   return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);
4994 }
4995 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4996   if( pCur->eState==CURSOR_VALID ){
4997     assert( cursorOwnsBtShared(pCur) );
4998     return accessPayload(pCur, offset, amt, pBuf, 0);
4999   }else{
5000     return accessPayloadChecked(pCur, offset, amt, pBuf);
5001   }
5002 }
5003 #endif /* SQLITE_OMIT_INCRBLOB */
5004 
5005 /*
5006 ** Return a pointer to payload information from the entry that the
5007 ** pCur cursor is pointing to.  The pointer is to the beginning of
5008 ** the key if index btrees (pPage->intKey==0) and is the data for
5009 ** table btrees (pPage->intKey==1). The number of bytes of available
5010 ** key/data is written into *pAmt.  If *pAmt==0, then the value
5011 ** returned will not be a valid pointer.
5012 **
5013 ** This routine is an optimization.  It is common for the entire key
5014 ** and data to fit on the local page and for there to be no overflow
5015 ** pages.  When that is so, this routine can be used to access the
5016 ** key and data without making a copy.  If the key and/or data spills
5017 ** onto overflow pages, then accessPayload() must be used to reassemble
5018 ** the key/data and copy it into a preallocated buffer.
5019 **
5020 ** The pointer returned by this routine looks directly into the cached
5021 ** page of the database.  The data might change or move the next time
5022 ** any btree routine is called.
5023 */
5024 static const void *fetchPayload(
5025   BtCursor *pCur,      /* Cursor pointing to entry to read from */
5026   u32 *pAmt            /* Write the number of available bytes here */
5027 ){
5028   int amt;
5029   assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage);
5030   assert( pCur->eState==CURSOR_VALID );
5031   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5032   assert( cursorOwnsBtShared(pCur) );
5033   assert( pCur->ix<pCur->pPage->nCell );
5034   assert( pCur->info.nSize>0 );
5035   assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB );
5036   assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB);
5037   amt = pCur->info.nLocal;
5038   if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){
5039     /* There is too little space on the page for the expected amount
5040     ** of local content. Database must be corrupt. */
5041     assert( CORRUPT_DB );
5042     amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload));
5043   }
5044   *pAmt = (u32)amt;
5045   return (void*)pCur->info.pPayload;
5046 }
5047 
5048 
5049 /*
5050 ** For the entry that cursor pCur is point to, return as
5051 ** many bytes of the key or data as are available on the local
5052 ** b-tree page.  Write the number of available bytes into *pAmt.
5053 **
5054 ** The pointer returned is ephemeral.  The key/data may move
5055 ** or be destroyed on the next call to any Btree routine,
5056 ** including calls from other threads against the same cache.
5057 ** Hence, a mutex on the BtShared should be held prior to calling
5058 ** this routine.
5059 **
5060 ** These routines is used to get quick access to key and data
5061 ** in the common case where no overflow pages are used.
5062 */
5063 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
5064   return fetchPayload(pCur, pAmt);
5065 }
5066 
5067 
5068 /*
5069 ** Move the cursor down to a new child page.  The newPgno argument is the
5070 ** page number of the child page to move to.
5071 **
5072 ** This function returns SQLITE_CORRUPT if the page-header flags field of
5073 ** the new child page does not match the flags field of the parent (i.e.
5074 ** if an intkey page appears to be the parent of a non-intkey page, or
5075 ** vice-versa).
5076 */
5077 static int moveToChild(BtCursor *pCur, u32 newPgno){
5078   BtShared *pBt = pCur->pBt;
5079 
5080   assert( cursorOwnsBtShared(pCur) );
5081   assert( pCur->eState==CURSOR_VALID );
5082   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
5083   assert( pCur->iPage>=0 );
5084   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
5085     return SQLITE_CORRUPT_BKPT;
5086   }
5087   pCur->info.nSize = 0;
5088   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5089   pCur->aiIdx[pCur->iPage] = pCur->ix;
5090   pCur->apPage[pCur->iPage] = pCur->pPage;
5091   pCur->ix = 0;
5092   pCur->iPage++;
5093   return getAndInitPage(pBt, newPgno, &pCur->pPage, pCur, pCur->curPagerFlags);
5094 }
5095 
5096 #ifdef SQLITE_DEBUG
5097 /*
5098 ** Page pParent is an internal (non-leaf) tree page. This function
5099 ** asserts that page number iChild is the left-child if the iIdx'th
5100 ** cell in page pParent. Or, if iIdx is equal to the total number of
5101 ** cells in pParent, that page number iChild is the right-child of
5102 ** the page.
5103 */
5104 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
5105   if( CORRUPT_DB ) return;  /* The conditions tested below might not be true
5106                             ** in a corrupt database */
5107   assert( iIdx<=pParent->nCell );
5108   if( iIdx==pParent->nCell ){
5109     assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
5110   }else{
5111     assert( get4byte(findCell(pParent, iIdx))==iChild );
5112   }
5113 }
5114 #else
5115 #  define assertParentIndex(x,y,z)
5116 #endif
5117 
5118 /*
5119 ** Move the cursor up to the parent page.
5120 **
5121 ** pCur->idx is set to the cell index that contains the pointer
5122 ** to the page we are coming from.  If we are coming from the
5123 ** right-most child page then pCur->idx is set to one more than
5124 ** the largest cell index.
5125 */
5126 static void moveToParent(BtCursor *pCur){
5127   MemPage *pLeaf;
5128   assert( cursorOwnsBtShared(pCur) );
5129   assert( pCur->eState==CURSOR_VALID );
5130   assert( pCur->iPage>0 );
5131   assert( pCur->pPage );
5132   assertParentIndex(
5133     pCur->apPage[pCur->iPage-1],
5134     pCur->aiIdx[pCur->iPage-1],
5135     pCur->pPage->pgno
5136   );
5137   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
5138   pCur->info.nSize = 0;
5139   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5140   pCur->ix = pCur->aiIdx[pCur->iPage-1];
5141   pLeaf = pCur->pPage;
5142   pCur->pPage = pCur->apPage[--pCur->iPage];
5143   releasePageNotNull(pLeaf);
5144 }
5145 
5146 /*
5147 ** Move the cursor to point to the root page of its b-tree structure.
5148 **
5149 ** If the table has a virtual root page, then the cursor is moved to point
5150 ** to the virtual root page instead of the actual root page. A table has a
5151 ** virtual root page when the actual root page contains no cells and a
5152 ** single child page. This can only happen with the table rooted at page 1.
5153 **
5154 ** If the b-tree structure is empty, the cursor state is set to
5155 ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise,
5156 ** the cursor is set to point to the first cell located on the root
5157 ** (or virtual root) page and the cursor state is set to CURSOR_VALID.
5158 **
5159 ** If this function returns successfully, it may be assumed that the
5160 ** page-header flags indicate that the [virtual] root-page is the expected
5161 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
5162 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
5163 ** indicating a table b-tree, or if the caller did specify a KeyInfo
5164 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
5165 ** b-tree).
5166 */
5167 static int moveToRoot(BtCursor *pCur){
5168   MemPage *pRoot;
5169   int rc = SQLITE_OK;
5170 
5171   assert( cursorOwnsBtShared(pCur) );
5172   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
5173   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
5174   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
5175   assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 );
5176   assert( pCur->pgnoRoot>0 || pCur->iPage<0 );
5177 
5178   if( pCur->iPage>=0 ){
5179     if( pCur->iPage ){
5180       releasePageNotNull(pCur->pPage);
5181       while( --pCur->iPage ){
5182         releasePageNotNull(pCur->apPage[pCur->iPage]);
5183       }
5184       pCur->pPage = pCur->apPage[0];
5185       goto skip_init;
5186     }
5187   }else if( pCur->pgnoRoot==0 ){
5188     pCur->eState = CURSOR_INVALID;
5189     return SQLITE_EMPTY;
5190   }else{
5191     assert( pCur->iPage==(-1) );
5192     if( pCur->eState>=CURSOR_REQUIRESEEK ){
5193       if( pCur->eState==CURSOR_FAULT ){
5194         assert( pCur->skipNext!=SQLITE_OK );
5195         return pCur->skipNext;
5196       }
5197       sqlite3BtreeClearCursor(pCur);
5198     }
5199     rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->pPage,
5200                         0, pCur->curPagerFlags);
5201     if( rc!=SQLITE_OK ){
5202       pCur->eState = CURSOR_INVALID;
5203       return rc;
5204     }
5205     pCur->iPage = 0;
5206     pCur->curIntKey = pCur->pPage->intKey;
5207   }
5208   pRoot = pCur->pPage;
5209   assert( pRoot->pgno==pCur->pgnoRoot );
5210 
5211   /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
5212   ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
5213   ** NULL, the caller expects a table b-tree. If this is not the case,
5214   ** return an SQLITE_CORRUPT error.
5215   **
5216   ** Earlier versions of SQLite assumed that this test could not fail
5217   ** if the root page was already loaded when this function was called (i.e.
5218   ** if pCur->iPage>=0). But this is not so if the database is corrupted
5219   ** in such a way that page pRoot is linked into a second b-tree table
5220   ** (or the freelist).  */
5221   assert( pRoot->intKey==1 || pRoot->intKey==0 );
5222   if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
5223     return SQLITE_CORRUPT_PAGE(pCur->pPage);
5224   }
5225 
5226 skip_init:
5227   pCur->ix = 0;
5228   pCur->info.nSize = 0;
5229   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
5230 
5231   pRoot = pCur->pPage;
5232   if( pRoot->nCell>0 ){
5233     pCur->eState = CURSOR_VALID;
5234   }else if( !pRoot->leaf ){
5235     Pgno subpage;
5236     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
5237     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
5238     pCur->eState = CURSOR_VALID;
5239     rc = moveToChild(pCur, subpage);
5240   }else{
5241     pCur->eState = CURSOR_INVALID;
5242     rc = SQLITE_EMPTY;
5243   }
5244   return rc;
5245 }
5246 
5247 /*
5248 ** Move the cursor down to the left-most leaf entry beneath the
5249 ** entry to which it is currently pointing.
5250 **
5251 ** The left-most leaf is the one with the smallest key - the first
5252 ** in ascending order.
5253 */
5254 static int moveToLeftmost(BtCursor *pCur){
5255   Pgno pgno;
5256   int rc = SQLITE_OK;
5257   MemPage *pPage;
5258 
5259   assert( cursorOwnsBtShared(pCur) );
5260   assert( pCur->eState==CURSOR_VALID );
5261   while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
5262     assert( pCur->ix<pPage->nCell );
5263     pgno = get4byte(findCell(pPage, pCur->ix));
5264     rc = moveToChild(pCur, pgno);
5265   }
5266   return rc;
5267 }
5268 
5269 /*
5270 ** Move the cursor down to the right-most leaf entry beneath the
5271 ** page to which it is currently pointing.  Notice the difference
5272 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
5273 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
5274 ** finds the right-most entry beneath the *page*.
5275 **
5276 ** The right-most entry is the one with the largest key - the last
5277 ** key in ascending order.
5278 */
5279 static int moveToRightmost(BtCursor *pCur){
5280   Pgno pgno;
5281   int rc = SQLITE_OK;
5282   MemPage *pPage = 0;
5283 
5284   assert( cursorOwnsBtShared(pCur) );
5285   assert( pCur->eState==CURSOR_VALID );
5286   while( !(pPage = pCur->pPage)->leaf ){
5287     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5288     pCur->ix = pPage->nCell;
5289     rc = moveToChild(pCur, pgno);
5290     if( rc ) return rc;
5291   }
5292   pCur->ix = pPage->nCell-1;
5293   assert( pCur->info.nSize==0 );
5294   assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
5295   return SQLITE_OK;
5296 }
5297 
5298 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
5299 ** on success.  Set *pRes to 0 if the cursor actually points to something
5300 ** or set *pRes to 1 if the table is empty.
5301 */
5302 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
5303   int rc;
5304 
5305   assert( cursorOwnsBtShared(pCur) );
5306   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5307   rc = moveToRoot(pCur);
5308   if( rc==SQLITE_OK ){
5309     assert( pCur->pPage->nCell>0 );
5310     *pRes = 0;
5311     rc = moveToLeftmost(pCur);
5312   }else if( rc==SQLITE_EMPTY ){
5313     assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5314     *pRes = 1;
5315     rc = SQLITE_OK;
5316   }
5317   return rc;
5318 }
5319 
5320 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
5321 ** on success.  Set *pRes to 0 if the cursor actually points to something
5322 ** or set *pRes to 1 if the table is empty.
5323 */
5324 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
5325   int rc;
5326 
5327   assert( cursorOwnsBtShared(pCur) );
5328   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5329 
5330   /* If the cursor already points to the last entry, this is a no-op. */
5331   if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
5332 #ifdef SQLITE_DEBUG
5333     /* This block serves to assert() that the cursor really does point
5334     ** to the last entry in the b-tree. */
5335     int ii;
5336     for(ii=0; ii<pCur->iPage; ii++){
5337       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
5338     }
5339     assert( pCur->ix==pCur->pPage->nCell-1 );
5340     assert( pCur->pPage->leaf );
5341 #endif
5342     *pRes = 0;
5343     return SQLITE_OK;
5344   }
5345 
5346   rc = moveToRoot(pCur);
5347   if( rc==SQLITE_OK ){
5348     assert( pCur->eState==CURSOR_VALID );
5349     *pRes = 0;
5350     rc = moveToRightmost(pCur);
5351     if( rc==SQLITE_OK ){
5352       pCur->curFlags |= BTCF_AtLast;
5353     }else{
5354       pCur->curFlags &= ~BTCF_AtLast;
5355     }
5356   }else if( rc==SQLITE_EMPTY ){
5357     assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5358     *pRes = 1;
5359     rc = SQLITE_OK;
5360   }
5361   return rc;
5362 }
5363 
5364 /* Move the cursor so that it points to an entry near the key
5365 ** specified by pIdxKey or intKey.   Return a success code.
5366 **
5367 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
5368 ** must be NULL.  For index tables, pIdxKey is used and intKey
5369 ** is ignored.
5370 **
5371 ** If an exact match is not found, then the cursor is always
5372 ** left pointing at a leaf page which would hold the entry if it
5373 ** were present.  The cursor might point to an entry that comes
5374 ** before or after the key.
5375 **
5376 ** An integer is written into *pRes which is the result of
5377 ** comparing the key with the entry to which the cursor is
5378 ** pointing.  The meaning of the integer written into
5379 ** *pRes is as follows:
5380 **
5381 **     *pRes<0      The cursor is left pointing at an entry that
5382 **                  is smaller than intKey/pIdxKey or if the table is empty
5383 **                  and the cursor is therefore left point to nothing.
5384 **
5385 **     *pRes==0     The cursor is left pointing at an entry that
5386 **                  exactly matches intKey/pIdxKey.
5387 **
5388 **     *pRes>0      The cursor is left pointing at an entry that
5389 **                  is larger than intKey/pIdxKey.
5390 **
5391 ** For index tables, the pIdxKey->eqSeen field is set to 1 if there
5392 ** exists an entry in the table that exactly matches pIdxKey.
5393 */
5394 int sqlite3BtreeMovetoUnpacked(
5395   BtCursor *pCur,          /* The cursor to be moved */
5396   UnpackedRecord *pIdxKey, /* Unpacked index key */
5397   i64 intKey,              /* The table key */
5398   int biasRight,           /* If true, bias the search to the high end */
5399   int *pRes                /* Write search results here */
5400 ){
5401   int rc;
5402   RecordCompare xRecordCompare;
5403 
5404   assert( cursorOwnsBtShared(pCur) );
5405   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5406   assert( pRes );
5407   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
5408   assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) );
5409 
5410   /* If the cursor is already positioned at the point we are trying
5411   ** to move to, then just return without doing any work */
5412   if( pIdxKey==0
5413    && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
5414   ){
5415     if( pCur->info.nKey==intKey ){
5416       *pRes = 0;
5417       return SQLITE_OK;
5418     }
5419     if( pCur->info.nKey<intKey ){
5420       if( (pCur->curFlags & BTCF_AtLast)!=0 ){
5421         *pRes = -1;
5422         return SQLITE_OK;
5423       }
5424       /* If the requested key is one more than the previous key, then
5425       ** try to get there using sqlite3BtreeNext() rather than a full
5426       ** binary search.  This is an optimization only.  The correct answer
5427       ** is still obtained without this case, only a little more slowely */
5428       if( pCur->info.nKey+1==intKey ){
5429         *pRes = 0;
5430         rc = sqlite3BtreeNext(pCur, 0);
5431         if( rc==SQLITE_OK ){
5432           getCellInfo(pCur);
5433           if( pCur->info.nKey==intKey ){
5434             return SQLITE_OK;
5435           }
5436         }else if( rc==SQLITE_DONE ){
5437           rc = SQLITE_OK;
5438         }else{
5439           return rc;
5440         }
5441       }
5442     }
5443   }
5444 
5445   if( pIdxKey ){
5446     xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
5447     pIdxKey->errCode = 0;
5448     assert( pIdxKey->default_rc==1
5449          || pIdxKey->default_rc==0
5450          || pIdxKey->default_rc==-1
5451     );
5452   }else{
5453     xRecordCompare = 0; /* All keys are integers */
5454   }
5455 
5456   rc = moveToRoot(pCur);
5457   if( rc ){
5458     if( rc==SQLITE_EMPTY ){
5459       assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5460       *pRes = -1;
5461       return SQLITE_OK;
5462     }
5463     return rc;
5464   }
5465   assert( pCur->pPage );
5466   assert( pCur->pPage->isInit );
5467   assert( pCur->eState==CURSOR_VALID );
5468   assert( pCur->pPage->nCell > 0 );
5469   assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey );
5470   assert( pCur->curIntKey || pIdxKey );
5471   for(;;){
5472     int lwr, upr, idx, c;
5473     Pgno chldPg;
5474     MemPage *pPage = pCur->pPage;
5475     u8 *pCell;                          /* Pointer to current cell in pPage */
5476 
5477     /* pPage->nCell must be greater than zero. If this is the root-page
5478     ** the cursor would have been INVALID above and this for(;;) loop
5479     ** not run. If this is not the root-page, then the moveToChild() routine
5480     ** would have already detected db corruption. Similarly, pPage must
5481     ** be the right kind (index or table) of b-tree page. Otherwise
5482     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
5483     assert( pPage->nCell>0 );
5484     assert( pPage->intKey==(pIdxKey==0) );
5485     lwr = 0;
5486     upr = pPage->nCell-1;
5487     assert( biasRight==0 || biasRight==1 );
5488     idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
5489     pCur->ix = (u16)idx;
5490     if( xRecordCompare==0 ){
5491       for(;;){
5492         i64 nCellKey;
5493         pCell = findCellPastPtr(pPage, idx);
5494         if( pPage->intKeyLeaf ){
5495           while( 0x80 <= *(pCell++) ){
5496             if( pCell>=pPage->aDataEnd ){
5497               return SQLITE_CORRUPT_PAGE(pPage);
5498             }
5499           }
5500         }
5501         getVarint(pCell, (u64*)&nCellKey);
5502         if( nCellKey<intKey ){
5503           lwr = idx+1;
5504           if( lwr>upr ){ c = -1; break; }
5505         }else if( nCellKey>intKey ){
5506           upr = idx-1;
5507           if( lwr>upr ){ c = +1; break; }
5508         }else{
5509           assert( nCellKey==intKey );
5510           pCur->ix = (u16)idx;
5511           if( !pPage->leaf ){
5512             lwr = idx;
5513             goto moveto_next_layer;
5514           }else{
5515             pCur->curFlags |= BTCF_ValidNKey;
5516             pCur->info.nKey = nCellKey;
5517             pCur->info.nSize = 0;
5518             *pRes = 0;
5519             return SQLITE_OK;
5520           }
5521         }
5522         assert( lwr+upr>=0 );
5523         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
5524       }
5525     }else{
5526       for(;;){
5527         int nCell;  /* Size of the pCell cell in bytes */
5528         pCell = findCellPastPtr(pPage, idx);
5529 
5530         /* The maximum supported page-size is 65536 bytes. This means that
5531         ** the maximum number of record bytes stored on an index B-Tree
5532         ** page is less than 16384 bytes and may be stored as a 2-byte
5533         ** varint. This information is used to attempt to avoid parsing
5534         ** the entire cell by checking for the cases where the record is
5535         ** stored entirely within the b-tree page by inspecting the first
5536         ** 2 bytes of the cell.
5537         */
5538         nCell = pCell[0];
5539         if( nCell<=pPage->max1bytePayload ){
5540           /* This branch runs if the record-size field of the cell is a
5541           ** single byte varint and the record fits entirely on the main
5542           ** b-tree page.  */
5543           testcase( pCell+nCell+1==pPage->aDataEnd );
5544           c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
5545         }else if( !(pCell[1] & 0x80)
5546           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
5547         ){
5548           /* The record-size field is a 2 byte varint and the record
5549           ** fits entirely on the main b-tree page.  */
5550           testcase( pCell+nCell+2==pPage->aDataEnd );
5551           c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
5552         }else{
5553           /* The record flows over onto one or more overflow pages. In
5554           ** this case the whole cell needs to be parsed, a buffer allocated
5555           ** and accessPayload() used to retrieve the record into the
5556           ** buffer before VdbeRecordCompare() can be called.
5557           **
5558           ** If the record is corrupt, the xRecordCompare routine may read
5559           ** up to two varints past the end of the buffer. An extra 18
5560           ** bytes of padding is allocated at the end of the buffer in
5561           ** case this happens.  */
5562           void *pCellKey;
5563           u8 * const pCellBody = pCell - pPage->childPtrSize;
5564           const int nOverrun = 18;  /* Size of the overrun padding */
5565           pPage->xParseCell(pPage, pCellBody, &pCur->info);
5566           nCell = (int)pCur->info.nKey;
5567           testcase( nCell<0 );   /* True if key size is 2^32 or more */
5568           testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
5569           testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
5570           testcase( nCell==2 );  /* Minimum legal index key size */
5571           if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){
5572             rc = SQLITE_CORRUPT_PAGE(pPage);
5573             goto moveto_finish;
5574           }
5575           pCellKey = sqlite3Malloc( nCell+nOverrun );
5576           if( pCellKey==0 ){
5577             rc = SQLITE_NOMEM_BKPT;
5578             goto moveto_finish;
5579           }
5580           pCur->ix = (u16)idx;
5581           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
5582           memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */
5583           pCur->curFlags &= ~BTCF_ValidOvfl;
5584           if( rc ){
5585             sqlite3_free(pCellKey);
5586             goto moveto_finish;
5587           }
5588           c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
5589           sqlite3_free(pCellKey);
5590         }
5591         assert(
5592             (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
5593          && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
5594         );
5595         if( c<0 ){
5596           lwr = idx+1;
5597         }else if( c>0 ){
5598           upr = idx-1;
5599         }else{
5600           assert( c==0 );
5601           *pRes = 0;
5602           rc = SQLITE_OK;
5603           pCur->ix = (u16)idx;
5604           if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT;
5605           goto moveto_finish;
5606         }
5607         if( lwr>upr ) break;
5608         assert( lwr+upr>=0 );
5609         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
5610       }
5611     }
5612     assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
5613     assert( pPage->isInit );
5614     if( pPage->leaf ){
5615       assert( pCur->ix<pCur->pPage->nCell );
5616       pCur->ix = (u16)idx;
5617       *pRes = c;
5618       rc = SQLITE_OK;
5619       goto moveto_finish;
5620     }
5621 moveto_next_layer:
5622     if( lwr>=pPage->nCell ){
5623       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5624     }else{
5625       chldPg = get4byte(findCell(pPage, lwr));
5626     }
5627     pCur->ix = (u16)lwr;
5628     rc = moveToChild(pCur, chldPg);
5629     if( rc ) break;
5630   }
5631 moveto_finish:
5632   pCur->info.nSize = 0;
5633   assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5634   return rc;
5635 }
5636 
5637 
5638 /*
5639 ** Return TRUE if the cursor is not pointing at an entry of the table.
5640 **
5641 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
5642 ** past the last entry in the table or sqlite3BtreePrev() moves past
5643 ** the first entry.  TRUE is also returned if the table is empty.
5644 */
5645 int sqlite3BtreeEof(BtCursor *pCur){
5646   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
5647   ** have been deleted? This API will need to change to return an error code
5648   ** as well as the boolean result value.
5649   */
5650   return (CURSOR_VALID!=pCur->eState);
5651 }
5652 
5653 /*
5654 ** Return an estimate for the number of rows in the table that pCur is
5655 ** pointing to.  Return a negative number if no estimate is currently
5656 ** available.
5657 */
5658 i64 sqlite3BtreeRowCountEst(BtCursor *pCur){
5659   i64 n;
5660   u8 i;
5661 
5662   assert( cursorOwnsBtShared(pCur) );
5663   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5664 
5665   /* Currently this interface is only called by the OP_IfSmaller
5666   ** opcode, and it that case the cursor will always be valid and
5667   ** will always point to a leaf node. */
5668   if( NEVER(pCur->eState!=CURSOR_VALID) ) return -1;
5669   if( NEVER(pCur->pPage->leaf==0) ) return -1;
5670 
5671   n = pCur->pPage->nCell;
5672   for(i=0; i<pCur->iPage; i++){
5673     n *= pCur->apPage[i]->nCell;
5674   }
5675   return n;
5676 }
5677 
5678 /*
5679 ** Advance the cursor to the next entry in the database.
5680 ** Return value:
5681 **
5682 **    SQLITE_OK        success
5683 **    SQLITE_DONE      cursor is already pointing at the last element
5684 **    otherwise        some kind of error occurred
5685 **
5686 ** The main entry point is sqlite3BtreeNext().  That routine is optimized
5687 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
5688 ** to the next cell on the current page.  The (slower) btreeNext() helper
5689 ** routine is called when it is necessary to move to a different page or
5690 ** to restore the cursor.
5691 **
5692 ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the
5693 ** cursor corresponds to an SQL index and this routine could have been
5694 ** skipped if the SQL index had been a unique index.  The F argument
5695 ** is a hint to the implement.  SQLite btree implementation does not use
5696 ** this hint, but COMDB2 does.
5697 */
5698 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){
5699   int rc;
5700   int idx;
5701   MemPage *pPage;
5702 
5703   assert( cursorOwnsBtShared(pCur) );
5704   if( pCur->eState!=CURSOR_VALID ){
5705     assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5706     rc = restoreCursorPosition(pCur);
5707     if( rc!=SQLITE_OK ){
5708       return rc;
5709     }
5710     if( CURSOR_INVALID==pCur->eState ){
5711       return SQLITE_DONE;
5712     }
5713     if( pCur->eState==CURSOR_SKIPNEXT ){
5714       pCur->eState = CURSOR_VALID;
5715       if( pCur->skipNext>0 ) return SQLITE_OK;
5716     }
5717   }
5718 
5719   pPage = pCur->pPage;
5720   idx = ++pCur->ix;
5721   if( !pPage->isInit ){
5722     /* The only known way for this to happen is for there to be a
5723     ** recursive SQL function that does a DELETE operation as part of a
5724     ** SELECT which deletes content out from under an active cursor
5725     ** in a corrupt database file where the table being DELETE-ed from
5726     ** has pages in common with the table being queried.  See TH3
5727     ** module cov1/btree78.test testcase 220 (2018-06-08) for an
5728     ** example. */
5729     return SQLITE_CORRUPT_BKPT;
5730   }
5731 
5732   /* If the database file is corrupt, it is possible for the value of idx
5733   ** to be invalid here. This can only occur if a second cursor modifies
5734   ** the page while cursor pCur is holding a reference to it. Which can
5735   ** only happen if the database is corrupt in such a way as to link the
5736   ** page into more than one b-tree structure.
5737   **
5738   ** Update 2019-12-23: appears to long longer be possible after the
5739   ** addition of anotherValidCursor() condition on balance_deeper().  */
5740   harmless( idx>pPage->nCell );
5741 
5742   if( idx>=pPage->nCell ){
5743     if( !pPage->leaf ){
5744       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
5745       if( rc ) return rc;
5746       return moveToLeftmost(pCur);
5747     }
5748     do{
5749       if( pCur->iPage==0 ){
5750         pCur->eState = CURSOR_INVALID;
5751         return SQLITE_DONE;
5752       }
5753       moveToParent(pCur);
5754       pPage = pCur->pPage;
5755     }while( pCur->ix>=pPage->nCell );
5756     if( pPage->intKey ){
5757       return sqlite3BtreeNext(pCur, 0);
5758     }else{
5759       return SQLITE_OK;
5760     }
5761   }
5762   if( pPage->leaf ){
5763     return SQLITE_OK;
5764   }else{
5765     return moveToLeftmost(pCur);
5766   }
5767 }
5768 int sqlite3BtreeNext(BtCursor *pCur, int flags){
5769   MemPage *pPage;
5770   UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
5771   assert( cursorOwnsBtShared(pCur) );
5772   assert( flags==0 || flags==1 );
5773   pCur->info.nSize = 0;
5774   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5775   if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur);
5776   pPage = pCur->pPage;
5777   if( (++pCur->ix)>=pPage->nCell ){
5778     pCur->ix--;
5779     return btreeNext(pCur);
5780   }
5781   if( pPage->leaf ){
5782     return SQLITE_OK;
5783   }else{
5784     return moveToLeftmost(pCur);
5785   }
5786 }
5787 
5788 /*
5789 ** Step the cursor to the back to the previous entry in the database.
5790 ** Return values:
5791 **
5792 **     SQLITE_OK     success
5793 **     SQLITE_DONE   the cursor is already on the first element of the table
5794 **     otherwise     some kind of error occurred
5795 **
5796 ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
5797 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
5798 ** to the previous cell on the current page.  The (slower) btreePrevious()
5799 ** helper routine is called when it is necessary to move to a different page
5800 ** or to restore the cursor.
5801 **
5802 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then
5803 ** the cursor corresponds to an SQL index and this routine could have been
5804 ** skipped if the SQL index had been a unique index.  The F argument is a
5805 ** hint to the implement.  The native SQLite btree implementation does not
5806 ** use this hint, but COMDB2 does.
5807 */
5808 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){
5809   int rc;
5810   MemPage *pPage;
5811 
5812   assert( cursorOwnsBtShared(pCur) );
5813   assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
5814   assert( pCur->info.nSize==0 );
5815   if( pCur->eState!=CURSOR_VALID ){
5816     rc = restoreCursorPosition(pCur);
5817     if( rc!=SQLITE_OK ){
5818       return rc;
5819     }
5820     if( CURSOR_INVALID==pCur->eState ){
5821       return SQLITE_DONE;
5822     }
5823     if( CURSOR_SKIPNEXT==pCur->eState ){
5824       pCur->eState = CURSOR_VALID;
5825       if( pCur->skipNext<0 ) return SQLITE_OK;
5826     }
5827   }
5828 
5829   pPage = pCur->pPage;
5830   assert( pPage->isInit );
5831   if( !pPage->leaf ){
5832     int idx = pCur->ix;
5833     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
5834     if( rc ) return rc;
5835     rc = moveToRightmost(pCur);
5836   }else{
5837     while( pCur->ix==0 ){
5838       if( pCur->iPage==0 ){
5839         pCur->eState = CURSOR_INVALID;
5840         return SQLITE_DONE;
5841       }
5842       moveToParent(pCur);
5843     }
5844     assert( pCur->info.nSize==0 );
5845     assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
5846 
5847     pCur->ix--;
5848     pPage = pCur->pPage;
5849     if( pPage->intKey && !pPage->leaf ){
5850       rc = sqlite3BtreePrevious(pCur, 0);
5851     }else{
5852       rc = SQLITE_OK;
5853     }
5854   }
5855   return rc;
5856 }
5857 int sqlite3BtreePrevious(BtCursor *pCur, int flags){
5858   assert( cursorOwnsBtShared(pCur) );
5859   assert( flags==0 || flags==1 );
5860   UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
5861   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
5862   pCur->info.nSize = 0;
5863   if( pCur->eState!=CURSOR_VALID
5864    || pCur->ix==0
5865    || pCur->pPage->leaf==0
5866   ){
5867     return btreePrevious(pCur);
5868   }
5869   pCur->ix--;
5870   return SQLITE_OK;
5871 }
5872 
5873 /*
5874 ** Allocate a new page from the database file.
5875 **
5876 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
5877 ** has already been called on the new page.)  The new page has also
5878 ** been referenced and the calling routine is responsible for calling
5879 ** sqlite3PagerUnref() on the new page when it is done.
5880 **
5881 ** SQLITE_OK is returned on success.  Any other return value indicates
5882 ** an error.  *ppPage is set to NULL in the event of an error.
5883 **
5884 ** If the "nearby" parameter is not 0, then an effort is made to
5885 ** locate a page close to the page number "nearby".  This can be used in an
5886 ** attempt to keep related pages close to each other in the database file,
5887 ** which in turn can make database access faster.
5888 **
5889 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
5890 ** anywhere on the free-list, then it is guaranteed to be returned.  If
5891 ** eMode is BTALLOC_LT then the page returned will be less than or equal
5892 ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
5893 ** are no restrictions on which page is returned.
5894 */
5895 static int allocateBtreePage(
5896   BtShared *pBt,         /* The btree */
5897   MemPage **ppPage,      /* Store pointer to the allocated page here */
5898   Pgno *pPgno,           /* Store the page number here */
5899   Pgno nearby,           /* Search for a page near this one */
5900   u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
5901 ){
5902   MemPage *pPage1;
5903   int rc;
5904   u32 n;     /* Number of pages on the freelist */
5905   u32 k;     /* Number of leaves on the trunk of the freelist */
5906   MemPage *pTrunk = 0;
5907   MemPage *pPrevTrunk = 0;
5908   Pgno mxPage;     /* Total size of the database file */
5909 
5910   assert( sqlite3_mutex_held(pBt->mutex) );
5911   assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
5912   pPage1 = pBt->pPage1;
5913   mxPage = btreePagecount(pBt);
5914   /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
5915   ** stores stores the total number of pages on the freelist. */
5916   n = get4byte(&pPage1->aData[36]);
5917   testcase( n==mxPage-1 );
5918   if( n>=mxPage ){
5919     return SQLITE_CORRUPT_BKPT;
5920   }
5921   if( n>0 ){
5922     /* There are pages on the freelist.  Reuse one of those pages. */
5923     Pgno iTrunk;
5924     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
5925     u32 nSearch = 0;   /* Count of the number of search attempts */
5926 
5927     /* If eMode==BTALLOC_EXACT and a query of the pointer-map
5928     ** shows that the page 'nearby' is somewhere on the free-list, then
5929     ** the entire-list will be searched for that page.
5930     */
5931 #ifndef SQLITE_OMIT_AUTOVACUUM
5932     if( eMode==BTALLOC_EXACT ){
5933       if( nearby<=mxPage ){
5934         u8 eType;
5935         assert( nearby>0 );
5936         assert( pBt->autoVacuum );
5937         rc = ptrmapGet(pBt, nearby, &eType, 0);
5938         if( rc ) return rc;
5939         if( eType==PTRMAP_FREEPAGE ){
5940           searchList = 1;
5941         }
5942       }
5943     }else if( eMode==BTALLOC_LE ){
5944       searchList = 1;
5945     }
5946 #endif
5947 
5948     /* Decrement the free-list count by 1. Set iTrunk to the index of the
5949     ** first free-list trunk page. iPrevTrunk is initially 1.
5950     */
5951     rc = sqlite3PagerWrite(pPage1->pDbPage);
5952     if( rc ) return rc;
5953     put4byte(&pPage1->aData[36], n-1);
5954 
5955     /* The code within this loop is run only once if the 'searchList' variable
5956     ** is not true. Otherwise, it runs once for each trunk-page on the
5957     ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
5958     ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
5959     */
5960     do {
5961       pPrevTrunk = pTrunk;
5962       if( pPrevTrunk ){
5963         /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
5964         ** is the page number of the next freelist trunk page in the list or
5965         ** zero if this is the last freelist trunk page. */
5966         iTrunk = get4byte(&pPrevTrunk->aData[0]);
5967       }else{
5968         /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
5969         ** stores the page number of the first page of the freelist, or zero if
5970         ** the freelist is empty. */
5971         iTrunk = get4byte(&pPage1->aData[32]);
5972       }
5973       testcase( iTrunk==mxPage );
5974       if( iTrunk>mxPage || nSearch++ > n ){
5975         rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? pPrevTrunk->pgno : 1);
5976       }else{
5977         rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
5978       }
5979       if( rc ){
5980         pTrunk = 0;
5981         goto end_allocate_page;
5982       }
5983       assert( pTrunk!=0 );
5984       assert( pTrunk->aData!=0 );
5985       /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
5986       ** is the number of leaf page pointers to follow. */
5987       k = get4byte(&pTrunk->aData[4]);
5988       if( k==0 && !searchList ){
5989         /* The trunk has no leaves and the list is not being searched.
5990         ** So extract the trunk page itself and use it as the newly
5991         ** allocated page */
5992         assert( pPrevTrunk==0 );
5993         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5994         if( rc ){
5995           goto end_allocate_page;
5996         }
5997         *pPgno = iTrunk;
5998         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5999         *ppPage = pTrunk;
6000         pTrunk = 0;
6001         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
6002       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
6003         /* Value of k is out of range.  Database corruption */
6004         rc = SQLITE_CORRUPT_PGNO(iTrunk);
6005         goto end_allocate_page;
6006 #ifndef SQLITE_OMIT_AUTOVACUUM
6007       }else if( searchList
6008             && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
6009       ){
6010         /* The list is being searched and this trunk page is the page
6011         ** to allocate, regardless of whether it has leaves.
6012         */
6013         *pPgno = iTrunk;
6014         *ppPage = pTrunk;
6015         searchList = 0;
6016         rc = sqlite3PagerWrite(pTrunk->pDbPage);
6017         if( rc ){
6018           goto end_allocate_page;
6019         }
6020         if( k==0 ){
6021           if( !pPrevTrunk ){
6022             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
6023           }else{
6024             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
6025             if( rc!=SQLITE_OK ){
6026               goto end_allocate_page;
6027             }
6028             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
6029           }
6030         }else{
6031           /* The trunk page is required by the caller but it contains
6032           ** pointers to free-list leaves. The first leaf becomes a trunk
6033           ** page in this case.
6034           */
6035           MemPage *pNewTrunk;
6036           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
6037           if( iNewTrunk>mxPage ){
6038             rc = SQLITE_CORRUPT_PGNO(iTrunk);
6039             goto end_allocate_page;
6040           }
6041           testcase( iNewTrunk==mxPage );
6042           rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
6043           if( rc!=SQLITE_OK ){
6044             goto end_allocate_page;
6045           }
6046           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
6047           if( rc!=SQLITE_OK ){
6048             releasePage(pNewTrunk);
6049             goto end_allocate_page;
6050           }
6051           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
6052           put4byte(&pNewTrunk->aData[4], k-1);
6053           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
6054           releasePage(pNewTrunk);
6055           if( !pPrevTrunk ){
6056             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
6057             put4byte(&pPage1->aData[32], iNewTrunk);
6058           }else{
6059             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
6060             if( rc ){
6061               goto end_allocate_page;
6062             }
6063             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
6064           }
6065         }
6066         pTrunk = 0;
6067         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
6068 #endif
6069       }else if( k>0 ){
6070         /* Extract a leaf from the trunk */
6071         u32 closest;
6072         Pgno iPage;
6073         unsigned char *aData = pTrunk->aData;
6074         if( nearby>0 ){
6075           u32 i;
6076           closest = 0;
6077           if( eMode==BTALLOC_LE ){
6078             for(i=0; i<k; i++){
6079               iPage = get4byte(&aData[8+i*4]);
6080               if( iPage<=nearby ){
6081                 closest = i;
6082                 break;
6083               }
6084             }
6085           }else{
6086             int dist;
6087             dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
6088             for(i=1; i<k; i++){
6089               int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
6090               if( d2<dist ){
6091                 closest = i;
6092                 dist = d2;
6093               }
6094             }
6095           }
6096         }else{
6097           closest = 0;
6098         }
6099 
6100         iPage = get4byte(&aData[8+closest*4]);
6101         testcase( iPage==mxPage );
6102         if( iPage>mxPage ){
6103           rc = SQLITE_CORRUPT_PGNO(iTrunk);
6104           goto end_allocate_page;
6105         }
6106         testcase( iPage==mxPage );
6107         if( !searchList
6108          || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
6109         ){
6110           int noContent;
6111           *pPgno = iPage;
6112           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
6113                  ": %d more free pages\n",
6114                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
6115           rc = sqlite3PagerWrite(pTrunk->pDbPage);
6116           if( rc ) goto end_allocate_page;
6117           if( closest<k-1 ){
6118             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
6119           }
6120           put4byte(&aData[4], k-1);
6121           noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
6122           rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
6123           if( rc==SQLITE_OK ){
6124             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
6125             if( rc!=SQLITE_OK ){
6126               releasePage(*ppPage);
6127               *ppPage = 0;
6128             }
6129           }
6130           searchList = 0;
6131         }
6132       }
6133       releasePage(pPrevTrunk);
6134       pPrevTrunk = 0;
6135     }while( searchList );
6136   }else{
6137     /* There are no pages on the freelist, so append a new page to the
6138     ** database image.
6139     **
6140     ** Normally, new pages allocated by this block can be requested from the
6141     ** pager layer with the 'no-content' flag set. This prevents the pager
6142     ** from trying to read the pages content from disk. However, if the
6143     ** current transaction has already run one or more incremental-vacuum
6144     ** steps, then the page we are about to allocate may contain content
6145     ** that is required in the event of a rollback. In this case, do
6146     ** not set the no-content flag. This causes the pager to load and journal
6147     ** the current page content before overwriting it.
6148     **
6149     ** Note that the pager will not actually attempt to load or journal
6150     ** content for any page that really does lie past the end of the database
6151     ** file on disk. So the effects of disabling the no-content optimization
6152     ** here are confined to those pages that lie between the end of the
6153     ** database image and the end of the database file.
6154     */
6155     int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
6156 
6157     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6158     if( rc ) return rc;
6159     pBt->nPage++;
6160     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
6161 
6162 #ifndef SQLITE_OMIT_AUTOVACUUM
6163     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
6164       /* If *pPgno refers to a pointer-map page, allocate two new pages
6165       ** at the end of the file instead of one. The first allocated page
6166       ** becomes a new pointer-map page, the second is used by the caller.
6167       */
6168       MemPage *pPg = 0;
6169       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
6170       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
6171       rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
6172       if( rc==SQLITE_OK ){
6173         rc = sqlite3PagerWrite(pPg->pDbPage);
6174         releasePage(pPg);
6175       }
6176       if( rc ) return rc;
6177       pBt->nPage++;
6178       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
6179     }
6180 #endif
6181     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
6182     *pPgno = pBt->nPage;
6183 
6184     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
6185     rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
6186     if( rc ) return rc;
6187     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
6188     if( rc!=SQLITE_OK ){
6189       releasePage(*ppPage);
6190       *ppPage = 0;
6191     }
6192     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
6193   }
6194 
6195   assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) );
6196 
6197 end_allocate_page:
6198   releasePage(pTrunk);
6199   releasePage(pPrevTrunk);
6200   assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
6201   assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
6202   return rc;
6203 }
6204 
6205 /*
6206 ** This function is used to add page iPage to the database file free-list.
6207 ** It is assumed that the page is not already a part of the free-list.
6208 **
6209 ** The value passed as the second argument to this function is optional.
6210 ** If the caller happens to have a pointer to the MemPage object
6211 ** corresponding to page iPage handy, it may pass it as the second value.
6212 ** Otherwise, it may pass NULL.
6213 **
6214 ** If a pointer to a MemPage object is passed as the second argument,
6215 ** its reference count is not altered by this function.
6216 */
6217 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
6218   MemPage *pTrunk = 0;                /* Free-list trunk page */
6219   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
6220   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
6221   MemPage *pPage;                     /* Page being freed. May be NULL. */
6222   int rc;                             /* Return Code */
6223   u32 nFree;                          /* Initial number of pages on free-list */
6224 
6225   assert( sqlite3_mutex_held(pBt->mutex) );
6226   assert( CORRUPT_DB || iPage>1 );
6227   assert( !pMemPage || pMemPage->pgno==iPage );
6228 
6229   if( iPage<2 || iPage>pBt->nPage ){
6230     return SQLITE_CORRUPT_BKPT;
6231   }
6232   if( pMemPage ){
6233     pPage = pMemPage;
6234     sqlite3PagerRef(pPage->pDbPage);
6235   }else{
6236     pPage = btreePageLookup(pBt, iPage);
6237   }
6238 
6239   /* Increment the free page count on pPage1 */
6240   rc = sqlite3PagerWrite(pPage1->pDbPage);
6241   if( rc ) goto freepage_out;
6242   nFree = get4byte(&pPage1->aData[36]);
6243   put4byte(&pPage1->aData[36], nFree+1);
6244 
6245   if( pBt->btsFlags & BTS_SECURE_DELETE ){
6246     /* If the secure_delete option is enabled, then
6247     ** always fully overwrite deleted information with zeros.
6248     */
6249     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
6250      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
6251     ){
6252       goto freepage_out;
6253     }
6254     memset(pPage->aData, 0, pPage->pBt->pageSize);
6255   }
6256 
6257   /* If the database supports auto-vacuum, write an entry in the pointer-map
6258   ** to indicate that the page is free.
6259   */
6260   if( ISAUTOVACUUM ){
6261     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
6262     if( rc ) goto freepage_out;
6263   }
6264 
6265   /* Now manipulate the actual database free-list structure. There are two
6266   ** possibilities. If the free-list is currently empty, or if the first
6267   ** trunk page in the free-list is full, then this page will become a
6268   ** new free-list trunk page. Otherwise, it will become a leaf of the
6269   ** first trunk page in the current free-list. This block tests if it
6270   ** is possible to add the page as a new free-list leaf.
6271   */
6272   if( nFree!=0 ){
6273     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
6274 
6275     iTrunk = get4byte(&pPage1->aData[32]);
6276     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
6277     if( rc!=SQLITE_OK ){
6278       goto freepage_out;
6279     }
6280 
6281     nLeaf = get4byte(&pTrunk->aData[4]);
6282     assert( pBt->usableSize>32 );
6283     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
6284       rc = SQLITE_CORRUPT_BKPT;
6285       goto freepage_out;
6286     }
6287     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
6288       /* In this case there is room on the trunk page to insert the page
6289       ** being freed as a new leaf.
6290       **
6291       ** Note that the trunk page is not really full until it contains
6292       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
6293       ** coded.  But due to a coding error in versions of SQLite prior to
6294       ** 3.6.0, databases with freelist trunk pages holding more than
6295       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
6296       ** to maintain backwards compatibility with older versions of SQLite,
6297       ** we will continue to restrict the number of entries to usableSize/4 - 8
6298       ** for now.  At some point in the future (once everyone has upgraded
6299       ** to 3.6.0 or later) we should consider fixing the conditional above
6300       ** to read "usableSize/4-2" instead of "usableSize/4-8".
6301       **
6302       ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
6303       ** avoid using the last six entries in the freelist trunk page array in
6304       ** order that database files created by newer versions of SQLite can be
6305       ** read by older versions of SQLite.
6306       */
6307       rc = sqlite3PagerWrite(pTrunk->pDbPage);
6308       if( rc==SQLITE_OK ){
6309         put4byte(&pTrunk->aData[4], nLeaf+1);
6310         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
6311         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
6312           sqlite3PagerDontWrite(pPage->pDbPage);
6313         }
6314         rc = btreeSetHasContent(pBt, iPage);
6315       }
6316       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
6317       goto freepage_out;
6318     }
6319   }
6320 
6321   /* If control flows to this point, then it was not possible to add the
6322   ** the page being freed as a leaf page of the first trunk in the free-list.
6323   ** Possibly because the free-list is empty, or possibly because the
6324   ** first trunk in the free-list is full. Either way, the page being freed
6325   ** will become the new first trunk page in the free-list.
6326   */
6327   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
6328     goto freepage_out;
6329   }
6330   rc = sqlite3PagerWrite(pPage->pDbPage);
6331   if( rc!=SQLITE_OK ){
6332     goto freepage_out;
6333   }
6334   put4byte(pPage->aData, iTrunk);
6335   put4byte(&pPage->aData[4], 0);
6336   put4byte(&pPage1->aData[32], iPage);
6337   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
6338 
6339 freepage_out:
6340   if( pPage ){
6341     pPage->isInit = 0;
6342   }
6343   releasePage(pPage);
6344   releasePage(pTrunk);
6345   return rc;
6346 }
6347 static void freePage(MemPage *pPage, int *pRC){
6348   if( (*pRC)==SQLITE_OK ){
6349     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
6350   }
6351 }
6352 
6353 /*
6354 ** Free any overflow pages associated with the given Cell.  Store
6355 ** size information about the cell in pInfo.
6356 */
6357 static int clearCell(
6358   MemPage *pPage,          /* The page that contains the Cell */
6359   unsigned char *pCell,    /* First byte of the Cell */
6360   CellInfo *pInfo          /* Size information about the cell */
6361 ){
6362   BtShared *pBt;
6363   Pgno ovflPgno;
6364   int rc;
6365   int nOvfl;
6366   u32 ovflPageSize;
6367 
6368   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6369   pPage->xParseCell(pPage, pCell, pInfo);
6370   if( pInfo->nLocal==pInfo->nPayload ){
6371     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
6372   }
6373   testcase( pCell + pInfo->nSize == pPage->aDataEnd );
6374   testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd );
6375   if( pCell + pInfo->nSize > pPage->aDataEnd ){
6376     /* Cell extends past end of page */
6377     return SQLITE_CORRUPT_PAGE(pPage);
6378   }
6379   ovflPgno = get4byte(pCell + pInfo->nSize - 4);
6380   pBt = pPage->pBt;
6381   assert( pBt->usableSize > 4 );
6382   ovflPageSize = pBt->usableSize - 4;
6383   nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
6384   assert( nOvfl>0 ||
6385     (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
6386   );
6387   while( nOvfl-- ){
6388     Pgno iNext = 0;
6389     MemPage *pOvfl = 0;
6390     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
6391       /* 0 is not a legal page number and page 1 cannot be an
6392       ** overflow page. Therefore if ovflPgno<2 or past the end of the
6393       ** file the database must be corrupt. */
6394       return SQLITE_CORRUPT_BKPT;
6395     }
6396     if( nOvfl ){
6397       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
6398       if( rc ) return rc;
6399     }
6400 
6401     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
6402      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
6403     ){
6404       /* There is no reason any cursor should have an outstanding reference
6405       ** to an overflow page belonging to a cell that is being deleted/updated.
6406       ** So if there exists more than one reference to this page, then it
6407       ** must not really be an overflow page and the database must be corrupt.
6408       ** It is helpful to detect this before calling freePage2(), as
6409       ** freePage2() may zero the page contents if secure-delete mode is
6410       ** enabled. If this 'overflow' page happens to be a page that the
6411       ** caller is iterating through or using in some other way, this
6412       ** can be problematic.
6413       */
6414       rc = SQLITE_CORRUPT_BKPT;
6415     }else{
6416       rc = freePage2(pBt, pOvfl, ovflPgno);
6417     }
6418 
6419     if( pOvfl ){
6420       sqlite3PagerUnref(pOvfl->pDbPage);
6421     }
6422     if( rc ) return rc;
6423     ovflPgno = iNext;
6424   }
6425   return SQLITE_OK;
6426 }
6427 
6428 /*
6429 ** Create the byte sequence used to represent a cell on page pPage
6430 ** and write that byte sequence into pCell[].  Overflow pages are
6431 ** allocated and filled in as necessary.  The calling procedure
6432 ** is responsible for making sure sufficient space has been allocated
6433 ** for pCell[].
6434 **
6435 ** Note that pCell does not necessary need to point to the pPage->aData
6436 ** area.  pCell might point to some temporary storage.  The cell will
6437 ** be constructed in this temporary area then copied into pPage->aData
6438 ** later.
6439 */
6440 static int fillInCell(
6441   MemPage *pPage,                /* The page that contains the cell */
6442   unsigned char *pCell,          /* Complete text of the cell */
6443   const BtreePayload *pX,        /* Payload with which to construct the cell */
6444   int *pnSize                    /* Write cell size here */
6445 ){
6446   int nPayload;
6447   const u8 *pSrc;
6448   int nSrc, n, rc, mn;
6449   int spaceLeft;
6450   MemPage *pToRelease;
6451   unsigned char *pPrior;
6452   unsigned char *pPayload;
6453   BtShared *pBt;
6454   Pgno pgnoOvfl;
6455   int nHeader;
6456 
6457   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6458 
6459   /* pPage is not necessarily writeable since pCell might be auxiliary
6460   ** buffer space that is separate from the pPage buffer area */
6461   assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize]
6462             || sqlite3PagerIswriteable(pPage->pDbPage) );
6463 
6464   /* Fill in the header. */
6465   nHeader = pPage->childPtrSize;
6466   if( pPage->intKey ){
6467     nPayload = pX->nData + pX->nZero;
6468     pSrc = pX->pData;
6469     nSrc = pX->nData;
6470     assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
6471     nHeader += putVarint32(&pCell[nHeader], nPayload);
6472     nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
6473   }else{
6474     assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
6475     nSrc = nPayload = (int)pX->nKey;
6476     pSrc = pX->pKey;
6477     nHeader += putVarint32(&pCell[nHeader], nPayload);
6478   }
6479 
6480   /* Fill in the payload */
6481   pPayload = &pCell[nHeader];
6482   if( nPayload<=pPage->maxLocal ){
6483     /* This is the common case where everything fits on the btree page
6484     ** and no overflow pages are required. */
6485     n = nHeader + nPayload;
6486     testcase( n==3 );
6487     testcase( n==4 );
6488     if( n<4 ) n = 4;
6489     *pnSize = n;
6490     assert( nSrc<=nPayload );
6491     testcase( nSrc<nPayload );
6492     memcpy(pPayload, pSrc, nSrc);
6493     memset(pPayload+nSrc, 0, nPayload-nSrc);
6494     return SQLITE_OK;
6495   }
6496 
6497   /* If we reach this point, it means that some of the content will need
6498   ** to spill onto overflow pages.
6499   */
6500   mn = pPage->minLocal;
6501   n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
6502   testcase( n==pPage->maxLocal );
6503   testcase( n==pPage->maxLocal+1 );
6504   if( n > pPage->maxLocal ) n = mn;
6505   spaceLeft = n;
6506   *pnSize = n + nHeader + 4;
6507   pPrior = &pCell[nHeader+n];
6508   pToRelease = 0;
6509   pgnoOvfl = 0;
6510   pBt = pPage->pBt;
6511 
6512   /* At this point variables should be set as follows:
6513   **
6514   **   nPayload           Total payload size in bytes
6515   **   pPayload           Begin writing payload here
6516   **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
6517   **                      that means content must spill into overflow pages.
6518   **   *pnSize            Size of the local cell (not counting overflow pages)
6519   **   pPrior             Where to write the pgno of the first overflow page
6520   **
6521   ** Use a call to btreeParseCellPtr() to verify that the values above
6522   ** were computed correctly.
6523   */
6524 #ifdef SQLITE_DEBUG
6525   {
6526     CellInfo info;
6527     pPage->xParseCell(pPage, pCell, &info);
6528     assert( nHeader==(int)(info.pPayload - pCell) );
6529     assert( info.nKey==pX->nKey );
6530     assert( *pnSize == info.nSize );
6531     assert( spaceLeft == info.nLocal );
6532   }
6533 #endif
6534 
6535   /* Write the payload into the local Cell and any extra into overflow pages */
6536   while( 1 ){
6537     n = nPayload;
6538     if( n>spaceLeft ) n = spaceLeft;
6539 
6540     /* If pToRelease is not zero than pPayload points into the data area
6541     ** of pToRelease.  Make sure pToRelease is still writeable. */
6542     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6543 
6544     /* If pPayload is part of the data area of pPage, then make sure pPage
6545     ** is still writeable */
6546     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
6547             || sqlite3PagerIswriteable(pPage->pDbPage) );
6548 
6549     if( nSrc>=n ){
6550       memcpy(pPayload, pSrc, n);
6551     }else if( nSrc>0 ){
6552       n = nSrc;
6553       memcpy(pPayload, pSrc, n);
6554     }else{
6555       memset(pPayload, 0, n);
6556     }
6557     nPayload -= n;
6558     if( nPayload<=0 ) break;
6559     pPayload += n;
6560     pSrc += n;
6561     nSrc -= n;
6562     spaceLeft -= n;
6563     if( spaceLeft==0 ){
6564       MemPage *pOvfl = 0;
6565 #ifndef SQLITE_OMIT_AUTOVACUUM
6566       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
6567       if( pBt->autoVacuum ){
6568         do{
6569           pgnoOvfl++;
6570         } while(
6571           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
6572         );
6573       }
6574 #endif
6575       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
6576 #ifndef SQLITE_OMIT_AUTOVACUUM
6577       /* If the database supports auto-vacuum, and the second or subsequent
6578       ** overflow page is being allocated, add an entry to the pointer-map
6579       ** for that page now.
6580       **
6581       ** If this is the first overflow page, then write a partial entry
6582       ** to the pointer-map. If we write nothing to this pointer-map slot,
6583       ** then the optimistic overflow chain processing in clearCell()
6584       ** may misinterpret the uninitialized values and delete the
6585       ** wrong pages from the database.
6586       */
6587       if( pBt->autoVacuum && rc==SQLITE_OK ){
6588         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
6589         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
6590         if( rc ){
6591           releasePage(pOvfl);
6592         }
6593       }
6594 #endif
6595       if( rc ){
6596         releasePage(pToRelease);
6597         return rc;
6598       }
6599 
6600       /* If pToRelease is not zero than pPrior points into the data area
6601       ** of pToRelease.  Make sure pToRelease is still writeable. */
6602       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6603 
6604       /* If pPrior is part of the data area of pPage, then make sure pPage
6605       ** is still writeable */
6606       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
6607             || sqlite3PagerIswriteable(pPage->pDbPage) );
6608 
6609       put4byte(pPrior, pgnoOvfl);
6610       releasePage(pToRelease);
6611       pToRelease = pOvfl;
6612       pPrior = pOvfl->aData;
6613       put4byte(pPrior, 0);
6614       pPayload = &pOvfl->aData[4];
6615       spaceLeft = pBt->usableSize - 4;
6616     }
6617   }
6618   releasePage(pToRelease);
6619   return SQLITE_OK;
6620 }
6621 
6622 /*
6623 ** Remove the i-th cell from pPage.  This routine effects pPage only.
6624 ** The cell content is not freed or deallocated.  It is assumed that
6625 ** the cell content has been copied someplace else.  This routine just
6626 ** removes the reference to the cell from pPage.
6627 **
6628 ** "sz" must be the number of bytes in the cell.
6629 */
6630 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
6631   u32 pc;         /* Offset to cell content of cell being deleted */
6632   u8 *data;       /* pPage->aData */
6633   u8 *ptr;        /* Used to move bytes around within data[] */
6634   int rc;         /* The return code */
6635   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
6636 
6637   if( *pRC ) return;
6638   assert( idx>=0 && idx<pPage->nCell );
6639   assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
6640   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6641   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6642   assert( pPage->nFree>=0 );
6643   data = pPage->aData;
6644   ptr = &pPage->aCellIdx[2*idx];
6645   pc = get2byte(ptr);
6646   hdr = pPage->hdrOffset;
6647   testcase( pc==get2byte(&data[hdr+5]) );
6648   testcase( pc+sz==pPage->pBt->usableSize );
6649   if( pc+sz > pPage->pBt->usableSize ){
6650     *pRC = SQLITE_CORRUPT_BKPT;
6651     return;
6652   }
6653   rc = freeSpace(pPage, pc, sz);
6654   if( rc ){
6655     *pRC = rc;
6656     return;
6657   }
6658   pPage->nCell--;
6659   if( pPage->nCell==0 ){
6660     memset(&data[hdr+1], 0, 4);
6661     data[hdr+7] = 0;
6662     put2byte(&data[hdr+5], pPage->pBt->usableSize);
6663     pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
6664                        - pPage->childPtrSize - 8;
6665   }else{
6666     memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
6667     put2byte(&data[hdr+3], pPage->nCell);
6668     pPage->nFree += 2;
6669   }
6670 }
6671 
6672 /*
6673 ** Insert a new cell on pPage at cell index "i".  pCell points to the
6674 ** content of the cell.
6675 **
6676 ** If the cell content will fit on the page, then put it there.  If it
6677 ** will not fit, then make a copy of the cell content into pTemp if
6678 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
6679 ** in pPage->apOvfl[] and make it point to the cell content (either
6680 ** in pTemp or the original pCell) and also record its index.
6681 ** Allocating a new entry in pPage->aCell[] implies that
6682 ** pPage->nOverflow is incremented.
6683 **
6684 ** *pRC must be SQLITE_OK when this routine is called.
6685 */
6686 static void insertCell(
6687   MemPage *pPage,   /* Page into which we are copying */
6688   int i,            /* New cell becomes the i-th cell of the page */
6689   u8 *pCell,        /* Content of the new cell */
6690   int sz,           /* Bytes of content in pCell */
6691   u8 *pTemp,        /* Temp storage space for pCell, if needed */
6692   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
6693   int *pRC          /* Read and write return code from here */
6694 ){
6695   int idx = 0;      /* Where to write new cell content in data[] */
6696   int j;            /* Loop counter */
6697   u8 *data;         /* The content of the whole page */
6698   u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
6699 
6700   assert( *pRC==SQLITE_OK );
6701   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
6702   assert( MX_CELL(pPage->pBt)<=10921 );
6703   assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
6704   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
6705   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
6706   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6707   assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
6708   assert( pPage->nFree>=0 );
6709   if( pPage->nOverflow || sz+2>pPage->nFree ){
6710     if( pTemp ){
6711       memcpy(pTemp, pCell, sz);
6712       pCell = pTemp;
6713     }
6714     if( iChild ){
6715       put4byte(pCell, iChild);
6716     }
6717     j = pPage->nOverflow++;
6718     /* Comparison against ArraySize-1 since we hold back one extra slot
6719     ** as a contingency.  In other words, never need more than 3 overflow
6720     ** slots but 4 are allocated, just to be safe. */
6721     assert( j < ArraySize(pPage->apOvfl)-1 );
6722     pPage->apOvfl[j] = pCell;
6723     pPage->aiOvfl[j] = (u16)i;
6724 
6725     /* When multiple overflows occur, they are always sequential and in
6726     ** sorted order.  This invariants arise because multiple overflows can
6727     ** only occur when inserting divider cells into the parent page during
6728     ** balancing, and the dividers are adjacent and sorted.
6729     */
6730     assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
6731     assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
6732   }else{
6733     int rc = sqlite3PagerWrite(pPage->pDbPage);
6734     if( rc!=SQLITE_OK ){
6735       *pRC = rc;
6736       return;
6737     }
6738     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6739     data = pPage->aData;
6740     assert( &data[pPage->cellOffset]==pPage->aCellIdx );
6741     rc = allocateSpace(pPage, sz, &idx);
6742     if( rc ){ *pRC = rc; return; }
6743     /* The allocateSpace() routine guarantees the following properties
6744     ** if it returns successfully */
6745     assert( idx >= 0 );
6746     assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
6747     assert( idx+sz <= (int)pPage->pBt->usableSize );
6748     pPage->nFree -= (u16)(2 + sz);
6749     if( iChild ){
6750       /* In a corrupt database where an entry in the cell index section of
6751       ** a btree page has a value of 3 or less, the pCell value might point
6752       ** as many as 4 bytes in front of the start of the aData buffer for
6753       ** the source page.  Make sure this does not cause problems by not
6754       ** reading the first 4 bytes */
6755       memcpy(&data[idx+4], pCell+4, sz-4);
6756       put4byte(&data[idx], iChild);
6757     }else{
6758       memcpy(&data[idx], pCell, sz);
6759     }
6760     pIns = pPage->aCellIdx + i*2;
6761     memmove(pIns+2, pIns, 2*(pPage->nCell - i));
6762     put2byte(pIns, idx);
6763     pPage->nCell++;
6764     /* increment the cell count */
6765     if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
6766     assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
6767 #ifndef SQLITE_OMIT_AUTOVACUUM
6768     if( pPage->pBt->autoVacuum ){
6769       /* The cell may contain a pointer to an overflow page. If so, write
6770       ** the entry for the overflow page into the pointer map.
6771       */
6772       ptrmapPutOvflPtr(pPage, pPage, pCell, pRC);
6773     }
6774 #endif
6775   }
6776 }
6777 
6778 /*
6779 ** The following parameters determine how many adjacent pages get involved
6780 ** in a balancing operation.  NN is the number of neighbors on either side
6781 ** of the page that participate in the balancing operation.  NB is the
6782 ** total number of pages that participate, including the target page and
6783 ** NN neighbors on either side.
6784 **
6785 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
6786 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
6787 ** in exchange for a larger degradation in INSERT and UPDATE performance.
6788 ** The value of NN appears to give the best results overall.
6789 **
6790 ** (Later:) The description above makes it seem as if these values are
6791 ** tunable - as if you could change them and recompile and it would all work.
6792 ** But that is unlikely.  NB has been 3 since the inception of SQLite and
6793 ** we have never tested any other value.
6794 */
6795 #define NN 1             /* Number of neighbors on either side of pPage */
6796 #define NB 3             /* (NN*2+1): Total pages involved in the balance */
6797 
6798 /*
6799 ** A CellArray object contains a cache of pointers and sizes for a
6800 ** consecutive sequence of cells that might be held on multiple pages.
6801 **
6802 ** The cells in this array are the divider cell or cells from the pParent
6803 ** page plus up to three child pages.  There are a total of nCell cells.
6804 **
6805 ** pRef is a pointer to one of the pages that contributes cells.  This is
6806 ** used to access information such as MemPage.intKey and MemPage.pBt->pageSize
6807 ** which should be common to all pages that contribute cells to this array.
6808 **
6809 ** apCell[] and szCell[] hold, respectively, pointers to the start of each
6810 ** cell and the size of each cell.  Some of the apCell[] pointers might refer
6811 ** to overflow cells.  In other words, some apCel[] pointers might not point
6812 ** to content area of the pages.
6813 **
6814 ** A szCell[] of zero means the size of that cell has not yet been computed.
6815 **
6816 ** The cells come from as many as four different pages:
6817 **
6818 **             -----------
6819 **             | Parent  |
6820 **             -----------
6821 **            /     |     \
6822 **           /      |      \
6823 **  ---------   ---------   ---------
6824 **  |Child-1|   |Child-2|   |Child-3|
6825 **  ---------   ---------   ---------
6826 **
6827 ** The order of cells is in the array is for an index btree is:
6828 **
6829 **       1.  All cells from Child-1 in order
6830 **       2.  The first divider cell from Parent
6831 **       3.  All cells from Child-2 in order
6832 **       4.  The second divider cell from Parent
6833 **       5.  All cells from Child-3 in order
6834 **
6835 ** For a table-btree (with rowids) the items 2 and 4 are empty because
6836 ** content exists only in leaves and there are no divider cells.
6837 **
6838 ** For an index btree, the apEnd[] array holds pointer to the end of page
6839 ** for Child-1, the Parent, Child-2, the Parent (again), and Child-3,
6840 ** respectively. The ixNx[] array holds the number of cells contained in
6841 ** each of these 5 stages, and all stages to the left.  Hence:
6842 **
6843 **    ixNx[0] = Number of cells in Child-1.
6844 **    ixNx[1] = Number of cells in Child-1 plus 1 for first divider.
6845 **    ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider.
6846 **    ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells
6847 **    ixNx[4] = Total number of cells.
6848 **
6849 ** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2]
6850 ** are used and they point to the leaf pages only, and the ixNx value are:
6851 **
6852 **    ixNx[0] = Number of cells in Child-1.
6853 **    ixNx[1] = Number of cells in Child-1 and Child-2.
6854 **    ixNx[2] = Total number of cells.
6855 **
6856 ** Sometimes when deleting, a child page can have zero cells.  In those
6857 ** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[]
6858 ** entries, shift down.  The end result is that each ixNx[] entry should
6859 ** be larger than the previous
6860 */
6861 typedef struct CellArray CellArray;
6862 struct CellArray {
6863   int nCell;              /* Number of cells in apCell[] */
6864   MemPage *pRef;          /* Reference page */
6865   u8 **apCell;            /* All cells begin balanced */
6866   u16 *szCell;            /* Local size of all cells in apCell[] */
6867   u8 *apEnd[NB*2];        /* MemPage.aDataEnd values */
6868   int ixNx[NB*2];         /* Index of at which we move to the next apEnd[] */
6869 };
6870 
6871 /*
6872 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
6873 ** computed.
6874 */
6875 static void populateCellCache(CellArray *p, int idx, int N){
6876   assert( idx>=0 && idx+N<=p->nCell );
6877   while( N>0 ){
6878     assert( p->apCell[idx]!=0 );
6879     if( p->szCell[idx]==0 ){
6880       p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
6881     }else{
6882       assert( CORRUPT_DB ||
6883               p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
6884     }
6885     idx++;
6886     N--;
6887   }
6888 }
6889 
6890 /*
6891 ** Return the size of the Nth element of the cell array
6892 */
6893 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
6894   assert( N>=0 && N<p->nCell );
6895   assert( p->szCell[N]==0 );
6896   p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
6897   return p->szCell[N];
6898 }
6899 static u16 cachedCellSize(CellArray *p, int N){
6900   assert( N>=0 && N<p->nCell );
6901   if( p->szCell[N] ) return p->szCell[N];
6902   return computeCellSize(p, N);
6903 }
6904 
6905 /*
6906 ** Array apCell[] contains pointers to nCell b-tree page cells. The
6907 ** szCell[] array contains the size in bytes of each cell. This function
6908 ** replaces the current contents of page pPg with the contents of the cell
6909 ** array.
6910 **
6911 ** Some of the cells in apCell[] may currently be stored in pPg. This
6912 ** function works around problems caused by this by making a copy of any
6913 ** such cells before overwriting the page data.
6914 **
6915 ** The MemPage.nFree field is invalidated by this function. It is the
6916 ** responsibility of the caller to set it correctly.
6917 */
6918 static int rebuildPage(
6919   CellArray *pCArray,             /* Content to be added to page pPg */
6920   int iFirst,                     /* First cell in pCArray to use */
6921   int nCell,                      /* Final number of cells on page */
6922   MemPage *pPg                    /* The page to be reconstructed */
6923 ){
6924   const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
6925   u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
6926   const int usableSize = pPg->pBt->usableSize;
6927   u8 * const pEnd = &aData[usableSize];
6928   int i = iFirst;                 /* Which cell to copy from pCArray*/
6929   u32 j;                          /* Start of cell content area */
6930   int iEnd = i+nCell;             /* Loop terminator */
6931   u8 *pCellptr = pPg->aCellIdx;
6932   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6933   u8 *pData;
6934   int k;                          /* Current slot in pCArray->apEnd[] */
6935   u8 *pSrcEnd;                    /* Current pCArray->apEnd[k] value */
6936 
6937   assert( i<iEnd );
6938   j = get2byte(&aData[hdr+5]);
6939   if( j>(u32)usableSize ){ j = 0; }
6940   memcpy(&pTmp[j], &aData[j], usableSize - j);
6941 
6942   for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){}
6943   pSrcEnd = pCArray->apEnd[k];
6944 
6945   pData = pEnd;
6946   while( 1/*exit by break*/ ){
6947     u8 *pCell = pCArray->apCell[i];
6948     u16 sz = pCArray->szCell[i];
6949     assert( sz>0 );
6950     if( SQLITE_WITHIN(pCell,aData,pEnd) ){
6951       if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT;
6952       pCell = &pTmp[pCell - aData];
6953     }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd
6954            && (uptr)(pCell)<(uptr)pSrcEnd
6955     ){
6956       return SQLITE_CORRUPT_BKPT;
6957     }
6958 
6959     pData -= sz;
6960     put2byte(pCellptr, (pData - aData));
6961     pCellptr += 2;
6962     if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
6963     memcpy(pData, pCell, sz);
6964     assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
6965     testcase( sz!=pPg->xCellSize(pPg,pCell) );
6966     i++;
6967     if( i>=iEnd ) break;
6968     if( pCArray->ixNx[k]<=i ){
6969       k++;
6970       pSrcEnd = pCArray->apEnd[k];
6971     }
6972   }
6973 
6974   /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
6975   pPg->nCell = nCell;
6976   pPg->nOverflow = 0;
6977 
6978   put2byte(&aData[hdr+1], 0);
6979   put2byte(&aData[hdr+3], pPg->nCell);
6980   put2byte(&aData[hdr+5], pData - aData);
6981   aData[hdr+7] = 0x00;
6982   return SQLITE_OK;
6983 }
6984 
6985 /*
6986 ** The pCArray objects contains pointers to b-tree cells and the cell sizes.
6987 ** This function attempts to add the cells stored in the array to page pPg.
6988 ** If it cannot (because the page needs to be defragmented before the cells
6989 ** will fit), non-zero is returned. Otherwise, if the cells are added
6990 ** successfully, zero is returned.
6991 **
6992 ** Argument pCellptr points to the first entry in the cell-pointer array
6993 ** (part of page pPg) to populate. After cell apCell[0] is written to the
6994 ** page body, a 16-bit offset is written to pCellptr. And so on, for each
6995 ** cell in the array. It is the responsibility of the caller to ensure
6996 ** that it is safe to overwrite this part of the cell-pointer array.
6997 **
6998 ** When this function is called, *ppData points to the start of the
6999 ** content area on page pPg. If the size of the content area is extended,
7000 ** *ppData is updated to point to the new start of the content area
7001 ** before returning.
7002 **
7003 ** Finally, argument pBegin points to the byte immediately following the
7004 ** end of the space required by this page for the cell-pointer area (for
7005 ** all cells - not just those inserted by the current call). If the content
7006 ** area must be extended to before this point in order to accomodate all
7007 ** cells in apCell[], then the cells do not fit and non-zero is returned.
7008 */
7009 static int pageInsertArray(
7010   MemPage *pPg,                   /* Page to add cells to */
7011   u8 *pBegin,                     /* End of cell-pointer array */
7012   u8 **ppData,                    /* IN/OUT: Page content-area pointer */
7013   u8 *pCellptr,                   /* Pointer to cell-pointer area */
7014   int iFirst,                     /* Index of first cell to add */
7015   int nCell,                      /* Number of cells to add to pPg */
7016   CellArray *pCArray              /* Array of cells */
7017 ){
7018   int i = iFirst;                 /* Loop counter - cell index to insert */
7019   u8 *aData = pPg->aData;         /* Complete page */
7020   u8 *pData = *ppData;            /* Content area.  A subset of aData[] */
7021   int iEnd = iFirst + nCell;      /* End of loop. One past last cell to ins */
7022   int k;                          /* Current slot in pCArray->apEnd[] */
7023   u8 *pEnd;                       /* Maximum extent of cell data */
7024   assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
7025   if( iEnd<=iFirst ) return 0;
7026   for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){}
7027   pEnd = pCArray->apEnd[k];
7028   while( 1 /*Exit by break*/ ){
7029     int sz, rc;
7030     u8 *pSlot;
7031     assert( pCArray->szCell[i]!=0 );
7032     sz = pCArray->szCell[i];
7033     if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
7034       if( (pData - pBegin)<sz ) return 1;
7035       pData -= sz;
7036       pSlot = pData;
7037     }
7038     /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
7039     ** database.  But they might for a corrupt database.  Hence use memmove()
7040     ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
7041     assert( (pSlot+sz)<=pCArray->apCell[i]
7042          || pSlot>=(pCArray->apCell[i]+sz)
7043          || CORRUPT_DB );
7044     if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd
7045      && (uptr)(pCArray->apCell[i])<(uptr)pEnd
7046     ){
7047       assert( CORRUPT_DB );
7048       (void)SQLITE_CORRUPT_BKPT;
7049       return 1;
7050     }
7051     memmove(pSlot, pCArray->apCell[i], sz);
7052     put2byte(pCellptr, (pSlot - aData));
7053     pCellptr += 2;
7054     i++;
7055     if( i>=iEnd ) break;
7056     if( pCArray->ixNx[k]<=i ){
7057       k++;
7058       pEnd = pCArray->apEnd[k];
7059     }
7060   }
7061   *ppData = pData;
7062   return 0;
7063 }
7064 
7065 /*
7066 ** The pCArray object contains pointers to b-tree cells and their sizes.
7067 **
7068 ** This function adds the space associated with each cell in the array
7069 ** that is currently stored within the body of pPg to the pPg free-list.
7070 ** The cell-pointers and other fields of the page are not updated.
7071 **
7072 ** This function returns the total number of cells added to the free-list.
7073 */
7074 static int pageFreeArray(
7075   MemPage *pPg,                   /* Page to edit */
7076   int iFirst,                     /* First cell to delete */
7077   int nCell,                      /* Cells to delete */
7078   CellArray *pCArray              /* Array of cells */
7079 ){
7080   u8 * const aData = pPg->aData;
7081   u8 * const pEnd = &aData[pPg->pBt->usableSize];
7082   u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
7083   int nRet = 0;
7084   int i;
7085   int iEnd = iFirst + nCell;
7086   u8 *pFree = 0;
7087   int szFree = 0;
7088 
7089   for(i=iFirst; i<iEnd; i++){
7090     u8 *pCell = pCArray->apCell[i];
7091     if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
7092       int sz;
7093       /* No need to use cachedCellSize() here.  The sizes of all cells that
7094       ** are to be freed have already been computing while deciding which
7095       ** cells need freeing */
7096       sz = pCArray->szCell[i];  assert( sz>0 );
7097       if( pFree!=(pCell + sz) ){
7098         if( pFree ){
7099           assert( pFree>aData && (pFree - aData)<65536 );
7100           freeSpace(pPg, (u16)(pFree - aData), szFree);
7101         }
7102         pFree = pCell;
7103         szFree = sz;
7104         if( pFree+sz>pEnd ) return 0;
7105       }else{
7106         pFree = pCell;
7107         szFree += sz;
7108       }
7109       nRet++;
7110     }
7111   }
7112   if( pFree ){
7113     assert( pFree>aData && (pFree - aData)<65536 );
7114     freeSpace(pPg, (u16)(pFree - aData), szFree);
7115   }
7116   return nRet;
7117 }
7118 
7119 /*
7120 ** pCArray contains pointers to and sizes of all cells in the page being
7121 ** balanced.  The current page, pPg, has pPg->nCell cells starting with
7122 ** pCArray->apCell[iOld].  After balancing, this page should hold nNew cells
7123 ** starting at apCell[iNew].
7124 **
7125 ** This routine makes the necessary adjustments to pPg so that it contains
7126 ** the correct cells after being balanced.
7127 **
7128 ** The pPg->nFree field is invalid when this function returns. It is the
7129 ** responsibility of the caller to set it correctly.
7130 */
7131 static int editPage(
7132   MemPage *pPg,                   /* Edit this page */
7133   int iOld,                       /* Index of first cell currently on page */
7134   int iNew,                       /* Index of new first cell on page */
7135   int nNew,                       /* Final number of cells on page */
7136   CellArray *pCArray              /* Array of cells and sizes */
7137 ){
7138   u8 * const aData = pPg->aData;
7139   const int hdr = pPg->hdrOffset;
7140   u8 *pBegin = &pPg->aCellIdx[nNew * 2];
7141   int nCell = pPg->nCell;       /* Cells stored on pPg */
7142   u8 *pData;
7143   u8 *pCellptr;
7144   int i;
7145   int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
7146   int iNewEnd = iNew + nNew;
7147 
7148 #ifdef SQLITE_DEBUG
7149   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
7150   memcpy(pTmp, aData, pPg->pBt->usableSize);
7151 #endif
7152 
7153   /* Remove cells from the start and end of the page */
7154   assert( nCell>=0 );
7155   if( iOld<iNew ){
7156     int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
7157     if( nShift>nCell ) return SQLITE_CORRUPT_BKPT;
7158     memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
7159     nCell -= nShift;
7160   }
7161   if( iNewEnd < iOldEnd ){
7162     int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
7163     assert( nCell>=nTail );
7164     nCell -= nTail;
7165   }
7166 
7167   pData = &aData[get2byteNotZero(&aData[hdr+5])];
7168   if( pData<pBegin ) goto editpage_fail;
7169 
7170   /* Add cells to the start of the page */
7171   if( iNew<iOld ){
7172     int nAdd = MIN(nNew,iOld-iNew);
7173     assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
7174     assert( nAdd>=0 );
7175     pCellptr = pPg->aCellIdx;
7176     memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
7177     if( pageInsertArray(
7178           pPg, pBegin, &pData, pCellptr,
7179           iNew, nAdd, pCArray
7180     ) ) goto editpage_fail;
7181     nCell += nAdd;
7182   }
7183 
7184   /* Add any overflow cells */
7185   for(i=0; i<pPg->nOverflow; i++){
7186     int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
7187     if( iCell>=0 && iCell<nNew ){
7188       pCellptr = &pPg->aCellIdx[iCell * 2];
7189       if( nCell>iCell ){
7190         memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
7191       }
7192       nCell++;
7193       cachedCellSize(pCArray, iCell+iNew);
7194       if( pageInsertArray(
7195             pPg, pBegin, &pData, pCellptr,
7196             iCell+iNew, 1, pCArray
7197       ) ) goto editpage_fail;
7198     }
7199   }
7200 
7201   /* Append cells to the end of the page */
7202   assert( nCell>=0 );
7203   pCellptr = &pPg->aCellIdx[nCell*2];
7204   if( pageInsertArray(
7205         pPg, pBegin, &pData, pCellptr,
7206         iNew+nCell, nNew-nCell, pCArray
7207   ) ) goto editpage_fail;
7208 
7209   pPg->nCell = nNew;
7210   pPg->nOverflow = 0;
7211 
7212   put2byte(&aData[hdr+3], pPg->nCell);
7213   put2byte(&aData[hdr+5], pData - aData);
7214 
7215 #ifdef SQLITE_DEBUG
7216   for(i=0; i<nNew && !CORRUPT_DB; i++){
7217     u8 *pCell = pCArray->apCell[i+iNew];
7218     int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
7219     if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
7220       pCell = &pTmp[pCell - aData];
7221     }
7222     assert( 0==memcmp(pCell, &aData[iOff],
7223             pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
7224   }
7225 #endif
7226 
7227   return SQLITE_OK;
7228  editpage_fail:
7229   /* Unable to edit this page. Rebuild it from scratch instead. */
7230   populateCellCache(pCArray, iNew, nNew);
7231   return rebuildPage(pCArray, iNew, nNew, pPg);
7232 }
7233 
7234 
7235 #ifndef SQLITE_OMIT_QUICKBALANCE
7236 /*
7237 ** This version of balance() handles the common special case where
7238 ** a new entry is being inserted on the extreme right-end of the
7239 ** tree, in other words, when the new entry will become the largest
7240 ** entry in the tree.
7241 **
7242 ** Instead of trying to balance the 3 right-most leaf pages, just add
7243 ** a new page to the right-hand side and put the one new entry in
7244 ** that page.  This leaves the right side of the tree somewhat
7245 ** unbalanced.  But odds are that we will be inserting new entries
7246 ** at the end soon afterwards so the nearly empty page will quickly
7247 ** fill up.  On average.
7248 **
7249 ** pPage is the leaf page which is the right-most page in the tree.
7250 ** pParent is its parent.  pPage must have a single overflow entry
7251 ** which is also the right-most entry on the page.
7252 **
7253 ** The pSpace buffer is used to store a temporary copy of the divider
7254 ** cell that will be inserted into pParent. Such a cell consists of a 4
7255 ** byte page number followed by a variable length integer. In other
7256 ** words, at most 13 bytes. Hence the pSpace buffer must be at
7257 ** least 13 bytes in size.
7258 */
7259 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
7260   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
7261   MemPage *pNew;                       /* Newly allocated page */
7262   int rc;                              /* Return Code */
7263   Pgno pgnoNew;                        /* Page number of pNew */
7264 
7265   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
7266   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7267   assert( pPage->nOverflow==1 );
7268 
7269   if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;  /* dbfuzz001.test */
7270   assert( pPage->nFree>=0 );
7271   assert( pParent->nFree>=0 );
7272 
7273   /* Allocate a new page. This page will become the right-sibling of
7274   ** pPage. Make the parent page writable, so that the new divider cell
7275   ** may be inserted. If both these operations are successful, proceed.
7276   */
7277   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
7278 
7279   if( rc==SQLITE_OK ){
7280 
7281     u8 *pOut = &pSpace[4];
7282     u8 *pCell = pPage->apOvfl[0];
7283     u16 szCell = pPage->xCellSize(pPage, pCell);
7284     u8 *pStop;
7285     CellArray b;
7286 
7287     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
7288     assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
7289     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
7290     b.nCell = 1;
7291     b.pRef = pPage;
7292     b.apCell = &pCell;
7293     b.szCell = &szCell;
7294     b.apEnd[0] = pPage->aDataEnd;
7295     b.ixNx[0] = 2;
7296     rc = rebuildPage(&b, 0, 1, pNew);
7297     if( NEVER(rc) ){
7298       releasePage(pNew);
7299       return rc;
7300     }
7301     pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
7302 
7303     /* If this is an auto-vacuum database, update the pointer map
7304     ** with entries for the new page, and any pointer from the
7305     ** cell on the page to an overflow page. If either of these
7306     ** operations fails, the return code is set, but the contents
7307     ** of the parent page are still manipulated by thh code below.
7308     ** That is Ok, at this point the parent page is guaranteed to
7309     ** be marked as dirty. Returning an error code will cause a
7310     ** rollback, undoing any changes made to the parent page.
7311     */
7312     if( ISAUTOVACUUM ){
7313       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
7314       if( szCell>pNew->minLocal ){
7315         ptrmapPutOvflPtr(pNew, pNew, pCell, &rc);
7316       }
7317     }
7318 
7319     /* Create a divider cell to insert into pParent. The divider cell
7320     ** consists of a 4-byte page number (the page number of pPage) and
7321     ** a variable length key value (which must be the same value as the
7322     ** largest key on pPage).
7323     **
7324     ** To find the largest key value on pPage, first find the right-most
7325     ** cell on pPage. The first two fields of this cell are the
7326     ** record-length (a variable length integer at most 32-bits in size)
7327     ** and the key value (a variable length integer, may have any value).
7328     ** The first of the while(...) loops below skips over the record-length
7329     ** field. The second while(...) loop copies the key value from the
7330     ** cell on pPage into the pSpace buffer.
7331     */
7332     pCell = findCell(pPage, pPage->nCell-1);
7333     pStop = &pCell[9];
7334     while( (*(pCell++)&0x80) && pCell<pStop );
7335     pStop = &pCell[9];
7336     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
7337 
7338     /* Insert the new divider cell into pParent. */
7339     if( rc==SQLITE_OK ){
7340       insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
7341                    0, pPage->pgno, &rc);
7342     }
7343 
7344     /* Set the right-child pointer of pParent to point to the new page. */
7345     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
7346 
7347     /* Release the reference to the new page. */
7348     releasePage(pNew);
7349   }
7350 
7351   return rc;
7352 }
7353 #endif /* SQLITE_OMIT_QUICKBALANCE */
7354 
7355 #if 0
7356 /*
7357 ** This function does not contribute anything to the operation of SQLite.
7358 ** it is sometimes activated temporarily while debugging code responsible
7359 ** for setting pointer-map entries.
7360 */
7361 static int ptrmapCheckPages(MemPage **apPage, int nPage){
7362   int i, j;
7363   for(i=0; i<nPage; i++){
7364     Pgno n;
7365     u8 e;
7366     MemPage *pPage = apPage[i];
7367     BtShared *pBt = pPage->pBt;
7368     assert( pPage->isInit );
7369 
7370     for(j=0; j<pPage->nCell; j++){
7371       CellInfo info;
7372       u8 *z;
7373 
7374       z = findCell(pPage, j);
7375       pPage->xParseCell(pPage, z, &info);
7376       if( info.nLocal<info.nPayload ){
7377         Pgno ovfl = get4byte(&z[info.nSize-4]);
7378         ptrmapGet(pBt, ovfl, &e, &n);
7379         assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
7380       }
7381       if( !pPage->leaf ){
7382         Pgno child = get4byte(z);
7383         ptrmapGet(pBt, child, &e, &n);
7384         assert( n==pPage->pgno && e==PTRMAP_BTREE );
7385       }
7386     }
7387     if( !pPage->leaf ){
7388       Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
7389       ptrmapGet(pBt, child, &e, &n);
7390       assert( n==pPage->pgno && e==PTRMAP_BTREE );
7391     }
7392   }
7393   return 1;
7394 }
7395 #endif
7396 
7397 /*
7398 ** This function is used to copy the contents of the b-tree node stored
7399 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
7400 ** the pointer-map entries for each child page are updated so that the
7401 ** parent page stored in the pointer map is page pTo. If pFrom contained
7402 ** any cells with overflow page pointers, then the corresponding pointer
7403 ** map entries are also updated so that the parent page is page pTo.
7404 **
7405 ** If pFrom is currently carrying any overflow cells (entries in the
7406 ** MemPage.apOvfl[] array), they are not copied to pTo.
7407 **
7408 ** Before returning, page pTo is reinitialized using btreeInitPage().
7409 **
7410 ** The performance of this function is not critical. It is only used by
7411 ** the balance_shallower() and balance_deeper() procedures, neither of
7412 ** which are called often under normal circumstances.
7413 */
7414 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
7415   if( (*pRC)==SQLITE_OK ){
7416     BtShared * const pBt = pFrom->pBt;
7417     u8 * const aFrom = pFrom->aData;
7418     u8 * const aTo = pTo->aData;
7419     int const iFromHdr = pFrom->hdrOffset;
7420     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
7421     int rc;
7422     int iData;
7423 
7424 
7425     assert( pFrom->isInit );
7426     assert( pFrom->nFree>=iToHdr );
7427     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
7428 
7429     /* Copy the b-tree node content from page pFrom to page pTo. */
7430     iData = get2byte(&aFrom[iFromHdr+5]);
7431     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
7432     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
7433 
7434     /* Reinitialize page pTo so that the contents of the MemPage structure
7435     ** match the new data. The initialization of pTo can actually fail under
7436     ** fairly obscure circumstances, even though it is a copy of initialized
7437     ** page pFrom.
7438     */
7439     pTo->isInit = 0;
7440     rc = btreeInitPage(pTo);
7441     if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo);
7442     if( rc!=SQLITE_OK ){
7443       *pRC = rc;
7444       return;
7445     }
7446 
7447     /* If this is an auto-vacuum database, update the pointer-map entries
7448     ** for any b-tree or overflow pages that pTo now contains the pointers to.
7449     */
7450     if( ISAUTOVACUUM ){
7451       *pRC = setChildPtrmaps(pTo);
7452     }
7453   }
7454 }
7455 
7456 /*
7457 ** This routine redistributes cells on the iParentIdx'th child of pParent
7458 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
7459 ** same amount of free space. Usually a single sibling on either side of the
7460 ** page are used in the balancing, though both siblings might come from one
7461 ** side if the page is the first or last child of its parent. If the page
7462 ** has fewer than 2 siblings (something which can only happen if the page
7463 ** is a root page or a child of a root page) then all available siblings
7464 ** participate in the balancing.
7465 **
7466 ** The number of siblings of the page might be increased or decreased by
7467 ** one or two in an effort to keep pages nearly full but not over full.
7468 **
7469 ** Note that when this routine is called, some of the cells on the page
7470 ** might not actually be stored in MemPage.aData[]. This can happen
7471 ** if the page is overfull. This routine ensures that all cells allocated
7472 ** to the page and its siblings fit into MemPage.aData[] before returning.
7473 **
7474 ** In the course of balancing the page and its siblings, cells may be
7475 ** inserted into or removed from the parent page (pParent). Doing so
7476 ** may cause the parent page to become overfull or underfull. If this
7477 ** happens, it is the responsibility of the caller to invoke the correct
7478 ** balancing routine to fix this problem (see the balance() routine).
7479 **
7480 ** If this routine fails for any reason, it might leave the database
7481 ** in a corrupted state. So if this routine fails, the database should
7482 ** be rolled back.
7483 **
7484 ** The third argument to this function, aOvflSpace, is a pointer to a
7485 ** buffer big enough to hold one page. If while inserting cells into the parent
7486 ** page (pParent) the parent page becomes overfull, this buffer is
7487 ** used to store the parent's overflow cells. Because this function inserts
7488 ** a maximum of four divider cells into the parent page, and the maximum
7489 ** size of a cell stored within an internal node is always less than 1/4
7490 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
7491 ** enough for all overflow cells.
7492 **
7493 ** If aOvflSpace is set to a null pointer, this function returns
7494 ** SQLITE_NOMEM.
7495 */
7496 static int balance_nonroot(
7497   MemPage *pParent,               /* Parent page of siblings being balanced */
7498   int iParentIdx,                 /* Index of "the page" in pParent */
7499   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
7500   int isRoot,                     /* True if pParent is a root-page */
7501   int bBulk                       /* True if this call is part of a bulk load */
7502 ){
7503   BtShared *pBt;               /* The whole database */
7504   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
7505   int nNew = 0;                /* Number of pages in apNew[] */
7506   int nOld;                    /* Number of pages in apOld[] */
7507   int i, j, k;                 /* Loop counters */
7508   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
7509   int rc = SQLITE_OK;          /* The return code */
7510   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
7511   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
7512   int usableSpace;             /* Bytes in pPage beyond the header */
7513   int pageFlags;               /* Value of pPage->aData[0] */
7514   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
7515   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
7516   int szScratch;               /* Size of scratch memory requested */
7517   MemPage *apOld[NB];          /* pPage and up to two siblings */
7518   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
7519   u8 *pRight;                  /* Location in parent of right-sibling pointer */
7520   u8 *apDiv[NB-1];             /* Divider cells in pParent */
7521   int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
7522   int cntOld[NB+2];            /* Old index in b.apCell[] */
7523   int szNew[NB+2];             /* Combined size of cells placed on i-th page */
7524   u8 *aSpace1;                 /* Space for copies of dividers cells */
7525   Pgno pgno;                   /* Temp var to store a page number in */
7526   u8 abDone[NB+2];             /* True after i'th new page is populated */
7527   Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
7528   Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
7529   u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
7530   CellArray b;                  /* Parsed information on cells being balanced */
7531 
7532   memset(abDone, 0, sizeof(abDone));
7533   b.nCell = 0;
7534   b.apCell = 0;
7535   pBt = pParent->pBt;
7536   assert( sqlite3_mutex_held(pBt->mutex) );
7537   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7538 
7539   /* At this point pParent may have at most one overflow cell. And if
7540   ** this overflow cell is present, it must be the cell with
7541   ** index iParentIdx. This scenario comes about when this function
7542   ** is called (indirectly) from sqlite3BtreeDelete().
7543   */
7544   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
7545   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
7546 
7547   if( !aOvflSpace ){
7548     return SQLITE_NOMEM_BKPT;
7549   }
7550   assert( pParent->nFree>=0 );
7551 
7552   /* Find the sibling pages to balance. Also locate the cells in pParent
7553   ** that divide the siblings. An attempt is made to find NN siblings on
7554   ** either side of pPage. More siblings are taken from one side, however,
7555   ** if there are fewer than NN siblings on the other side. If pParent
7556   ** has NB or fewer children then all children of pParent are taken.
7557   **
7558   ** This loop also drops the divider cells from the parent page. This
7559   ** way, the remainder of the function does not have to deal with any
7560   ** overflow cells in the parent page, since if any existed they will
7561   ** have already been removed.
7562   */
7563   i = pParent->nOverflow + pParent->nCell;
7564   if( i<2 ){
7565     nxDiv = 0;
7566   }else{
7567     assert( bBulk==0 || bBulk==1 );
7568     if( iParentIdx==0 ){
7569       nxDiv = 0;
7570     }else if( iParentIdx==i ){
7571       nxDiv = i-2+bBulk;
7572     }else{
7573       nxDiv = iParentIdx-1;
7574     }
7575     i = 2-bBulk;
7576   }
7577   nOld = i+1;
7578   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
7579     pRight = &pParent->aData[pParent->hdrOffset+8];
7580   }else{
7581     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
7582   }
7583   pgno = get4byte(pRight);
7584   while( 1 ){
7585     rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
7586     if( rc ){
7587       memset(apOld, 0, (i+1)*sizeof(MemPage*));
7588       goto balance_cleanup;
7589     }
7590     if( apOld[i]->nFree<0 ){
7591       rc = btreeComputeFreeSpace(apOld[i]);
7592       if( rc ){
7593         memset(apOld, 0, (i)*sizeof(MemPage*));
7594         goto balance_cleanup;
7595       }
7596     }
7597     if( (i--)==0 ) break;
7598 
7599     if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
7600       apDiv[i] = pParent->apOvfl[0];
7601       pgno = get4byte(apDiv[i]);
7602       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7603       pParent->nOverflow = 0;
7604     }else{
7605       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
7606       pgno = get4byte(apDiv[i]);
7607       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7608 
7609       /* Drop the cell from the parent page. apDiv[i] still points to
7610       ** the cell within the parent, even though it has been dropped.
7611       ** This is safe because dropping a cell only overwrites the first
7612       ** four bytes of it, and this function does not need the first
7613       ** four bytes of the divider cell. So the pointer is safe to use
7614       ** later on.
7615       **
7616       ** But not if we are in secure-delete mode. In secure-delete mode,
7617       ** the dropCell() routine will overwrite the entire cell with zeroes.
7618       ** In this case, temporarily copy the cell into the aOvflSpace[]
7619       ** buffer. It will be copied out again as soon as the aSpace[] buffer
7620       ** is allocated.  */
7621       if( pBt->btsFlags & BTS_FAST_SECURE ){
7622         int iOff;
7623 
7624         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
7625         if( (iOff+szNew[i])>(int)pBt->usableSize ){
7626           rc = SQLITE_CORRUPT_BKPT;
7627           memset(apOld, 0, (i+1)*sizeof(MemPage*));
7628           goto balance_cleanup;
7629         }else{
7630           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
7631           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
7632         }
7633       }
7634       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
7635     }
7636   }
7637 
7638   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
7639   ** alignment */
7640   nMaxCells = nOld*(MX_CELL(pBt) + ArraySize(pParent->apOvfl));
7641   nMaxCells = (nMaxCells + 3)&~3;
7642 
7643   /*
7644   ** Allocate space for memory structures
7645   */
7646   szScratch =
7647        nMaxCells*sizeof(u8*)                       /* b.apCell */
7648      + nMaxCells*sizeof(u16)                       /* b.szCell */
7649      + pBt->pageSize;                              /* aSpace1 */
7650 
7651   assert( szScratch<=7*(int)pBt->pageSize );
7652   b.apCell = sqlite3StackAllocRaw(0, szScratch );
7653   if( b.apCell==0 ){
7654     rc = SQLITE_NOMEM_BKPT;
7655     goto balance_cleanup;
7656   }
7657   b.szCell = (u16*)&b.apCell[nMaxCells];
7658   aSpace1 = (u8*)&b.szCell[nMaxCells];
7659   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
7660 
7661   /*
7662   ** Load pointers to all cells on sibling pages and the divider cells
7663   ** into the local b.apCell[] array.  Make copies of the divider cells
7664   ** into space obtained from aSpace1[]. The divider cells have already
7665   ** been removed from pParent.
7666   **
7667   ** If the siblings are on leaf pages, then the child pointers of the
7668   ** divider cells are stripped from the cells before they are copied
7669   ** into aSpace1[].  In this way, all cells in b.apCell[] are without
7670   ** child pointers.  If siblings are not leaves, then all cell in
7671   ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
7672   ** are alike.
7673   **
7674   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
7675   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
7676   */
7677   b.pRef = apOld[0];
7678   leafCorrection = b.pRef->leaf*4;
7679   leafData = b.pRef->intKeyLeaf;
7680   for(i=0; i<nOld; i++){
7681     MemPage *pOld = apOld[i];
7682     int limit = pOld->nCell;
7683     u8 *aData = pOld->aData;
7684     u16 maskPage = pOld->maskPage;
7685     u8 *piCell = aData + pOld->cellOffset;
7686     u8 *piEnd;
7687     VVA_ONLY( int nCellAtStart = b.nCell; )
7688 
7689     /* Verify that all sibling pages are of the same "type" (table-leaf,
7690     ** table-interior, index-leaf, or index-interior).
7691     */
7692     if( pOld->aData[0]!=apOld[0]->aData[0] ){
7693       rc = SQLITE_CORRUPT_BKPT;
7694       goto balance_cleanup;
7695     }
7696 
7697     /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
7698     ** contains overflow cells, include them in the b.apCell[] array
7699     ** in the correct spot.
7700     **
7701     ** Note that when there are multiple overflow cells, it is always the
7702     ** case that they are sequential and adjacent.  This invariant arises
7703     ** because multiple overflows can only occurs when inserting divider
7704     ** cells into a parent on a prior balance, and divider cells are always
7705     ** adjacent and are inserted in order.  There is an assert() tagged
7706     ** with "NOTE 1" in the overflow cell insertion loop to prove this
7707     ** invariant.
7708     **
7709     ** This must be done in advance.  Once the balance starts, the cell
7710     ** offset section of the btree page will be overwritten and we will no
7711     ** long be able to find the cells if a pointer to each cell is not saved
7712     ** first.
7713     */
7714     memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
7715     if( pOld->nOverflow>0 ){
7716       if( NEVER(limit<pOld->aiOvfl[0]) ){
7717         rc = SQLITE_CORRUPT_BKPT;
7718         goto balance_cleanup;
7719       }
7720       limit = pOld->aiOvfl[0];
7721       for(j=0; j<limit; j++){
7722         b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7723         piCell += 2;
7724         b.nCell++;
7725       }
7726       for(k=0; k<pOld->nOverflow; k++){
7727         assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
7728         b.apCell[b.nCell] = pOld->apOvfl[k];
7729         b.nCell++;
7730       }
7731     }
7732     piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
7733     while( piCell<piEnd ){
7734       assert( b.nCell<nMaxCells );
7735       b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7736       piCell += 2;
7737       b.nCell++;
7738     }
7739     assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) );
7740 
7741     cntOld[i] = b.nCell;
7742     if( i<nOld-1 && !leafData){
7743       u16 sz = (u16)szNew[i];
7744       u8 *pTemp;
7745       assert( b.nCell<nMaxCells );
7746       b.szCell[b.nCell] = sz;
7747       pTemp = &aSpace1[iSpace1];
7748       iSpace1 += sz;
7749       assert( sz<=pBt->maxLocal+23 );
7750       assert( iSpace1 <= (int)pBt->pageSize );
7751       memcpy(pTemp, apDiv[i], sz);
7752       b.apCell[b.nCell] = pTemp+leafCorrection;
7753       assert( leafCorrection==0 || leafCorrection==4 );
7754       b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
7755       if( !pOld->leaf ){
7756         assert( leafCorrection==0 );
7757         assert( pOld->hdrOffset==0 );
7758         /* The right pointer of the child page pOld becomes the left
7759         ** pointer of the divider cell */
7760         memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
7761       }else{
7762         assert( leafCorrection==4 );
7763         while( b.szCell[b.nCell]<4 ){
7764           /* Do not allow any cells smaller than 4 bytes. If a smaller cell
7765           ** does exist, pad it with 0x00 bytes. */
7766           assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
7767           assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
7768           aSpace1[iSpace1++] = 0x00;
7769           b.szCell[b.nCell]++;
7770         }
7771       }
7772       b.nCell++;
7773     }
7774   }
7775 
7776   /*
7777   ** Figure out the number of pages needed to hold all b.nCell cells.
7778   ** Store this number in "k".  Also compute szNew[] which is the total
7779   ** size of all cells on the i-th page and cntNew[] which is the index
7780   ** in b.apCell[] of the cell that divides page i from page i+1.
7781   ** cntNew[k] should equal b.nCell.
7782   **
7783   ** Values computed by this block:
7784   **
7785   **           k: The total number of sibling pages
7786   **    szNew[i]: Spaced used on the i-th sibling page.
7787   **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
7788   **              the right of the i-th sibling page.
7789   ** usableSpace: Number of bytes of space available on each sibling.
7790   **
7791   */
7792   usableSpace = pBt->usableSize - 12 + leafCorrection;
7793   for(i=k=0; i<nOld; i++, k++){
7794     MemPage *p = apOld[i];
7795     b.apEnd[k] = p->aDataEnd;
7796     b.ixNx[k] = cntOld[i];
7797     if( k && b.ixNx[k]==b.ixNx[k-1] ){
7798       k--;  /* Omit b.ixNx[] entry for child pages with no cells */
7799     }
7800     if( !leafData ){
7801       k++;
7802       b.apEnd[k] = pParent->aDataEnd;
7803       b.ixNx[k] = cntOld[i]+1;
7804     }
7805     assert( p->nFree>=0 );
7806     szNew[i] = usableSpace - p->nFree;
7807     for(j=0; j<p->nOverflow; j++){
7808       szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
7809     }
7810     cntNew[i] = cntOld[i];
7811   }
7812   k = nOld;
7813   for(i=0; i<k; i++){
7814     int sz;
7815     while( szNew[i]>usableSpace ){
7816       if( i+1>=k ){
7817         k = i+2;
7818         if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
7819         szNew[k-1] = 0;
7820         cntNew[k-1] = b.nCell;
7821       }
7822       sz = 2 + cachedCellSize(&b, cntNew[i]-1);
7823       szNew[i] -= sz;
7824       if( !leafData ){
7825         if( cntNew[i]<b.nCell ){
7826           sz = 2 + cachedCellSize(&b, cntNew[i]);
7827         }else{
7828           sz = 0;
7829         }
7830       }
7831       szNew[i+1] += sz;
7832       cntNew[i]--;
7833     }
7834     while( cntNew[i]<b.nCell ){
7835       sz = 2 + cachedCellSize(&b, cntNew[i]);
7836       if( szNew[i]+sz>usableSpace ) break;
7837       szNew[i] += sz;
7838       cntNew[i]++;
7839       if( !leafData ){
7840         if( cntNew[i]<b.nCell ){
7841           sz = 2 + cachedCellSize(&b, cntNew[i]);
7842         }else{
7843           sz = 0;
7844         }
7845       }
7846       szNew[i+1] -= sz;
7847     }
7848     if( cntNew[i]>=b.nCell ){
7849       k = i+1;
7850     }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
7851       rc = SQLITE_CORRUPT_BKPT;
7852       goto balance_cleanup;
7853     }
7854   }
7855 
7856   /*
7857   ** The packing computed by the previous block is biased toward the siblings
7858   ** on the left side (siblings with smaller keys). The left siblings are
7859   ** always nearly full, while the right-most sibling might be nearly empty.
7860   ** The next block of code attempts to adjust the packing of siblings to
7861   ** get a better balance.
7862   **
7863   ** This adjustment is more than an optimization.  The packing above might
7864   ** be so out of balance as to be illegal.  For example, the right-most
7865   ** sibling might be completely empty.  This adjustment is not optional.
7866   */
7867   for(i=k-1; i>0; i--){
7868     int szRight = szNew[i];  /* Size of sibling on the right */
7869     int szLeft = szNew[i-1]; /* Size of sibling on the left */
7870     int r;              /* Index of right-most cell in left sibling */
7871     int d;              /* Index of first cell to the left of right sibling */
7872 
7873     r = cntNew[i-1] - 1;
7874     d = r + 1 - leafData;
7875     (void)cachedCellSize(&b, d);
7876     do{
7877       assert( d<nMaxCells );
7878       assert( r<nMaxCells );
7879       (void)cachedCellSize(&b, r);
7880       if( szRight!=0
7881        && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
7882         break;
7883       }
7884       szRight += b.szCell[d] + 2;
7885       szLeft -= b.szCell[r] + 2;
7886       cntNew[i-1] = r;
7887       r--;
7888       d--;
7889     }while( r>=0 );
7890     szNew[i] = szRight;
7891     szNew[i-1] = szLeft;
7892     if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
7893       rc = SQLITE_CORRUPT_BKPT;
7894       goto balance_cleanup;
7895     }
7896   }
7897 
7898   /* Sanity check:  For a non-corrupt database file one of the follwing
7899   ** must be true:
7900   **    (1) We found one or more cells (cntNew[0])>0), or
7901   **    (2) pPage is a virtual root page.  A virtual root page is when
7902   **        the real root page is page 1 and we are the only child of
7903   **        that page.
7904   */
7905   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
7906   TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
7907     apOld[0]->pgno, apOld[0]->nCell,
7908     nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
7909     nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
7910   ));
7911 
7912   /*
7913   ** Allocate k new pages.  Reuse old pages where possible.
7914   */
7915   pageFlags = apOld[0]->aData[0];
7916   for(i=0; i<k; i++){
7917     MemPage *pNew;
7918     if( i<nOld ){
7919       pNew = apNew[i] = apOld[i];
7920       apOld[i] = 0;
7921       rc = sqlite3PagerWrite(pNew->pDbPage);
7922       nNew++;
7923       if( rc ) goto balance_cleanup;
7924     }else{
7925       assert( i>0 );
7926       rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
7927       if( rc ) goto balance_cleanup;
7928       zeroPage(pNew, pageFlags);
7929       apNew[i] = pNew;
7930       nNew++;
7931       cntOld[i] = b.nCell;
7932 
7933       /* Set the pointer-map entry for the new sibling page. */
7934       if( ISAUTOVACUUM ){
7935         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
7936         if( rc!=SQLITE_OK ){
7937           goto balance_cleanup;
7938         }
7939       }
7940     }
7941   }
7942 
7943   /*
7944   ** Reassign page numbers so that the new pages are in ascending order.
7945   ** This helps to keep entries in the disk file in order so that a scan
7946   ** of the table is closer to a linear scan through the file. That in turn
7947   ** helps the operating system to deliver pages from the disk more rapidly.
7948   **
7949   ** An O(n^2) insertion sort algorithm is used, but since n is never more
7950   ** than (NB+2) (a small constant), that should not be a problem.
7951   **
7952   ** When NB==3, this one optimization makes the database about 25% faster
7953   ** for large insertions and deletions.
7954   */
7955   for(i=0; i<nNew; i++){
7956     aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
7957     aPgFlags[i] = apNew[i]->pDbPage->flags;
7958     for(j=0; j<i; j++){
7959       if( aPgno[j]==aPgno[i] ){
7960         /* This branch is taken if the set of sibling pages somehow contains
7961         ** duplicate entries. This can happen if the database is corrupt.
7962         ** It would be simpler to detect this as part of the loop below, but
7963         ** we do the detection here in order to avoid populating the pager
7964         ** cache with two separate objects associated with the same
7965         ** page number.  */
7966         assert( CORRUPT_DB );
7967         rc = SQLITE_CORRUPT_BKPT;
7968         goto balance_cleanup;
7969       }
7970     }
7971   }
7972   for(i=0; i<nNew; i++){
7973     int iBest = 0;                /* aPgno[] index of page number to use */
7974     for(j=1; j<nNew; j++){
7975       if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
7976     }
7977     pgno = aPgOrder[iBest];
7978     aPgOrder[iBest] = 0xffffffff;
7979     if( iBest!=i ){
7980       if( iBest>i ){
7981         sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
7982       }
7983       sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
7984       apNew[i]->pgno = pgno;
7985     }
7986   }
7987 
7988   TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
7989          "%d(%d nc=%d) %d(%d nc=%d)\n",
7990     apNew[0]->pgno, szNew[0], cntNew[0],
7991     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
7992     nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
7993     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
7994     nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
7995     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
7996     nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
7997     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
7998     nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
7999   ));
8000 
8001   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
8002   assert( nNew>=1 && nNew<=ArraySize(apNew) );
8003   assert( apNew[nNew-1]!=0 );
8004   put4byte(pRight, apNew[nNew-1]->pgno);
8005 
8006   /* If the sibling pages are not leaves, ensure that the right-child pointer
8007   ** of the right-most new sibling page is set to the value that was
8008   ** originally in the same field of the right-most old sibling page. */
8009   if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
8010     MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
8011     memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
8012   }
8013 
8014   /* Make any required updates to pointer map entries associated with
8015   ** cells stored on sibling pages following the balance operation. Pointer
8016   ** map entries associated with divider cells are set by the insertCell()
8017   ** routine. The associated pointer map entries are:
8018   **
8019   **   a) if the cell contains a reference to an overflow chain, the
8020   **      entry associated with the first page in the overflow chain, and
8021   **
8022   **   b) if the sibling pages are not leaves, the child page associated
8023   **      with the cell.
8024   **
8025   ** If the sibling pages are not leaves, then the pointer map entry
8026   ** associated with the right-child of each sibling may also need to be
8027   ** updated. This happens below, after the sibling pages have been
8028   ** populated, not here.
8029   */
8030   if( ISAUTOVACUUM ){
8031     MemPage *pOld;
8032     MemPage *pNew = pOld = apNew[0];
8033     int cntOldNext = pNew->nCell + pNew->nOverflow;
8034     int iNew = 0;
8035     int iOld = 0;
8036 
8037     for(i=0; i<b.nCell; i++){
8038       u8 *pCell = b.apCell[i];
8039       while( i==cntOldNext ){
8040         iOld++;
8041         assert( iOld<nNew || iOld<nOld );
8042         assert( iOld>=0 && iOld<NB );
8043         pOld = iOld<nNew ? apNew[iOld] : apOld[iOld];
8044         cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
8045       }
8046       if( i==cntNew[iNew] ){
8047         pNew = apNew[++iNew];
8048         if( !leafData ) continue;
8049       }
8050 
8051       /* Cell pCell is destined for new sibling page pNew. Originally, it
8052       ** was either part of sibling page iOld (possibly an overflow cell),
8053       ** or else the divider cell to the left of sibling page iOld. So,
8054       ** if sibling page iOld had the same page number as pNew, and if
8055       ** pCell really was a part of sibling page iOld (not a divider or
8056       ** overflow cell), we can skip updating the pointer map entries.  */
8057       if( iOld>=nNew
8058        || pNew->pgno!=aPgno[iOld]
8059        || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd)
8060       ){
8061         if( !leafCorrection ){
8062           ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
8063         }
8064         if( cachedCellSize(&b,i)>pNew->minLocal ){
8065           ptrmapPutOvflPtr(pNew, pOld, pCell, &rc);
8066         }
8067         if( rc ) goto balance_cleanup;
8068       }
8069     }
8070   }
8071 
8072   /* Insert new divider cells into pParent. */
8073   for(i=0; i<nNew-1; i++){
8074     u8 *pCell;
8075     u8 *pTemp;
8076     int sz;
8077     MemPage *pNew = apNew[i];
8078     j = cntNew[i];
8079 
8080     assert( j<nMaxCells );
8081     assert( b.apCell[j]!=0 );
8082     pCell = b.apCell[j];
8083     sz = b.szCell[j] + leafCorrection;
8084     pTemp = &aOvflSpace[iOvflSpace];
8085     if( !pNew->leaf ){
8086       memcpy(&pNew->aData[8], pCell, 4);
8087     }else if( leafData ){
8088       /* If the tree is a leaf-data tree, and the siblings are leaves,
8089       ** then there is no divider cell in b.apCell[]. Instead, the divider
8090       ** cell consists of the integer key for the right-most cell of
8091       ** the sibling-page assembled above only.
8092       */
8093       CellInfo info;
8094       j--;
8095       pNew->xParseCell(pNew, b.apCell[j], &info);
8096       pCell = pTemp;
8097       sz = 4 + putVarint(&pCell[4], info.nKey);
8098       pTemp = 0;
8099     }else{
8100       pCell -= 4;
8101       /* Obscure case for non-leaf-data trees: If the cell at pCell was
8102       ** previously stored on a leaf node, and its reported size was 4
8103       ** bytes, then it may actually be smaller than this
8104       ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
8105       ** any cell). But it is important to pass the correct size to
8106       ** insertCell(), so reparse the cell now.
8107       **
8108       ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
8109       ** and WITHOUT ROWID tables with exactly one column which is the
8110       ** primary key.
8111       */
8112       if( b.szCell[j]==4 ){
8113         assert(leafCorrection==4);
8114         sz = pParent->xCellSize(pParent, pCell);
8115       }
8116     }
8117     iOvflSpace += sz;
8118     assert( sz<=pBt->maxLocal+23 );
8119     assert( iOvflSpace <= (int)pBt->pageSize );
8120     insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
8121     if( rc!=SQLITE_OK ) goto balance_cleanup;
8122     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
8123   }
8124 
8125   /* Now update the actual sibling pages. The order in which they are updated
8126   ** is important, as this code needs to avoid disrupting any page from which
8127   ** cells may still to be read. In practice, this means:
8128   **
8129   **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
8130   **      then it is not safe to update page apNew[iPg] until after
8131   **      the left-hand sibling apNew[iPg-1] has been updated.
8132   **
8133   **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
8134   **      then it is not safe to update page apNew[iPg] until after
8135   **      the right-hand sibling apNew[iPg+1] has been updated.
8136   **
8137   ** If neither of the above apply, the page is safe to update.
8138   **
8139   ** The iPg value in the following loop starts at nNew-1 goes down
8140   ** to 0, then back up to nNew-1 again, thus making two passes over
8141   ** the pages.  On the initial downward pass, only condition (1) above
8142   ** needs to be tested because (2) will always be true from the previous
8143   ** step.  On the upward pass, both conditions are always true, so the
8144   ** upwards pass simply processes pages that were missed on the downward
8145   ** pass.
8146   */
8147   for(i=1-nNew; i<nNew; i++){
8148     int iPg = i<0 ? -i : i;
8149     assert( iPg>=0 && iPg<nNew );
8150     if( abDone[iPg] ) continue;         /* Skip pages already processed */
8151     if( i>=0                            /* On the upwards pass, or... */
8152      || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
8153     ){
8154       int iNew;
8155       int iOld;
8156       int nNewCell;
8157 
8158       /* Verify condition (1):  If cells are moving left, update iPg
8159       ** only after iPg-1 has already been updated. */
8160       assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
8161 
8162       /* Verify condition (2):  If cells are moving right, update iPg
8163       ** only after iPg+1 has already been updated. */
8164       assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
8165 
8166       if( iPg==0 ){
8167         iNew = iOld = 0;
8168         nNewCell = cntNew[0];
8169       }else{
8170         iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
8171         iNew = cntNew[iPg-1] + !leafData;
8172         nNewCell = cntNew[iPg] - iNew;
8173       }
8174 
8175       rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
8176       if( rc ) goto balance_cleanup;
8177       abDone[iPg]++;
8178       apNew[iPg]->nFree = usableSpace-szNew[iPg];
8179       assert( apNew[iPg]->nOverflow==0 );
8180       assert( apNew[iPg]->nCell==nNewCell );
8181     }
8182   }
8183 
8184   /* All pages have been processed exactly once */
8185   assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
8186 
8187   assert( nOld>0 );
8188   assert( nNew>0 );
8189 
8190   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
8191     /* The root page of the b-tree now contains no cells. The only sibling
8192     ** page is the right-child of the parent. Copy the contents of the
8193     ** child page into the parent, decreasing the overall height of the
8194     ** b-tree structure by one. This is described as the "balance-shallower"
8195     ** sub-algorithm in some documentation.
8196     **
8197     ** If this is an auto-vacuum database, the call to copyNodeContent()
8198     ** sets all pointer-map entries corresponding to database image pages
8199     ** for which the pointer is stored within the content being copied.
8200     **
8201     ** It is critical that the child page be defragmented before being
8202     ** copied into the parent, because if the parent is page 1 then it will
8203     ** by smaller than the child due to the database header, and so all the
8204     ** free space needs to be up front.
8205     */
8206     assert( nNew==1 || CORRUPT_DB );
8207     rc = defragmentPage(apNew[0], -1);
8208     testcase( rc!=SQLITE_OK );
8209     assert( apNew[0]->nFree ==
8210         (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset
8211           - apNew[0]->nCell*2)
8212       || rc!=SQLITE_OK
8213     );
8214     copyNodeContent(apNew[0], pParent, &rc);
8215     freePage(apNew[0], &rc);
8216   }else if( ISAUTOVACUUM && !leafCorrection ){
8217     /* Fix the pointer map entries associated with the right-child of each
8218     ** sibling page. All other pointer map entries have already been taken
8219     ** care of.  */
8220     for(i=0; i<nNew; i++){
8221       u32 key = get4byte(&apNew[i]->aData[8]);
8222       ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
8223     }
8224   }
8225 
8226   assert( pParent->isInit );
8227   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
8228           nOld, nNew, b.nCell));
8229 
8230   /* Free any old pages that were not reused as new pages.
8231   */
8232   for(i=nNew; i<nOld; i++){
8233     freePage(apOld[i], &rc);
8234   }
8235 
8236 #if 0
8237   if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
8238     /* The ptrmapCheckPages() contains assert() statements that verify that
8239     ** all pointer map pages are set correctly. This is helpful while
8240     ** debugging. This is usually disabled because a corrupt database may
8241     ** cause an assert() statement to fail.  */
8242     ptrmapCheckPages(apNew, nNew);
8243     ptrmapCheckPages(&pParent, 1);
8244   }
8245 #endif
8246 
8247   /*
8248   ** Cleanup before returning.
8249   */
8250 balance_cleanup:
8251   sqlite3StackFree(0, b.apCell);
8252   for(i=0; i<nOld; i++){
8253     releasePage(apOld[i]);
8254   }
8255   for(i=0; i<nNew; i++){
8256     releasePage(apNew[i]);
8257   }
8258 
8259   return rc;
8260 }
8261 
8262 
8263 /*
8264 ** This function is called when the root page of a b-tree structure is
8265 ** overfull (has one or more overflow pages).
8266 **
8267 ** A new child page is allocated and the contents of the current root
8268 ** page, including overflow cells, are copied into the child. The root
8269 ** page is then overwritten to make it an empty page with the right-child
8270 ** pointer pointing to the new page.
8271 **
8272 ** Before returning, all pointer-map entries corresponding to pages
8273 ** that the new child-page now contains pointers to are updated. The
8274 ** entry corresponding to the new right-child pointer of the root
8275 ** page is also updated.
8276 **
8277 ** If successful, *ppChild is set to contain a reference to the child
8278 ** page and SQLITE_OK is returned. In this case the caller is required
8279 ** to call releasePage() on *ppChild exactly once. If an error occurs,
8280 ** an error code is returned and *ppChild is set to 0.
8281 */
8282 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
8283   int rc;                        /* Return value from subprocedures */
8284   MemPage *pChild = 0;           /* Pointer to a new child page */
8285   Pgno pgnoChild = 0;            /* Page number of the new child page */
8286   BtShared *pBt = pRoot->pBt;    /* The BTree */
8287 
8288   assert( pRoot->nOverflow>0 );
8289   assert( sqlite3_mutex_held(pBt->mutex) );
8290 
8291   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
8292   ** page that will become the new right-child of pPage. Copy the contents
8293   ** of the node stored on pRoot into the new child page.
8294   */
8295   rc = sqlite3PagerWrite(pRoot->pDbPage);
8296   if( rc==SQLITE_OK ){
8297     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
8298     copyNodeContent(pRoot, pChild, &rc);
8299     if( ISAUTOVACUUM ){
8300       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
8301     }
8302   }
8303   if( rc ){
8304     *ppChild = 0;
8305     releasePage(pChild);
8306     return rc;
8307   }
8308   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
8309   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
8310   assert( pChild->nCell==pRoot->nCell || CORRUPT_DB );
8311 
8312   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
8313 
8314   /* Copy the overflow cells from pRoot to pChild */
8315   memcpy(pChild->aiOvfl, pRoot->aiOvfl,
8316          pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
8317   memcpy(pChild->apOvfl, pRoot->apOvfl,
8318          pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
8319   pChild->nOverflow = pRoot->nOverflow;
8320 
8321   /* Zero the contents of pRoot. Then install pChild as the right-child. */
8322   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
8323   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
8324 
8325   *ppChild = pChild;
8326   return SQLITE_OK;
8327 }
8328 
8329 /*
8330 ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid
8331 ** on the same B-tree as pCur.
8332 **
8333 ** This can if a database is corrupt with two or more SQL tables
8334 ** pointing to the same b-tree.  If an insert occurs on one SQL table
8335 ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL
8336 ** table linked to the same b-tree.  If the secondary insert causes a
8337 ** rebalance, that can change content out from under the cursor on the
8338 ** first SQL table, violating invariants on the first insert.
8339 */
8340 static int anotherValidCursor(BtCursor *pCur){
8341   BtCursor *pOther;
8342   for(pOther=pCur->pBt->pCursor; pOther; pOther=pOther->pNext){
8343     if( pOther!=pCur
8344      && pOther->eState==CURSOR_VALID
8345      && pOther->pPage==pCur->pPage
8346     ){
8347       return SQLITE_CORRUPT_BKPT;
8348     }
8349   }
8350   return SQLITE_OK;
8351 }
8352 
8353 /*
8354 ** The page that pCur currently points to has just been modified in
8355 ** some way. This function figures out if this modification means the
8356 ** tree needs to be balanced, and if so calls the appropriate balancing
8357 ** routine. Balancing routines are:
8358 **
8359 **   balance_quick()
8360 **   balance_deeper()
8361 **   balance_nonroot()
8362 */
8363 static int balance(BtCursor *pCur){
8364   int rc = SQLITE_OK;
8365   const int nMin = pCur->pBt->usableSize * 2 / 3;
8366   u8 aBalanceQuickSpace[13];
8367   u8 *pFree = 0;
8368 
8369   VVA_ONLY( int balance_quick_called = 0 );
8370   VVA_ONLY( int balance_deeper_called = 0 );
8371 
8372   do {
8373     int iPage;
8374     MemPage *pPage = pCur->pPage;
8375 
8376     if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break;
8377     if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
8378       break;
8379     }else if( (iPage = pCur->iPage)==0 ){
8380       if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){
8381         /* The root page of the b-tree is overfull. In this case call the
8382         ** balance_deeper() function to create a new child for the root-page
8383         ** and copy the current contents of the root-page to it. The
8384         ** next iteration of the do-loop will balance the child page.
8385         */
8386         assert( balance_deeper_called==0 );
8387         VVA_ONLY( balance_deeper_called++ );
8388         rc = balance_deeper(pPage, &pCur->apPage[1]);
8389         if( rc==SQLITE_OK ){
8390           pCur->iPage = 1;
8391           pCur->ix = 0;
8392           pCur->aiIdx[0] = 0;
8393           pCur->apPage[0] = pPage;
8394           pCur->pPage = pCur->apPage[1];
8395           assert( pCur->pPage->nOverflow );
8396         }
8397       }else{
8398         break;
8399       }
8400     }else{
8401       MemPage * const pParent = pCur->apPage[iPage-1];
8402       int const iIdx = pCur->aiIdx[iPage-1];
8403 
8404       rc = sqlite3PagerWrite(pParent->pDbPage);
8405       if( rc==SQLITE_OK && pParent->nFree<0 ){
8406         rc = btreeComputeFreeSpace(pParent);
8407       }
8408       if( rc==SQLITE_OK ){
8409 #ifndef SQLITE_OMIT_QUICKBALANCE
8410         if( pPage->intKeyLeaf
8411          && pPage->nOverflow==1
8412          && pPage->aiOvfl[0]==pPage->nCell
8413          && pParent->pgno!=1
8414          && pParent->nCell==iIdx
8415         ){
8416           /* Call balance_quick() to create a new sibling of pPage on which
8417           ** to store the overflow cell. balance_quick() inserts a new cell
8418           ** into pParent, which may cause pParent overflow. If this
8419           ** happens, the next iteration of the do-loop will balance pParent
8420           ** use either balance_nonroot() or balance_deeper(). Until this
8421           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
8422           ** buffer.
8423           **
8424           ** The purpose of the following assert() is to check that only a
8425           ** single call to balance_quick() is made for each call to this
8426           ** function. If this were not verified, a subtle bug involving reuse
8427           ** of the aBalanceQuickSpace[] might sneak in.
8428           */
8429           assert( balance_quick_called==0 );
8430           VVA_ONLY( balance_quick_called++ );
8431           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
8432         }else
8433 #endif
8434         {
8435           /* In this case, call balance_nonroot() to redistribute cells
8436           ** between pPage and up to 2 of its sibling pages. This involves
8437           ** modifying the contents of pParent, which may cause pParent to
8438           ** become overfull or underfull. The next iteration of the do-loop
8439           ** will balance the parent page to correct this.
8440           **
8441           ** If the parent page becomes overfull, the overflow cell or cells
8442           ** are stored in the pSpace buffer allocated immediately below.
8443           ** A subsequent iteration of the do-loop will deal with this by
8444           ** calling balance_nonroot() (balance_deeper() may be called first,
8445           ** but it doesn't deal with overflow cells - just moves them to a
8446           ** different page). Once this subsequent call to balance_nonroot()
8447           ** has completed, it is safe to release the pSpace buffer used by
8448           ** the previous call, as the overflow cell data will have been
8449           ** copied either into the body of a database page or into the new
8450           ** pSpace buffer passed to the latter call to balance_nonroot().
8451           */
8452           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
8453           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
8454                                pCur->hints&BTREE_BULKLOAD);
8455           if( pFree ){
8456             /* If pFree is not NULL, it points to the pSpace buffer used
8457             ** by a previous call to balance_nonroot(). Its contents are
8458             ** now stored either on real database pages or within the
8459             ** new pSpace buffer, so it may be safely freed here. */
8460             sqlite3PageFree(pFree);
8461           }
8462 
8463           /* The pSpace buffer will be freed after the next call to
8464           ** balance_nonroot(), or just before this function returns, whichever
8465           ** comes first. */
8466           pFree = pSpace;
8467         }
8468       }
8469 
8470       pPage->nOverflow = 0;
8471 
8472       /* The next iteration of the do-loop balances the parent page. */
8473       releasePage(pPage);
8474       pCur->iPage--;
8475       assert( pCur->iPage>=0 );
8476       pCur->pPage = pCur->apPage[pCur->iPage];
8477     }
8478   }while( rc==SQLITE_OK );
8479 
8480   if( pFree ){
8481     sqlite3PageFree(pFree);
8482   }
8483   return rc;
8484 }
8485 
8486 /* Overwrite content from pX into pDest.  Only do the write if the
8487 ** content is different from what is already there.
8488 */
8489 static int btreeOverwriteContent(
8490   MemPage *pPage,           /* MemPage on which writing will occur */
8491   u8 *pDest,                /* Pointer to the place to start writing */
8492   const BtreePayload *pX,   /* Source of data to write */
8493   int iOffset,              /* Offset of first byte to write */
8494   int iAmt                  /* Number of bytes to be written */
8495 ){
8496   int nData = pX->nData - iOffset;
8497   if( nData<=0 ){
8498     /* Overwritting with zeros */
8499     int i;
8500     for(i=0; i<iAmt && pDest[i]==0; i++){}
8501     if( i<iAmt ){
8502       int rc = sqlite3PagerWrite(pPage->pDbPage);
8503       if( rc ) return rc;
8504       memset(pDest + i, 0, iAmt - i);
8505     }
8506   }else{
8507     if( nData<iAmt ){
8508       /* Mixed read data and zeros at the end.  Make a recursive call
8509       ** to write the zeros then fall through to write the real data */
8510       int rc = btreeOverwriteContent(pPage, pDest+nData, pX, iOffset+nData,
8511                                  iAmt-nData);
8512       if( rc ) return rc;
8513       iAmt = nData;
8514     }
8515     if( memcmp(pDest, ((u8*)pX->pData) + iOffset, iAmt)!=0 ){
8516       int rc = sqlite3PagerWrite(pPage->pDbPage);
8517       if( rc ) return rc;
8518       /* In a corrupt database, it is possible for the source and destination
8519       ** buffers to overlap.  This is harmless since the database is already
8520       ** corrupt but it does cause valgrind and ASAN warnings.  So use
8521       ** memmove(). */
8522       memmove(pDest, ((u8*)pX->pData) + iOffset, iAmt);
8523     }
8524   }
8525   return SQLITE_OK;
8526 }
8527 
8528 /*
8529 ** Overwrite the cell that cursor pCur is pointing to with fresh content
8530 ** contained in pX.
8531 */
8532 static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){
8533   int iOffset;                        /* Next byte of pX->pData to write */
8534   int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */
8535   int rc;                             /* Return code */
8536   MemPage *pPage = pCur->pPage;       /* Page being written */
8537   BtShared *pBt;                      /* Btree */
8538   Pgno ovflPgno;                      /* Next overflow page to write */
8539   u32 ovflPageSize;                   /* Size to write on overflow page */
8540 
8541   if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd
8542    || pCur->info.pPayload < pPage->aData + pPage->cellOffset
8543   ){
8544     return SQLITE_CORRUPT_BKPT;
8545   }
8546   /* Overwrite the local portion first */
8547   rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX,
8548                              0, pCur->info.nLocal);
8549   if( rc ) return rc;
8550   if( pCur->info.nLocal==nTotal ) return SQLITE_OK;
8551 
8552   /* Now overwrite the overflow pages */
8553   iOffset = pCur->info.nLocal;
8554   assert( nTotal>=0 );
8555   assert( iOffset>=0 );
8556   ovflPgno = get4byte(pCur->info.pPayload + iOffset);
8557   pBt = pPage->pBt;
8558   ovflPageSize = pBt->usableSize - 4;
8559   do{
8560     rc = btreeGetPage(pBt, ovflPgno, &pPage, 0);
8561     if( rc ) return rc;
8562     if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 ){
8563       rc = SQLITE_CORRUPT_BKPT;
8564     }else{
8565       if( iOffset+ovflPageSize<(u32)nTotal ){
8566         ovflPgno = get4byte(pPage->aData);
8567       }else{
8568         ovflPageSize = nTotal - iOffset;
8569       }
8570       rc = btreeOverwriteContent(pPage, pPage->aData+4, pX,
8571                                  iOffset, ovflPageSize);
8572     }
8573     sqlite3PagerUnref(pPage->pDbPage);
8574     if( rc ) return rc;
8575     iOffset += ovflPageSize;
8576   }while( iOffset<nTotal );
8577   return SQLITE_OK;
8578 }
8579 
8580 
8581 /*
8582 ** Insert a new record into the BTree.  The content of the new record
8583 ** is described by the pX object.  The pCur cursor is used only to
8584 ** define what table the record should be inserted into, and is left
8585 ** pointing at a random location.
8586 **
8587 ** For a table btree (used for rowid tables), only the pX.nKey value of
8588 ** the key is used. The pX.pKey value must be NULL.  The pX.nKey is the
8589 ** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
8590 ** hold the content of the row.
8591 **
8592 ** For an index btree (used for indexes and WITHOUT ROWID tables), the
8593 ** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The
8594 ** pX.pData,nData,nZero fields must be zero.
8595 **
8596 ** If the seekResult parameter is non-zero, then a successful call to
8597 ** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
8598 ** been performed.  In other words, if seekResult!=0 then the cursor
8599 ** is currently pointing to a cell that will be adjacent to the cell
8600 ** to be inserted.  If seekResult<0 then pCur points to a cell that is
8601 ** smaller then (pKey,nKey).  If seekResult>0 then pCur points to a cell
8602 ** that is larger than (pKey,nKey).
8603 **
8604 ** If seekResult==0, that means pCur is pointing at some unknown location.
8605 ** In that case, this routine must seek the cursor to the correct insertion
8606 ** point for (pKey,nKey) before doing the insertion.  For index btrees,
8607 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
8608 ** key values and pX->aMem can be used instead of pX->pKey to avoid having
8609 ** to decode the key.
8610 */
8611 int sqlite3BtreeInsert(
8612   BtCursor *pCur,                /* Insert data into the table of this cursor */
8613   const BtreePayload *pX,        /* Content of the row to be inserted */
8614   int flags,                     /* True if this is likely an append */
8615   int seekResult                 /* Result of prior MovetoUnpacked() call */
8616 ){
8617   int rc;
8618   int loc = seekResult;          /* -1: before desired location  +1: after */
8619   int szNew = 0;
8620   int idx;
8621   MemPage *pPage;
8622   Btree *p = pCur->pBtree;
8623   BtShared *pBt = p->pBt;
8624   unsigned char *oldCell;
8625   unsigned char *newCell = 0;
8626 
8627   assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND))==flags );
8628 
8629   if( pCur->eState==CURSOR_FAULT ){
8630     assert( pCur->skipNext!=SQLITE_OK );
8631     return pCur->skipNext;
8632   }
8633 
8634   assert( cursorOwnsBtShared(pCur) );
8635   assert( (pCur->curFlags & BTCF_WriteFlag)!=0
8636               && pBt->inTransaction==TRANS_WRITE
8637               && (pBt->btsFlags & BTS_READ_ONLY)==0 );
8638   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
8639 
8640   /* Assert that the caller has been consistent. If this cursor was opened
8641   ** expecting an index b-tree, then the caller should be inserting blob
8642   ** keys with no associated data. If the cursor was opened expecting an
8643   ** intkey table, the caller should be inserting integer keys with a
8644   ** blob of associated data.  */
8645   assert( (pX->pKey==0)==(pCur->pKeyInfo==0) );
8646 
8647   /* Save the positions of any other cursors open on this table.
8648   **
8649   ** In some cases, the call to btreeMoveto() below is a no-op. For
8650   ** example, when inserting data into a table with auto-generated integer
8651   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
8652   ** integer key to use. It then calls this function to actually insert the
8653   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
8654   ** that the cursor is already where it needs to be and returns without
8655   ** doing any work. To avoid thwarting these optimizations, it is important
8656   ** not to clear the cursor here.
8657   */
8658   if( pCur->curFlags & BTCF_Multiple ){
8659     rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
8660     if( rc ) return rc;
8661   }
8662 
8663   if( pCur->pKeyInfo==0 ){
8664     assert( pX->pKey==0 );
8665     /* If this is an insert into a table b-tree, invalidate any incrblob
8666     ** cursors open on the row being replaced */
8667     invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0);
8668 
8669     /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
8670     ** to a row with the same key as the new entry being inserted.
8671     */
8672 #ifdef SQLITE_DEBUG
8673     if( flags & BTREE_SAVEPOSITION ){
8674       assert( pCur->curFlags & BTCF_ValidNKey );
8675       assert( pX->nKey==pCur->info.nKey );
8676       assert( loc==0 );
8677     }
8678 #endif
8679 
8680     /* On the other hand, BTREE_SAVEPOSITION==0 does not imply
8681     ** that the cursor is not pointing to a row to be overwritten.
8682     ** So do a complete check.
8683     */
8684     if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
8685       /* The cursor is pointing to the entry that is to be
8686       ** overwritten */
8687       assert( pX->nData>=0 && pX->nZero>=0 );
8688       if( pCur->info.nSize!=0
8689        && pCur->info.nPayload==(u32)pX->nData+pX->nZero
8690       ){
8691         /* New entry is the same size as the old.  Do an overwrite */
8692         return btreeOverwriteCell(pCur, pX);
8693       }
8694       assert( loc==0 );
8695     }else if( loc==0 ){
8696       /* The cursor is *not* pointing to the cell to be overwritten, nor
8697       ** to an adjacent cell.  Move the cursor so that it is pointing either
8698       ** to the cell to be overwritten or an adjacent cell.
8699       */
8700       rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc);
8701       if( rc ) return rc;
8702     }
8703   }else{
8704     /* This is an index or a WITHOUT ROWID table */
8705 
8706     /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
8707     ** to a row with the same key as the new entry being inserted.
8708     */
8709     assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 );
8710 
8711     /* If the cursor is not already pointing either to the cell to be
8712     ** overwritten, or if a new cell is being inserted, if the cursor is
8713     ** not pointing to an immediately adjacent cell, then move the cursor
8714     ** so that it does.
8715     */
8716     if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
8717       if( pX->nMem ){
8718         UnpackedRecord r;
8719         r.pKeyInfo = pCur->pKeyInfo;
8720         r.aMem = pX->aMem;
8721         r.nField = pX->nMem;
8722         r.default_rc = 0;
8723         r.errCode = 0;
8724         r.r1 = 0;
8725         r.r2 = 0;
8726         r.eqSeen = 0;
8727         rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc);
8728       }else{
8729         rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc);
8730       }
8731       if( rc ) return rc;
8732     }
8733 
8734     /* If the cursor is currently pointing to an entry to be overwritten
8735     ** and the new content is the same as as the old, then use the
8736     ** overwrite optimization.
8737     */
8738     if( loc==0 ){
8739       getCellInfo(pCur);
8740       if( pCur->info.nKey==pX->nKey ){
8741         BtreePayload x2;
8742         x2.pData = pX->pKey;
8743         x2.nData = pX->nKey;
8744         x2.nZero = 0;
8745         return btreeOverwriteCell(pCur, &x2);
8746       }
8747     }
8748 
8749   }
8750   assert( pCur->eState==CURSOR_VALID
8751        || (pCur->eState==CURSOR_INVALID && loc)
8752        || CORRUPT_DB );
8753 
8754   pPage = pCur->pPage;
8755   assert( pPage->intKey || pX->nKey>=0 );
8756   assert( pPage->leaf || !pPage->intKey );
8757   if( pPage->nFree<0 ){
8758     rc = btreeComputeFreeSpace(pPage);
8759     if( rc ) return rc;
8760   }
8761 
8762   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
8763           pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
8764           loc==0 ? "overwrite" : "new entry"));
8765   assert( pPage->isInit );
8766   newCell = pBt->pTmpSpace;
8767   assert( newCell!=0 );
8768   rc = fillInCell(pPage, newCell, pX, &szNew);
8769   if( rc ) goto end_insert;
8770   assert( szNew==pPage->xCellSize(pPage, newCell) );
8771   assert( szNew <= MX_CELL_SIZE(pBt) );
8772   idx = pCur->ix;
8773   if( loc==0 ){
8774     CellInfo info;
8775     assert( idx<pPage->nCell );
8776     rc = sqlite3PagerWrite(pPage->pDbPage);
8777     if( rc ){
8778       goto end_insert;
8779     }
8780     oldCell = findCell(pPage, idx);
8781     if( !pPage->leaf ){
8782       memcpy(newCell, oldCell, 4);
8783     }
8784     rc = clearCell(pPage, oldCell, &info);
8785     testcase( pCur->curFlags & BTCF_ValidOvfl );
8786     invalidateOverflowCache(pCur);
8787     if( info.nSize==szNew && info.nLocal==info.nPayload
8788      && (!ISAUTOVACUUM || szNew<pPage->minLocal)
8789     ){
8790       /* Overwrite the old cell with the new if they are the same size.
8791       ** We could also try to do this if the old cell is smaller, then add
8792       ** the leftover space to the free list.  But experiments show that
8793       ** doing that is no faster then skipping this optimization and just
8794       ** calling dropCell() and insertCell().
8795       **
8796       ** This optimization cannot be used on an autovacuum database if the
8797       ** new entry uses overflow pages, as the insertCell() call below is
8798       ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry.  */
8799       assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
8800       if( oldCell < pPage->aData+pPage->hdrOffset+10 ){
8801         return SQLITE_CORRUPT_BKPT;
8802       }
8803       if( oldCell+szNew > pPage->aDataEnd ){
8804         return SQLITE_CORRUPT_BKPT;
8805       }
8806       memcpy(oldCell, newCell, szNew);
8807       return SQLITE_OK;
8808     }
8809     dropCell(pPage, idx, info.nSize, &rc);
8810     if( rc ) goto end_insert;
8811   }else if( loc<0 && pPage->nCell>0 ){
8812     assert( pPage->leaf );
8813     idx = ++pCur->ix;
8814     pCur->curFlags &= ~BTCF_ValidNKey;
8815   }else{
8816     assert( pPage->leaf );
8817   }
8818   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
8819   assert( pPage->nOverflow==0 || rc==SQLITE_OK );
8820   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
8821 
8822   /* If no error has occurred and pPage has an overflow cell, call balance()
8823   ** to redistribute the cells within the tree. Since balance() may move
8824   ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
8825   ** variables.
8826   **
8827   ** Previous versions of SQLite called moveToRoot() to move the cursor
8828   ** back to the root page as balance() used to invalidate the contents
8829   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
8830   ** set the cursor state to "invalid". This makes common insert operations
8831   ** slightly faster.
8832   **
8833   ** There is a subtle but important optimization here too. When inserting
8834   ** multiple records into an intkey b-tree using a single cursor (as can
8835   ** happen while processing an "INSERT INTO ... SELECT" statement), it
8836   ** is advantageous to leave the cursor pointing to the last entry in
8837   ** the b-tree if possible. If the cursor is left pointing to the last
8838   ** entry in the table, and the next row inserted has an integer key
8839   ** larger than the largest existing key, it is possible to insert the
8840   ** row without seeking the cursor. This can be a big performance boost.
8841   */
8842   pCur->info.nSize = 0;
8843   if( pPage->nOverflow ){
8844     assert( rc==SQLITE_OK );
8845     pCur->curFlags &= ~(BTCF_ValidNKey);
8846     rc = balance(pCur);
8847 
8848     /* Must make sure nOverflow is reset to zero even if the balance()
8849     ** fails. Internal data structure corruption will result otherwise.
8850     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
8851     ** from trying to save the current position of the cursor.  */
8852     pCur->pPage->nOverflow = 0;
8853     pCur->eState = CURSOR_INVALID;
8854     if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
8855       btreeReleaseAllCursorPages(pCur);
8856       if( pCur->pKeyInfo ){
8857         assert( pCur->pKey==0 );
8858         pCur->pKey = sqlite3Malloc( pX->nKey );
8859         if( pCur->pKey==0 ){
8860           rc = SQLITE_NOMEM;
8861         }else{
8862           memcpy(pCur->pKey, pX->pKey, pX->nKey);
8863         }
8864       }
8865       pCur->eState = CURSOR_REQUIRESEEK;
8866       pCur->nKey = pX->nKey;
8867     }
8868   }
8869   assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 );
8870 
8871 end_insert:
8872   return rc;
8873 }
8874 
8875 /*
8876 ** Delete the entry that the cursor is pointing to.
8877 **
8878 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
8879 ** the cursor is left pointing at an arbitrary location after the delete.
8880 ** But if that bit is set, then the cursor is left in a state such that
8881 ** the next call to BtreeNext() or BtreePrev() moves it to the same row
8882 ** as it would have been on if the call to BtreeDelete() had been omitted.
8883 **
8884 ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes
8885 ** associated with a single table entry and its indexes.  Only one of those
8886 ** deletes is considered the "primary" delete.  The primary delete occurs
8887 ** on a cursor that is not a BTREE_FORDELETE cursor.  All but one delete
8888 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag.
8889 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation,
8890 ** but which might be used by alternative storage engines.
8891 */
8892 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
8893   Btree *p = pCur->pBtree;
8894   BtShared *pBt = p->pBt;
8895   int rc;                              /* Return code */
8896   MemPage *pPage;                      /* Page to delete cell from */
8897   unsigned char *pCell;                /* Pointer to cell to delete */
8898   int iCellIdx;                        /* Index of cell to delete */
8899   int iCellDepth;                      /* Depth of node containing pCell */
8900   CellInfo info;                       /* Size of the cell being deleted */
8901   int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
8902   u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */
8903 
8904   assert( cursorOwnsBtShared(pCur) );
8905   assert( pBt->inTransaction==TRANS_WRITE );
8906   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
8907   assert( pCur->curFlags & BTCF_WriteFlag );
8908   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
8909   assert( !hasReadConflicts(p, pCur->pgnoRoot) );
8910   assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );
8911   if( pCur->eState==CURSOR_REQUIRESEEK ){
8912     rc = btreeRestoreCursorPosition(pCur);
8913     if( rc ) return rc;
8914   }
8915   assert( pCur->eState==CURSOR_VALID );
8916 
8917   iCellDepth = pCur->iPage;
8918   iCellIdx = pCur->ix;
8919   pPage = pCur->pPage;
8920   pCell = findCell(pPage, iCellIdx);
8921   if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ) return SQLITE_CORRUPT;
8922 
8923   /* If the bPreserve flag is set to true, then the cursor position must
8924   ** be preserved following this delete operation. If the current delete
8925   ** will cause a b-tree rebalance, then this is done by saving the cursor
8926   ** key and leaving the cursor in CURSOR_REQUIRESEEK state before
8927   ** returning.
8928   **
8929   ** Or, if the current delete will not cause a rebalance, then the cursor
8930   ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
8931   ** before or after the deleted entry. In this case set bSkipnext to true.  */
8932   if( bPreserve ){
8933     if( !pPage->leaf
8934      || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
8935      || pPage->nCell==1  /* See dbfuzz001.test for a test case */
8936     ){
8937       /* A b-tree rebalance will be required after deleting this entry.
8938       ** Save the cursor key.  */
8939       rc = saveCursorKey(pCur);
8940       if( rc ) return rc;
8941     }else{
8942       bSkipnext = 1;
8943     }
8944   }
8945 
8946   /* If the page containing the entry to delete is not a leaf page, move
8947   ** the cursor to the largest entry in the tree that is smaller than
8948   ** the entry being deleted. This cell will replace the cell being deleted
8949   ** from the internal node. The 'previous' entry is used for this instead
8950   ** of the 'next' entry, as the previous entry is always a part of the
8951   ** sub-tree headed by the child page of the cell being deleted. This makes
8952   ** balancing the tree following the delete operation easier.  */
8953   if( !pPage->leaf ){
8954     rc = sqlite3BtreePrevious(pCur, 0);
8955     assert( rc!=SQLITE_DONE );
8956     if( rc ) return rc;
8957   }
8958 
8959   /* Save the positions of any other cursors open on this table before
8960   ** making any modifications.  */
8961   if( pCur->curFlags & BTCF_Multiple ){
8962     rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
8963     if( rc ) return rc;
8964   }
8965 
8966   /* If this is a delete operation to remove a row from a table b-tree,
8967   ** invalidate any incrblob cursors open on the row being deleted.  */
8968   if( pCur->pKeyInfo==0 ){
8969     invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0);
8970   }
8971 
8972   /* Make the page containing the entry to be deleted writable. Then free any
8973   ** overflow pages associated with the entry and finally remove the cell
8974   ** itself from within the page.  */
8975   rc = sqlite3PagerWrite(pPage->pDbPage);
8976   if( rc ) return rc;
8977   rc = clearCell(pPage, pCell, &info);
8978   dropCell(pPage, iCellIdx, info.nSize, &rc);
8979   if( rc ) return rc;
8980 
8981   /* If the cell deleted was not located on a leaf page, then the cursor
8982   ** is currently pointing to the largest entry in the sub-tree headed
8983   ** by the child-page of the cell that was just deleted from an internal
8984   ** node. The cell from the leaf node needs to be moved to the internal
8985   ** node to replace the deleted cell.  */
8986   if( !pPage->leaf ){
8987     MemPage *pLeaf = pCur->pPage;
8988     int nCell;
8989     Pgno n;
8990     unsigned char *pTmp;
8991 
8992     if( pLeaf->nFree<0 ){
8993       rc = btreeComputeFreeSpace(pLeaf);
8994       if( rc ) return rc;
8995     }
8996     if( iCellDepth<pCur->iPage-1 ){
8997       n = pCur->apPage[iCellDepth+1]->pgno;
8998     }else{
8999       n = pCur->pPage->pgno;
9000     }
9001     pCell = findCell(pLeaf, pLeaf->nCell-1);
9002     if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
9003     nCell = pLeaf->xCellSize(pLeaf, pCell);
9004     assert( MX_CELL_SIZE(pBt) >= nCell );
9005     pTmp = pBt->pTmpSpace;
9006     assert( pTmp!=0 );
9007     rc = sqlite3PagerWrite(pLeaf->pDbPage);
9008     if( rc==SQLITE_OK ){
9009       insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
9010     }
9011     dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
9012     if( rc ) return rc;
9013   }
9014 
9015   /* Balance the tree. If the entry deleted was located on a leaf page,
9016   ** then the cursor still points to that page. In this case the first
9017   ** call to balance() repairs the tree, and the if(...) condition is
9018   ** never true.
9019   **
9020   ** Otherwise, if the entry deleted was on an internal node page, then
9021   ** pCur is pointing to the leaf page from which a cell was removed to
9022   ** replace the cell deleted from the internal node. This is slightly
9023   ** tricky as the leaf node may be underfull, and the internal node may
9024   ** be either under or overfull. In this case run the balancing algorithm
9025   ** on the leaf node first. If the balance proceeds far enough up the
9026   ** tree that we can be sure that any problem in the internal node has
9027   ** been corrected, so be it. Otherwise, after balancing the leaf node,
9028   ** walk the cursor up the tree to the internal node and balance it as
9029   ** well.  */
9030   rc = balance(pCur);
9031   if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
9032     releasePageNotNull(pCur->pPage);
9033     pCur->iPage--;
9034     while( pCur->iPage>iCellDepth ){
9035       releasePage(pCur->apPage[pCur->iPage--]);
9036     }
9037     pCur->pPage = pCur->apPage[pCur->iPage];
9038     rc = balance(pCur);
9039   }
9040 
9041   if( rc==SQLITE_OK ){
9042     if( bSkipnext ){
9043       assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
9044       assert( pPage==pCur->pPage || CORRUPT_DB );
9045       assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
9046       pCur->eState = CURSOR_SKIPNEXT;
9047       if( iCellIdx>=pPage->nCell ){
9048         pCur->skipNext = -1;
9049         pCur->ix = pPage->nCell-1;
9050       }else{
9051         pCur->skipNext = 1;
9052       }
9053     }else{
9054       rc = moveToRoot(pCur);
9055       if( bPreserve ){
9056         btreeReleaseAllCursorPages(pCur);
9057         pCur->eState = CURSOR_REQUIRESEEK;
9058       }
9059       if( rc==SQLITE_EMPTY ) rc = SQLITE_OK;
9060     }
9061   }
9062   return rc;
9063 }
9064 
9065 /*
9066 ** Create a new BTree table.  Write into *piTable the page
9067 ** number for the root page of the new table.
9068 **
9069 ** The type of type is determined by the flags parameter.  Only the
9070 ** following values of flags are currently in use.  Other values for
9071 ** flags might not work:
9072 **
9073 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
9074 **     BTREE_ZERODATA                  Used for SQL indices
9075 */
9076 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
9077   BtShared *pBt = p->pBt;
9078   MemPage *pRoot;
9079   Pgno pgnoRoot;
9080   int rc;
9081   int ptfFlags;          /* Page-type flage for the root page of new table */
9082 
9083   assert( sqlite3BtreeHoldsMutex(p) );
9084   assert( pBt->inTransaction==TRANS_WRITE );
9085   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
9086 
9087 #ifdef SQLITE_OMIT_AUTOVACUUM
9088   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
9089   if( rc ){
9090     return rc;
9091   }
9092 #else
9093   if( pBt->autoVacuum ){
9094     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
9095     MemPage *pPageMove; /* The page to move to. */
9096 
9097     /* Creating a new table may probably require moving an existing database
9098     ** to make room for the new tables root page. In case this page turns
9099     ** out to be an overflow page, delete all overflow page-map caches
9100     ** held by open cursors.
9101     */
9102     invalidateAllOverflowCache(pBt);
9103 
9104     /* Read the value of meta[3] from the database to determine where the
9105     ** root page of the new table should go. meta[3] is the largest root-page
9106     ** created so far, so the new root-page is (meta[3]+1).
9107     */
9108     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
9109     pgnoRoot++;
9110 
9111     /* The new root-page may not be allocated on a pointer-map page, or the
9112     ** PENDING_BYTE page.
9113     */
9114     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
9115         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
9116       pgnoRoot++;
9117     }
9118     assert( pgnoRoot>=3 || CORRUPT_DB );
9119     testcase( pgnoRoot<3 );
9120 
9121     /* Allocate a page. The page that currently resides at pgnoRoot will
9122     ** be moved to the allocated page (unless the allocated page happens
9123     ** to reside at pgnoRoot).
9124     */
9125     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
9126     if( rc!=SQLITE_OK ){
9127       return rc;
9128     }
9129 
9130     if( pgnoMove!=pgnoRoot ){
9131       /* pgnoRoot is the page that will be used for the root-page of
9132       ** the new table (assuming an error did not occur). But we were
9133       ** allocated pgnoMove. If required (i.e. if it was not allocated
9134       ** by extending the file), the current page at position pgnoMove
9135       ** is already journaled.
9136       */
9137       u8 eType = 0;
9138       Pgno iPtrPage = 0;
9139 
9140       /* Save the positions of any open cursors. This is required in
9141       ** case they are holding a reference to an xFetch reference
9142       ** corresponding to page pgnoRoot.  */
9143       rc = saveAllCursors(pBt, 0, 0);
9144       releasePage(pPageMove);
9145       if( rc!=SQLITE_OK ){
9146         return rc;
9147       }
9148 
9149       /* Move the page currently at pgnoRoot to pgnoMove. */
9150       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
9151       if( rc!=SQLITE_OK ){
9152         return rc;
9153       }
9154       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
9155       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
9156         rc = SQLITE_CORRUPT_BKPT;
9157       }
9158       if( rc!=SQLITE_OK ){
9159         releasePage(pRoot);
9160         return rc;
9161       }
9162       assert( eType!=PTRMAP_ROOTPAGE );
9163       assert( eType!=PTRMAP_FREEPAGE );
9164       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
9165       releasePage(pRoot);
9166 
9167       /* Obtain the page at pgnoRoot */
9168       if( rc!=SQLITE_OK ){
9169         return rc;
9170       }
9171       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
9172       if( rc!=SQLITE_OK ){
9173         return rc;
9174       }
9175       rc = sqlite3PagerWrite(pRoot->pDbPage);
9176       if( rc!=SQLITE_OK ){
9177         releasePage(pRoot);
9178         return rc;
9179       }
9180     }else{
9181       pRoot = pPageMove;
9182     }
9183 
9184     /* Update the pointer-map and meta-data with the new root-page number. */
9185     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
9186     if( rc ){
9187       releasePage(pRoot);
9188       return rc;
9189     }
9190 
9191     /* When the new root page was allocated, page 1 was made writable in
9192     ** order either to increase the database filesize, or to decrement the
9193     ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
9194     */
9195     assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
9196     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
9197     if( NEVER(rc) ){
9198       releasePage(pRoot);
9199       return rc;
9200     }
9201 
9202   }else{
9203     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
9204     if( rc ) return rc;
9205   }
9206 #endif
9207   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
9208   if( createTabFlags & BTREE_INTKEY ){
9209     ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
9210   }else{
9211     ptfFlags = PTF_ZERODATA | PTF_LEAF;
9212   }
9213   zeroPage(pRoot, ptfFlags);
9214   sqlite3PagerUnref(pRoot->pDbPage);
9215   assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
9216   *piTable = (int)pgnoRoot;
9217   return SQLITE_OK;
9218 }
9219 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
9220   int rc;
9221   sqlite3BtreeEnter(p);
9222   rc = btreeCreateTable(p, piTable, flags);
9223   sqlite3BtreeLeave(p);
9224   return rc;
9225 }
9226 
9227 /*
9228 ** Erase the given database page and all its children.  Return
9229 ** the page to the freelist.
9230 */
9231 static int clearDatabasePage(
9232   BtShared *pBt,           /* The BTree that contains the table */
9233   Pgno pgno,               /* Page number to clear */
9234   int freePageFlag,        /* Deallocate page if true */
9235   int *pnChange            /* Add number of Cells freed to this counter */
9236 ){
9237   MemPage *pPage;
9238   int rc;
9239   unsigned char *pCell;
9240   int i;
9241   int hdr;
9242   CellInfo info;
9243 
9244   assert( sqlite3_mutex_held(pBt->mutex) );
9245   if( pgno>btreePagecount(pBt) ){
9246     return SQLITE_CORRUPT_BKPT;
9247   }
9248   rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
9249   if( rc ) return rc;
9250   if( pPage->bBusy ){
9251     rc = SQLITE_CORRUPT_BKPT;
9252     goto cleardatabasepage_out;
9253   }
9254   pPage->bBusy = 1;
9255   hdr = pPage->hdrOffset;
9256   for(i=0; i<pPage->nCell; i++){
9257     pCell = findCell(pPage, i);
9258     if( !pPage->leaf ){
9259       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
9260       if( rc ) goto cleardatabasepage_out;
9261     }
9262     rc = clearCell(pPage, pCell, &info);
9263     if( rc ) goto cleardatabasepage_out;
9264   }
9265   if( !pPage->leaf ){
9266     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
9267     if( rc ) goto cleardatabasepage_out;
9268   }else if( pnChange ){
9269     assert( pPage->intKey || CORRUPT_DB );
9270     testcase( !pPage->intKey );
9271     *pnChange += pPage->nCell;
9272   }
9273   if( freePageFlag ){
9274     freePage(pPage, &rc);
9275   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
9276     zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
9277   }
9278 
9279 cleardatabasepage_out:
9280   pPage->bBusy = 0;
9281   releasePage(pPage);
9282   return rc;
9283 }
9284 
9285 /*
9286 ** Delete all information from a single table in the database.  iTable is
9287 ** the page number of the root of the table.  After this routine returns,
9288 ** the root page is empty, but still exists.
9289 **
9290 ** This routine will fail with SQLITE_LOCKED if there are any open
9291 ** read cursors on the table.  Open write cursors are moved to the
9292 ** root of the table.
9293 **
9294 ** If pnChange is not NULL, then table iTable must be an intkey table. The
9295 ** integer value pointed to by pnChange is incremented by the number of
9296 ** entries in the table.
9297 */
9298 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
9299   int rc;
9300   BtShared *pBt = p->pBt;
9301   sqlite3BtreeEnter(p);
9302   assert( p->inTrans==TRANS_WRITE );
9303 
9304   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
9305 
9306   if( SQLITE_OK==rc ){
9307     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
9308     ** is the root of a table b-tree - if it is not, the following call is
9309     ** a no-op).  */
9310     invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1);
9311     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
9312   }
9313   sqlite3BtreeLeave(p);
9314   return rc;
9315 }
9316 
9317 /*
9318 ** Delete all information from the single table that pCur is open on.
9319 **
9320 ** This routine only work for pCur on an ephemeral table.
9321 */
9322 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
9323   return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
9324 }
9325 
9326 /*
9327 ** Erase all information in a table and add the root of the table to
9328 ** the freelist.  Except, the root of the principle table (the one on
9329 ** page 1) is never added to the freelist.
9330 **
9331 ** This routine will fail with SQLITE_LOCKED if there are any open
9332 ** cursors on the table.
9333 **
9334 ** If AUTOVACUUM is enabled and the page at iTable is not the last
9335 ** root page in the database file, then the last root page
9336 ** in the database file is moved into the slot formerly occupied by
9337 ** iTable and that last slot formerly occupied by the last root page
9338 ** is added to the freelist instead of iTable.  In this say, all
9339 ** root pages are kept at the beginning of the database file, which
9340 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
9341 ** page number that used to be the last root page in the file before
9342 ** the move.  If no page gets moved, *piMoved is set to 0.
9343 ** The last root page is recorded in meta[3] and the value of
9344 ** meta[3] is updated by this procedure.
9345 */
9346 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
9347   int rc;
9348   MemPage *pPage = 0;
9349   BtShared *pBt = p->pBt;
9350 
9351   assert( sqlite3BtreeHoldsMutex(p) );
9352   assert( p->inTrans==TRANS_WRITE );
9353   assert( iTable>=2 );
9354   if( iTable>btreePagecount(pBt) ){
9355     return SQLITE_CORRUPT_BKPT;
9356   }
9357 
9358   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
9359   if( rc ) return rc;
9360   rc = sqlite3BtreeClearTable(p, iTable, 0);
9361   if( rc ){
9362     releasePage(pPage);
9363     return rc;
9364   }
9365 
9366   *piMoved = 0;
9367 
9368 #ifdef SQLITE_OMIT_AUTOVACUUM
9369   freePage(pPage, &rc);
9370   releasePage(pPage);
9371 #else
9372   if( pBt->autoVacuum ){
9373     Pgno maxRootPgno;
9374     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
9375 
9376     if( iTable==maxRootPgno ){
9377       /* If the table being dropped is the table with the largest root-page
9378       ** number in the database, put the root page on the free list.
9379       */
9380       freePage(pPage, &rc);
9381       releasePage(pPage);
9382       if( rc!=SQLITE_OK ){
9383         return rc;
9384       }
9385     }else{
9386       /* The table being dropped does not have the largest root-page
9387       ** number in the database. So move the page that does into the
9388       ** gap left by the deleted root-page.
9389       */
9390       MemPage *pMove;
9391       releasePage(pPage);
9392       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
9393       if( rc!=SQLITE_OK ){
9394         return rc;
9395       }
9396       rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
9397       releasePage(pMove);
9398       if( rc!=SQLITE_OK ){
9399         return rc;
9400       }
9401       pMove = 0;
9402       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
9403       freePage(pMove, &rc);
9404       releasePage(pMove);
9405       if( rc!=SQLITE_OK ){
9406         return rc;
9407       }
9408       *piMoved = maxRootPgno;
9409     }
9410 
9411     /* Set the new 'max-root-page' value in the database header. This
9412     ** is the old value less one, less one more if that happens to
9413     ** be a root-page number, less one again if that is the
9414     ** PENDING_BYTE_PAGE.
9415     */
9416     maxRootPgno--;
9417     while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
9418            || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
9419       maxRootPgno--;
9420     }
9421     assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
9422 
9423     rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
9424   }else{
9425     freePage(pPage, &rc);
9426     releasePage(pPage);
9427   }
9428 #endif
9429   return rc;
9430 }
9431 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
9432   int rc;
9433   sqlite3BtreeEnter(p);
9434   rc = btreeDropTable(p, iTable, piMoved);
9435   sqlite3BtreeLeave(p);
9436   return rc;
9437 }
9438 
9439 
9440 /*
9441 ** This function may only be called if the b-tree connection already
9442 ** has a read or write transaction open on the database.
9443 **
9444 ** Read the meta-information out of a database file.  Meta[0]
9445 ** is the number of free pages currently in the database.  Meta[1]
9446 ** through meta[15] are available for use by higher layers.  Meta[0]
9447 ** is read-only, the others are read/write.
9448 **
9449 ** The schema layer numbers meta values differently.  At the schema
9450 ** layer (and the SetCookie and ReadCookie opcodes) the number of
9451 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
9452 **
9453 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
9454 ** of reading the value out of the header, it instead loads the "DataVersion"
9455 ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
9456 ** database file.  It is a number computed by the pager.  But its access
9457 ** pattern is the same as header meta values, and so it is convenient to
9458 ** read it from this routine.
9459 */
9460 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
9461   BtShared *pBt = p->pBt;
9462 
9463   sqlite3BtreeEnter(p);
9464   assert( p->inTrans>TRANS_NONE );
9465   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
9466   assert( pBt->pPage1 );
9467   assert( idx>=0 && idx<=15 );
9468 
9469   if( idx==BTREE_DATA_VERSION ){
9470     *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
9471   }else{
9472     *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
9473   }
9474 
9475   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
9476   ** database, mark the database as read-only.  */
9477 #ifdef SQLITE_OMIT_AUTOVACUUM
9478   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
9479     pBt->btsFlags |= BTS_READ_ONLY;
9480   }
9481 #endif
9482 
9483   sqlite3BtreeLeave(p);
9484 }
9485 
9486 /*
9487 ** Write meta-information back into the database.  Meta[0] is
9488 ** read-only and may not be written.
9489 */
9490 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
9491   BtShared *pBt = p->pBt;
9492   unsigned char *pP1;
9493   int rc;
9494   assert( idx>=1 && idx<=15 );
9495   sqlite3BtreeEnter(p);
9496   assert( p->inTrans==TRANS_WRITE );
9497   assert( pBt->pPage1!=0 );
9498   pP1 = pBt->pPage1->aData;
9499   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
9500   if( rc==SQLITE_OK ){
9501     put4byte(&pP1[36 + idx*4], iMeta);
9502 #ifndef SQLITE_OMIT_AUTOVACUUM
9503     if( idx==BTREE_INCR_VACUUM ){
9504       assert( pBt->autoVacuum || iMeta==0 );
9505       assert( iMeta==0 || iMeta==1 );
9506       pBt->incrVacuum = (u8)iMeta;
9507     }
9508 #endif
9509   }
9510   sqlite3BtreeLeave(p);
9511   return rc;
9512 }
9513 
9514 #ifndef SQLITE_OMIT_BTREECOUNT
9515 /*
9516 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
9517 ** number of entries in the b-tree and write the result to *pnEntry.
9518 **
9519 ** SQLITE_OK is returned if the operation is successfully executed.
9520 ** Otherwise, if an error is encountered (i.e. an IO error or database
9521 ** corruption) an SQLite error code is returned.
9522 */
9523 int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){
9524   i64 nEntry = 0;                      /* Value to return in *pnEntry */
9525   int rc;                              /* Return code */
9526 
9527   rc = moveToRoot(pCur);
9528   if( rc==SQLITE_EMPTY ){
9529     *pnEntry = 0;
9530     return SQLITE_OK;
9531   }
9532 
9533   /* Unless an error occurs, the following loop runs one iteration for each
9534   ** page in the B-Tree structure (not including overflow pages).
9535   */
9536   while( rc==SQLITE_OK && !db->u1.isInterrupted ){
9537     int iIdx;                          /* Index of child node in parent */
9538     MemPage *pPage;                    /* Current page of the b-tree */
9539 
9540     /* If this is a leaf page or the tree is not an int-key tree, then
9541     ** this page contains countable entries. Increment the entry counter
9542     ** accordingly.
9543     */
9544     pPage = pCur->pPage;
9545     if( pPage->leaf || !pPage->intKey ){
9546       nEntry += pPage->nCell;
9547     }
9548 
9549     /* pPage is a leaf node. This loop navigates the cursor so that it
9550     ** points to the first interior cell that it points to the parent of
9551     ** the next page in the tree that has not yet been visited. The
9552     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
9553     ** of the page, or to the number of cells in the page if the next page
9554     ** to visit is the right-child of its parent.
9555     **
9556     ** If all pages in the tree have been visited, return SQLITE_OK to the
9557     ** caller.
9558     */
9559     if( pPage->leaf ){
9560       do {
9561         if( pCur->iPage==0 ){
9562           /* All pages of the b-tree have been visited. Return successfully. */
9563           *pnEntry = nEntry;
9564           return moveToRoot(pCur);
9565         }
9566         moveToParent(pCur);
9567       }while ( pCur->ix>=pCur->pPage->nCell );
9568 
9569       pCur->ix++;
9570       pPage = pCur->pPage;
9571     }
9572 
9573     /* Descend to the child node of the cell that the cursor currently
9574     ** points at. This is the right-child if (iIdx==pPage->nCell).
9575     */
9576     iIdx = pCur->ix;
9577     if( iIdx==pPage->nCell ){
9578       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
9579     }else{
9580       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
9581     }
9582   }
9583 
9584   /* An error has occurred. Return an error code. */
9585   return rc;
9586 }
9587 #endif
9588 
9589 /*
9590 ** Return the pager associated with a BTree.  This routine is used for
9591 ** testing and debugging only.
9592 */
9593 Pager *sqlite3BtreePager(Btree *p){
9594   return p->pBt->pPager;
9595 }
9596 
9597 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9598 /*
9599 ** Append a message to the error message string.
9600 */
9601 static void checkAppendMsg(
9602   IntegrityCk *pCheck,
9603   const char *zFormat,
9604   ...
9605 ){
9606   va_list ap;
9607   if( !pCheck->mxErr ) return;
9608   pCheck->mxErr--;
9609   pCheck->nErr++;
9610   va_start(ap, zFormat);
9611   if( pCheck->errMsg.nChar ){
9612     sqlite3_str_append(&pCheck->errMsg, "\n", 1);
9613   }
9614   if( pCheck->zPfx ){
9615     sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
9616   }
9617   sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap);
9618   va_end(ap);
9619   if( pCheck->errMsg.accError==SQLITE_NOMEM ){
9620     pCheck->mallocFailed = 1;
9621   }
9622 }
9623 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9624 
9625 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9626 
9627 /*
9628 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
9629 ** corresponds to page iPg is already set.
9630 */
9631 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
9632   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
9633   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
9634 }
9635 
9636 /*
9637 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
9638 */
9639 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
9640   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
9641   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
9642 }
9643 
9644 
9645 /*
9646 ** Add 1 to the reference count for page iPage.  If this is the second
9647 ** reference to the page, add an error message to pCheck->zErrMsg.
9648 ** Return 1 if there are 2 or more references to the page and 0 if
9649 ** if this is the first reference to the page.
9650 **
9651 ** Also check that the page number is in bounds.
9652 */
9653 static int checkRef(IntegrityCk *pCheck, Pgno iPage){
9654   if( iPage>pCheck->nPage || iPage==0 ){
9655     checkAppendMsg(pCheck, "invalid page number %d", iPage);
9656     return 1;
9657   }
9658   if( getPageReferenced(pCheck, iPage) ){
9659     checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
9660     return 1;
9661   }
9662   if( pCheck->db->u1.isInterrupted ) return 1;
9663   setPageReferenced(pCheck, iPage);
9664   return 0;
9665 }
9666 
9667 #ifndef SQLITE_OMIT_AUTOVACUUM
9668 /*
9669 ** Check that the entry in the pointer-map for page iChild maps to
9670 ** page iParent, pointer type ptrType. If not, append an error message
9671 ** to pCheck.
9672 */
9673 static void checkPtrmap(
9674   IntegrityCk *pCheck,   /* Integrity check context */
9675   Pgno iChild,           /* Child page number */
9676   u8 eType,              /* Expected pointer map type */
9677   Pgno iParent           /* Expected pointer map parent page number */
9678 ){
9679   int rc;
9680   u8 ePtrmapType;
9681   Pgno iPtrmapParent;
9682 
9683   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
9684   if( rc!=SQLITE_OK ){
9685     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
9686     checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
9687     return;
9688   }
9689 
9690   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
9691     checkAppendMsg(pCheck,
9692       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
9693       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
9694   }
9695 }
9696 #endif
9697 
9698 /*
9699 ** Check the integrity of the freelist or of an overflow page list.
9700 ** Verify that the number of pages on the list is N.
9701 */
9702 static void checkList(
9703   IntegrityCk *pCheck,  /* Integrity checking context */
9704   int isFreeList,       /* True for a freelist.  False for overflow page list */
9705   int iPage,            /* Page number for first page in the list */
9706   u32 N                 /* Expected number of pages in the list */
9707 ){
9708   int i;
9709   u32 expected = N;
9710   int nErrAtStart = pCheck->nErr;
9711   while( iPage!=0 && pCheck->mxErr ){
9712     DbPage *pOvflPage;
9713     unsigned char *pOvflData;
9714     if( checkRef(pCheck, iPage) ) break;
9715     N--;
9716     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
9717       checkAppendMsg(pCheck, "failed to get page %d", iPage);
9718       break;
9719     }
9720     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
9721     if( isFreeList ){
9722       u32 n = (u32)get4byte(&pOvflData[4]);
9723 #ifndef SQLITE_OMIT_AUTOVACUUM
9724       if( pCheck->pBt->autoVacuum ){
9725         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
9726       }
9727 #endif
9728       if( n>pCheck->pBt->usableSize/4-2 ){
9729         checkAppendMsg(pCheck,
9730            "freelist leaf count too big on page %d", iPage);
9731         N--;
9732       }else{
9733         for(i=0; i<(int)n; i++){
9734           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
9735 #ifndef SQLITE_OMIT_AUTOVACUUM
9736           if( pCheck->pBt->autoVacuum ){
9737             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
9738           }
9739 #endif
9740           checkRef(pCheck, iFreePage);
9741         }
9742         N -= n;
9743       }
9744     }
9745 #ifndef SQLITE_OMIT_AUTOVACUUM
9746     else{
9747       /* If this database supports auto-vacuum and iPage is not the last
9748       ** page in this overflow list, check that the pointer-map entry for
9749       ** the following page matches iPage.
9750       */
9751       if( pCheck->pBt->autoVacuum && N>0 ){
9752         i = get4byte(pOvflData);
9753         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
9754       }
9755     }
9756 #endif
9757     iPage = get4byte(pOvflData);
9758     sqlite3PagerUnref(pOvflPage);
9759   }
9760   if( N && nErrAtStart==pCheck->nErr ){
9761     checkAppendMsg(pCheck,
9762       "%s is %d but should be %d",
9763       isFreeList ? "size" : "overflow list length",
9764       expected-N, expected);
9765   }
9766 }
9767 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9768 
9769 /*
9770 ** An implementation of a min-heap.
9771 **
9772 ** aHeap[0] is the number of elements on the heap.  aHeap[1] is the
9773 ** root element.  The daughter nodes of aHeap[N] are aHeap[N*2]
9774 ** and aHeap[N*2+1].
9775 **
9776 ** The heap property is this:  Every node is less than or equal to both
9777 ** of its daughter nodes.  A consequence of the heap property is that the
9778 ** root node aHeap[1] is always the minimum value currently in the heap.
9779 **
9780 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto
9781 ** the heap, preserving the heap property.  The btreeHeapPull() routine
9782 ** removes the root element from the heap (the minimum value in the heap)
9783 ** and then moves other nodes around as necessary to preserve the heap
9784 ** property.
9785 **
9786 ** This heap is used for cell overlap and coverage testing.  Each u32
9787 ** entry represents the span of a cell or freeblock on a btree page.
9788 ** The upper 16 bits are the index of the first byte of a range and the
9789 ** lower 16 bits are the index of the last byte of that range.
9790 */
9791 static void btreeHeapInsert(u32 *aHeap, u32 x){
9792   u32 j, i = ++aHeap[0];
9793   aHeap[i] = x;
9794   while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){
9795     x = aHeap[j];
9796     aHeap[j] = aHeap[i];
9797     aHeap[i] = x;
9798     i = j;
9799   }
9800 }
9801 static int btreeHeapPull(u32 *aHeap, u32 *pOut){
9802   u32 j, i, x;
9803   if( (x = aHeap[0])==0 ) return 0;
9804   *pOut = aHeap[1];
9805   aHeap[1] = aHeap[x];
9806   aHeap[x] = 0xffffffff;
9807   aHeap[0]--;
9808   i = 1;
9809   while( (j = i*2)<=aHeap[0] ){
9810     if( aHeap[j]>aHeap[j+1] ) j++;
9811     if( aHeap[i]<aHeap[j] ) break;
9812     x = aHeap[i];
9813     aHeap[i] = aHeap[j];
9814     aHeap[j] = x;
9815     i = j;
9816   }
9817   return 1;
9818 }
9819 
9820 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9821 /*
9822 ** Do various sanity checks on a single page of a tree.  Return
9823 ** the tree depth.  Root pages return 0.  Parents of root pages
9824 ** return 1, and so forth.
9825 **
9826 ** These checks are done:
9827 **
9828 **      1.  Make sure that cells and freeblocks do not overlap
9829 **          but combine to completely cover the page.
9830 **      2.  Make sure integer cell keys are in order.
9831 **      3.  Check the integrity of overflow pages.
9832 **      4.  Recursively call checkTreePage on all children.
9833 **      5.  Verify that the depth of all children is the same.
9834 */
9835 static int checkTreePage(
9836   IntegrityCk *pCheck,  /* Context for the sanity check */
9837   int iPage,            /* Page number of the page to check */
9838   i64 *piMinKey,        /* Write minimum integer primary key here */
9839   i64 maxKey            /* Error if integer primary key greater than this */
9840 ){
9841   MemPage *pPage = 0;      /* The page being analyzed */
9842   int i;                   /* Loop counter */
9843   int rc;                  /* Result code from subroutine call */
9844   int depth = -1, d2;      /* Depth of a subtree */
9845   int pgno;                /* Page number */
9846   int nFrag;               /* Number of fragmented bytes on the page */
9847   int hdr;                 /* Offset to the page header */
9848   int cellStart;           /* Offset to the start of the cell pointer array */
9849   int nCell;               /* Number of cells */
9850   int doCoverageCheck = 1; /* True if cell coverage checking should be done */
9851   int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
9852                            ** False if IPK must be strictly less than maxKey */
9853   u8 *data;                /* Page content */
9854   u8 *pCell;               /* Cell content */
9855   u8 *pCellIdx;            /* Next element of the cell pointer array */
9856   BtShared *pBt;           /* The BtShared object that owns pPage */
9857   u32 pc;                  /* Address of a cell */
9858   u32 usableSize;          /* Usable size of the page */
9859   u32 contentOffset;       /* Offset to the start of the cell content area */
9860   u32 *heap = 0;           /* Min-heap used for checking cell coverage */
9861   u32 x, prev = 0;         /* Next and previous entry on the min-heap */
9862   const char *saved_zPfx = pCheck->zPfx;
9863   int saved_v1 = pCheck->v1;
9864   int saved_v2 = pCheck->v2;
9865   u8 savedIsInit = 0;
9866 
9867   /* Check that the page exists
9868   */
9869   pBt = pCheck->pBt;
9870   usableSize = pBt->usableSize;
9871   if( iPage==0 ) return 0;
9872   if( checkRef(pCheck, iPage) ) return 0;
9873   pCheck->zPfx = "Page %d: ";
9874   pCheck->v1 = iPage;
9875   if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
9876     checkAppendMsg(pCheck,
9877        "unable to get the page. error code=%d", rc);
9878     goto end_of_check;
9879   }
9880 
9881   /* Clear MemPage.isInit to make sure the corruption detection code in
9882   ** btreeInitPage() is executed.  */
9883   savedIsInit = pPage->isInit;
9884   pPage->isInit = 0;
9885   if( (rc = btreeInitPage(pPage))!=0 ){
9886     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
9887     checkAppendMsg(pCheck,
9888                    "btreeInitPage() returns error code %d", rc);
9889     goto end_of_check;
9890   }
9891   if( (rc = btreeComputeFreeSpace(pPage))!=0 ){
9892     assert( rc==SQLITE_CORRUPT );
9893     checkAppendMsg(pCheck, "free space corruption", rc);
9894     goto end_of_check;
9895   }
9896   data = pPage->aData;
9897   hdr = pPage->hdrOffset;
9898 
9899   /* Set up for cell analysis */
9900   pCheck->zPfx = "On tree page %d cell %d: ";
9901   contentOffset = get2byteNotZero(&data[hdr+5]);
9902   assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
9903 
9904   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
9905   ** number of cells on the page. */
9906   nCell = get2byte(&data[hdr+3]);
9907   assert( pPage->nCell==nCell );
9908 
9909   /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
9910   ** immediately follows the b-tree page header. */
9911   cellStart = hdr + 12 - 4*pPage->leaf;
9912   assert( pPage->aCellIdx==&data[cellStart] );
9913   pCellIdx = &data[cellStart + 2*(nCell-1)];
9914 
9915   if( !pPage->leaf ){
9916     /* Analyze the right-child page of internal pages */
9917     pgno = get4byte(&data[hdr+8]);
9918 #ifndef SQLITE_OMIT_AUTOVACUUM
9919     if( pBt->autoVacuum ){
9920       pCheck->zPfx = "On page %d at right child: ";
9921       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
9922     }
9923 #endif
9924     depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
9925     keyCanBeEqual = 0;
9926   }else{
9927     /* For leaf pages, the coverage check will occur in the same loop
9928     ** as the other cell checks, so initialize the heap.  */
9929     heap = pCheck->heap;
9930     heap[0] = 0;
9931   }
9932 
9933   /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
9934   ** integer offsets to the cell contents. */
9935   for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
9936     CellInfo info;
9937 
9938     /* Check cell size */
9939     pCheck->v2 = i;
9940     assert( pCellIdx==&data[cellStart + i*2] );
9941     pc = get2byteAligned(pCellIdx);
9942     pCellIdx -= 2;
9943     if( pc<contentOffset || pc>usableSize-4 ){
9944       checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
9945                              pc, contentOffset, usableSize-4);
9946       doCoverageCheck = 0;
9947       continue;
9948     }
9949     pCell = &data[pc];
9950     pPage->xParseCell(pPage, pCell, &info);
9951     if( pc+info.nSize>usableSize ){
9952       checkAppendMsg(pCheck, "Extends off end of page");
9953       doCoverageCheck = 0;
9954       continue;
9955     }
9956 
9957     /* Check for integer primary key out of range */
9958     if( pPage->intKey ){
9959       if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
9960         checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
9961       }
9962       maxKey = info.nKey;
9963       keyCanBeEqual = 0;     /* Only the first key on the page may ==maxKey */
9964     }
9965 
9966     /* Check the content overflow list */
9967     if( info.nPayload>info.nLocal ){
9968       u32 nPage;       /* Number of pages on the overflow chain */
9969       Pgno pgnoOvfl;   /* First page of the overflow chain */
9970       assert( pc + info.nSize - 4 <= usableSize );
9971       nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
9972       pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
9973 #ifndef SQLITE_OMIT_AUTOVACUUM
9974       if( pBt->autoVacuum ){
9975         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
9976       }
9977 #endif
9978       checkList(pCheck, 0, pgnoOvfl, nPage);
9979     }
9980 
9981     if( !pPage->leaf ){
9982       /* Check sanity of left child page for internal pages */
9983       pgno = get4byte(pCell);
9984 #ifndef SQLITE_OMIT_AUTOVACUUM
9985       if( pBt->autoVacuum ){
9986         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
9987       }
9988 #endif
9989       d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
9990       keyCanBeEqual = 0;
9991       if( d2!=depth ){
9992         checkAppendMsg(pCheck, "Child page depth differs");
9993         depth = d2;
9994       }
9995     }else{
9996       /* Populate the coverage-checking heap for leaf pages */
9997       btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
9998     }
9999   }
10000   *piMinKey = maxKey;
10001 
10002   /* Check for complete coverage of the page
10003   */
10004   pCheck->zPfx = 0;
10005   if( doCoverageCheck && pCheck->mxErr>0 ){
10006     /* For leaf pages, the min-heap has already been initialized and the
10007     ** cells have already been inserted.  But for internal pages, that has
10008     ** not yet been done, so do it now */
10009     if( !pPage->leaf ){
10010       heap = pCheck->heap;
10011       heap[0] = 0;
10012       for(i=nCell-1; i>=0; i--){
10013         u32 size;
10014         pc = get2byteAligned(&data[cellStart+i*2]);
10015         size = pPage->xCellSize(pPage, &data[pc]);
10016         btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
10017       }
10018     }
10019     /* Add the freeblocks to the min-heap
10020     **
10021     ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
10022     ** is the offset of the first freeblock, or zero if there are no
10023     ** freeblocks on the page.
10024     */
10025     i = get2byte(&data[hdr+1]);
10026     while( i>0 ){
10027       int size, j;
10028       assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
10029       size = get2byte(&data[i+2]);
10030       assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */
10031       btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
10032       /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
10033       ** big-endian integer which is the offset in the b-tree page of the next
10034       ** freeblock in the chain, or zero if the freeblock is the last on the
10035       ** chain. */
10036       j = get2byte(&data[i]);
10037       /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
10038       ** increasing offset. */
10039       assert( j==0 || j>i+size );     /* Enforced by btreeComputeFreeSpace() */
10040       assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
10041       i = j;
10042     }
10043     /* Analyze the min-heap looking for overlap between cells and/or
10044     ** freeblocks, and counting the number of untracked bytes in nFrag.
10045     **
10046     ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
10047     ** There is an implied first entry the covers the page header, the cell
10048     ** pointer index, and the gap between the cell pointer index and the start
10049     ** of cell content.
10050     **
10051     ** The loop below pulls entries from the min-heap in order and compares
10052     ** the start_address against the previous end_address.  If there is an
10053     ** overlap, that means bytes are used multiple times.  If there is a gap,
10054     ** that gap is added to the fragmentation count.
10055     */
10056     nFrag = 0;
10057     prev = contentOffset - 1;   /* Implied first min-heap entry */
10058     while( btreeHeapPull(heap,&x) ){
10059       if( (prev&0xffff)>=(x>>16) ){
10060         checkAppendMsg(pCheck,
10061           "Multiple uses for byte %u of page %d", x>>16, iPage);
10062         break;
10063       }else{
10064         nFrag += (x>>16) - (prev&0xffff) - 1;
10065         prev = x;
10066       }
10067     }
10068     nFrag += usableSize - (prev&0xffff) - 1;
10069     /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
10070     ** is stored in the fifth field of the b-tree page header.
10071     ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
10072     ** number of fragmented free bytes within the cell content area.
10073     */
10074     if( heap[0]==0 && nFrag!=data[hdr+7] ){
10075       checkAppendMsg(pCheck,
10076           "Fragmentation of %d bytes reported as %d on page %d",
10077           nFrag, data[hdr+7], iPage);
10078     }
10079   }
10080 
10081 end_of_check:
10082   if( !doCoverageCheck ) pPage->isInit = savedIsInit;
10083   releasePage(pPage);
10084   pCheck->zPfx = saved_zPfx;
10085   pCheck->v1 = saved_v1;
10086   pCheck->v2 = saved_v2;
10087   return depth+1;
10088 }
10089 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
10090 
10091 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
10092 /*
10093 ** This routine does a complete check of the given BTree file.  aRoot[] is
10094 ** an array of pages numbers were each page number is the root page of
10095 ** a table.  nRoot is the number of entries in aRoot.
10096 **
10097 ** A read-only or read-write transaction must be opened before calling
10098 ** this function.
10099 **
10100 ** Write the number of error seen in *pnErr.  Except for some memory
10101 ** allocation errors,  an error message held in memory obtained from
10102 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
10103 ** returned.  If a memory allocation error occurs, NULL is returned.
10104 */
10105 char *sqlite3BtreeIntegrityCheck(
10106   sqlite3 *db,  /* Database connection that is running the check */
10107   Btree *p,     /* The btree to be checked */
10108   int *aRoot,   /* An array of root pages numbers for individual trees */
10109   int nRoot,    /* Number of entries in aRoot[] */
10110   int mxErr,    /* Stop reporting errors after this many */
10111   int *pnErr    /* Write number of errors seen to this variable */
10112 ){
10113   Pgno i;
10114   IntegrityCk sCheck;
10115   BtShared *pBt = p->pBt;
10116   u64 savedDbFlags = pBt->db->flags;
10117   char zErr[100];
10118   VVA_ONLY( int nRef );
10119 
10120   sqlite3BtreeEnter(p);
10121   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
10122   VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
10123   assert( nRef>=0 );
10124   sCheck.db = db;
10125   sCheck.pBt = pBt;
10126   sCheck.pPager = pBt->pPager;
10127   sCheck.nPage = btreePagecount(sCheck.pBt);
10128   sCheck.mxErr = mxErr;
10129   sCheck.nErr = 0;
10130   sCheck.mallocFailed = 0;
10131   sCheck.zPfx = 0;
10132   sCheck.v1 = 0;
10133   sCheck.v2 = 0;
10134   sCheck.aPgRef = 0;
10135   sCheck.heap = 0;
10136   sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
10137   sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
10138   if( sCheck.nPage==0 ){
10139     goto integrity_ck_cleanup;
10140   }
10141 
10142   sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
10143   if( !sCheck.aPgRef ){
10144     sCheck.mallocFailed = 1;
10145     goto integrity_ck_cleanup;
10146   }
10147   sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
10148   if( sCheck.heap==0 ){
10149     sCheck.mallocFailed = 1;
10150     goto integrity_ck_cleanup;
10151   }
10152 
10153   i = PENDING_BYTE_PAGE(pBt);
10154   if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
10155 
10156   /* Check the integrity of the freelist
10157   */
10158   sCheck.zPfx = "Main freelist: ";
10159   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
10160             get4byte(&pBt->pPage1->aData[36]));
10161   sCheck.zPfx = 0;
10162 
10163   /* Check all the tables.
10164   */
10165 #ifndef SQLITE_OMIT_AUTOVACUUM
10166   if( pBt->autoVacuum ){
10167     int mx = 0;
10168     int mxInHdr;
10169     for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i];
10170     mxInHdr = get4byte(&pBt->pPage1->aData[52]);
10171     if( mx!=mxInHdr ){
10172       checkAppendMsg(&sCheck,
10173         "max rootpage (%d) disagrees with header (%d)",
10174         mx, mxInHdr
10175       );
10176     }
10177   }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){
10178     checkAppendMsg(&sCheck,
10179       "incremental_vacuum enabled with a max rootpage of zero"
10180     );
10181   }
10182 #endif
10183   testcase( pBt->db->flags & SQLITE_CellSizeCk );
10184   pBt->db->flags &= ~(u64)SQLITE_CellSizeCk;
10185   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
10186     i64 notUsed;
10187     if( aRoot[i]==0 ) continue;
10188 #ifndef SQLITE_OMIT_AUTOVACUUM
10189     if( pBt->autoVacuum && aRoot[i]>1 ){
10190       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
10191     }
10192 #endif
10193     checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
10194   }
10195   pBt->db->flags = savedDbFlags;
10196 
10197   /* Make sure every page in the file is referenced
10198   */
10199   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
10200 #ifdef SQLITE_OMIT_AUTOVACUUM
10201     if( getPageReferenced(&sCheck, i)==0 ){
10202       checkAppendMsg(&sCheck, "Page %d is never used", i);
10203     }
10204 #else
10205     /* If the database supports auto-vacuum, make sure no tables contain
10206     ** references to pointer-map pages.
10207     */
10208     if( getPageReferenced(&sCheck, i)==0 &&
10209        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
10210       checkAppendMsg(&sCheck, "Page %d is never used", i);
10211     }
10212     if( getPageReferenced(&sCheck, i)!=0 &&
10213        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
10214       checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
10215     }
10216 #endif
10217   }
10218 
10219   /* Clean  up and report errors.
10220   */
10221 integrity_ck_cleanup:
10222   sqlite3PageFree(sCheck.heap);
10223   sqlite3_free(sCheck.aPgRef);
10224   if( sCheck.mallocFailed ){
10225     sqlite3_str_reset(&sCheck.errMsg);
10226     sCheck.nErr++;
10227   }
10228   *pnErr = sCheck.nErr;
10229   if( sCheck.nErr==0 ) sqlite3_str_reset(&sCheck.errMsg);
10230   /* Make sure this analysis did not leave any unref() pages. */
10231   assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
10232   sqlite3BtreeLeave(p);
10233   return sqlite3StrAccumFinish(&sCheck.errMsg);
10234 }
10235 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
10236 
10237 /*
10238 ** Return the full pathname of the underlying database file.  Return
10239 ** an empty string if the database is in-memory or a TEMP database.
10240 **
10241 ** The pager filename is invariant as long as the pager is
10242 ** open so it is safe to access without the BtShared mutex.
10243 */
10244 const char *sqlite3BtreeGetFilename(Btree *p){
10245   assert( p->pBt->pPager!=0 );
10246   return sqlite3PagerFilename(p->pBt->pPager, 1);
10247 }
10248 
10249 /*
10250 ** Return the pathname of the journal file for this database. The return
10251 ** value of this routine is the same regardless of whether the journal file
10252 ** has been created or not.
10253 **
10254 ** The pager journal filename is invariant as long as the pager is
10255 ** open so it is safe to access without the BtShared mutex.
10256 */
10257 const char *sqlite3BtreeGetJournalname(Btree *p){
10258   assert( p->pBt->pPager!=0 );
10259   return sqlite3PagerJournalname(p->pBt->pPager);
10260 }
10261 
10262 /*
10263 ** Return non-zero if a transaction is active.
10264 */
10265 int sqlite3BtreeIsInTrans(Btree *p){
10266   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
10267   return (p && (p->inTrans==TRANS_WRITE));
10268 }
10269 
10270 #ifndef SQLITE_OMIT_WAL
10271 /*
10272 ** Run a checkpoint on the Btree passed as the first argument.
10273 **
10274 ** Return SQLITE_LOCKED if this or any other connection has an open
10275 ** transaction on the shared-cache the argument Btree is connected to.
10276 **
10277 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
10278 */
10279 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
10280   int rc = SQLITE_OK;
10281   if( p ){
10282     BtShared *pBt = p->pBt;
10283     sqlite3BtreeEnter(p);
10284     if( pBt->inTransaction!=TRANS_NONE ){
10285       rc = SQLITE_LOCKED;
10286     }else{
10287       rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
10288     }
10289     sqlite3BtreeLeave(p);
10290   }
10291   return rc;
10292 }
10293 #endif
10294 
10295 /*
10296 ** Return non-zero if a read (or write) transaction is active.
10297 */
10298 int sqlite3BtreeIsInReadTrans(Btree *p){
10299   assert( p );
10300   assert( sqlite3_mutex_held(p->db->mutex) );
10301   return p->inTrans!=TRANS_NONE;
10302 }
10303 
10304 int sqlite3BtreeIsInBackup(Btree *p){
10305   assert( p );
10306   assert( sqlite3_mutex_held(p->db->mutex) );
10307   return p->nBackup!=0;
10308 }
10309 
10310 /*
10311 ** This function returns a pointer to a blob of memory associated with
10312 ** a single shared-btree. The memory is used by client code for its own
10313 ** purposes (for example, to store a high-level schema associated with
10314 ** the shared-btree). The btree layer manages reference counting issues.
10315 **
10316 ** The first time this is called on a shared-btree, nBytes bytes of memory
10317 ** are allocated, zeroed, and returned to the caller. For each subsequent
10318 ** call the nBytes parameter is ignored and a pointer to the same blob
10319 ** of memory returned.
10320 **
10321 ** If the nBytes parameter is 0 and the blob of memory has not yet been
10322 ** allocated, a null pointer is returned. If the blob has already been
10323 ** allocated, it is returned as normal.
10324 **
10325 ** Just before the shared-btree is closed, the function passed as the
10326 ** xFree argument when the memory allocation was made is invoked on the
10327 ** blob of allocated memory. The xFree function should not call sqlite3_free()
10328 ** on the memory, the btree layer does that.
10329 */
10330 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
10331   BtShared *pBt = p->pBt;
10332   sqlite3BtreeEnter(p);
10333   if( !pBt->pSchema && nBytes ){
10334     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
10335     pBt->xFreeSchema = xFree;
10336   }
10337   sqlite3BtreeLeave(p);
10338   return pBt->pSchema;
10339 }
10340 
10341 /*
10342 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
10343 ** btree as the argument handle holds an exclusive lock on the
10344 ** sqlite_master table. Otherwise SQLITE_OK.
10345 */
10346 int sqlite3BtreeSchemaLocked(Btree *p){
10347   int rc;
10348   assert( sqlite3_mutex_held(p->db->mutex) );
10349   sqlite3BtreeEnter(p);
10350   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
10351   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
10352   sqlite3BtreeLeave(p);
10353   return rc;
10354 }
10355 
10356 
10357 #ifndef SQLITE_OMIT_SHARED_CACHE
10358 /*
10359 ** Obtain a lock on the table whose root page is iTab.  The
10360 ** lock is a write lock if isWritelock is true or a read lock
10361 ** if it is false.
10362 */
10363 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
10364   int rc = SQLITE_OK;
10365   assert( p->inTrans!=TRANS_NONE );
10366   if( p->sharable ){
10367     u8 lockType = READ_LOCK + isWriteLock;
10368     assert( READ_LOCK+1==WRITE_LOCK );
10369     assert( isWriteLock==0 || isWriteLock==1 );
10370 
10371     sqlite3BtreeEnter(p);
10372     rc = querySharedCacheTableLock(p, iTab, lockType);
10373     if( rc==SQLITE_OK ){
10374       rc = setSharedCacheTableLock(p, iTab, lockType);
10375     }
10376     sqlite3BtreeLeave(p);
10377   }
10378   return rc;
10379 }
10380 #endif
10381 
10382 #ifndef SQLITE_OMIT_INCRBLOB
10383 /*
10384 ** Argument pCsr must be a cursor opened for writing on an
10385 ** INTKEY table currently pointing at a valid table entry.
10386 ** This function modifies the data stored as part of that entry.
10387 **
10388 ** Only the data content may only be modified, it is not possible to
10389 ** change the length of the data stored. If this function is called with
10390 ** parameters that attempt to write past the end of the existing data,
10391 ** no modifications are made and SQLITE_CORRUPT is returned.
10392 */
10393 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
10394   int rc;
10395   assert( cursorOwnsBtShared(pCsr) );
10396   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
10397   assert( pCsr->curFlags & BTCF_Incrblob );
10398 
10399   rc = restoreCursorPosition(pCsr);
10400   if( rc!=SQLITE_OK ){
10401     return rc;
10402   }
10403   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
10404   if( pCsr->eState!=CURSOR_VALID ){
10405     return SQLITE_ABORT;
10406   }
10407 
10408   /* Save the positions of all other cursors open on this table. This is
10409   ** required in case any of them are holding references to an xFetch
10410   ** version of the b-tree page modified by the accessPayload call below.
10411   **
10412   ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
10413   ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
10414   ** saveAllCursors can only return SQLITE_OK.
10415   */
10416   VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
10417   assert( rc==SQLITE_OK );
10418 
10419   /* Check some assumptions:
10420   **   (a) the cursor is open for writing,
10421   **   (b) there is a read/write transaction open,
10422   **   (c) the connection holds a write-lock on the table (if required),
10423   **   (d) there are no conflicting read-locks, and
10424   **   (e) the cursor points at a valid row of an intKey table.
10425   */
10426   if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
10427     return SQLITE_READONLY;
10428   }
10429   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
10430               && pCsr->pBt->inTransaction==TRANS_WRITE );
10431   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
10432   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
10433   assert( pCsr->pPage->intKey );
10434 
10435   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
10436 }
10437 
10438 /*
10439 ** Mark this cursor as an incremental blob cursor.
10440 */
10441 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
10442   pCur->curFlags |= BTCF_Incrblob;
10443   pCur->pBtree->hasIncrblobCur = 1;
10444 }
10445 #endif
10446 
10447 /*
10448 ** Set both the "read version" (single byte at byte offset 18) and
10449 ** "write version" (single byte at byte offset 19) fields in the database
10450 ** header to iVersion.
10451 */
10452 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
10453   BtShared *pBt = pBtree->pBt;
10454   int rc;                         /* Return code */
10455 
10456   assert( iVersion==1 || iVersion==2 );
10457 
10458   /* If setting the version fields to 1, do not automatically open the
10459   ** WAL connection, even if the version fields are currently set to 2.
10460   */
10461   pBt->btsFlags &= ~BTS_NO_WAL;
10462   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
10463 
10464   rc = sqlite3BtreeBeginTrans(pBtree, 0, 0);
10465   if( rc==SQLITE_OK ){
10466     u8 *aData = pBt->pPage1->aData;
10467     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
10468       rc = sqlite3BtreeBeginTrans(pBtree, 2, 0);
10469       if( rc==SQLITE_OK ){
10470         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
10471         if( rc==SQLITE_OK ){
10472           aData[18] = (u8)iVersion;
10473           aData[19] = (u8)iVersion;
10474         }
10475       }
10476     }
10477   }
10478 
10479   pBt->btsFlags &= ~BTS_NO_WAL;
10480   return rc;
10481 }
10482 
10483 /*
10484 ** Return true if the cursor has a hint specified.  This routine is
10485 ** only used from within assert() statements
10486 */
10487 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
10488   return (pCsr->hints & mask)!=0;
10489 }
10490 
10491 /*
10492 ** Return true if the given Btree is read-only.
10493 */
10494 int sqlite3BtreeIsReadonly(Btree *p){
10495   return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
10496 }
10497 
10498 /*
10499 ** Return the size of the header added to each page by this module.
10500 */
10501 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
10502 
10503 #if !defined(SQLITE_OMIT_SHARED_CACHE)
10504 /*
10505 ** Return true if the Btree passed as the only argument is sharable.
10506 */
10507 int sqlite3BtreeSharable(Btree *p){
10508   return p->sharable;
10509 }
10510 
10511 /*
10512 ** Return the number of connections to the BtShared object accessed by
10513 ** the Btree handle passed as the only argument. For private caches
10514 ** this is always 1. For shared caches it may be 1 or greater.
10515 */
10516 int sqlite3BtreeConnectionCount(Btree *p){
10517   testcase( p->sharable );
10518   return p->pBt->nRef;
10519 }
10520 #endif
10521