xref: /sqlite-3.40.0/src/btree.c (revision dee0359d)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements an external (disk-based) database using BTrees.
13 ** See the header comment on "btreeInt.h" for additional information.
14 ** Including a description of file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
18 /*
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
21 */
static const char zMagicHeader[] = SQLITE_FILE_HEADER;  /* Array length is implied by the initializer */
23 
24 /*
25 ** Set this global variable to 1 to enable tracing using the TRACE
26 ** macro.
27 */
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
/* X is a complete, parenthesized printf() argument list, for example
** TRACE(("x=%d\n", x)).  The do{}while(0) wrapper turns the expansion
** into a single statement so that "TRACE(...);" can be used safely as
** the body of an unbraced if/else. */
# define TRACE(X)  do{ if(sqlite3BtreeTrace){ printf X; fflush(stdout); } }while(0)
#else
# define TRACE(X)  /* Tracing compiled out: expands to nothing */
#endif
34 
35 /*
36 ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
37 ** But if the value is zero, make it 65536.
38 **
39 ** This routine is used to extract the "offset to cell content area" value
40 ** from the header of a btree page.  If the page size is 65536 and the page
41 ** is empty, the offset should be 65536, but the 2-byte value stores zero.
42 ** This routine makes the necessary adjustment to 65536.
43 */
/* Adding 0xffff before masking is the same as subtracting 1 modulo 65536,
** so a stored value of 0 yields 65536 and 1..65535 are returned unchanged. */
#define get2byteNotZero(X)  (1+((((int)get2byte(X))+0xffff)&0xffff))
45 
46 /*
47 ** Values passed as the 5th argument to allocateBtreePage()
48 */
#define BTALLOC_ANY    0  /* No constraint: allocate any page */
#define BTALLOC_EXACT  1  /* Allocate the exact page requested, if possible */
#define BTALLOC_LE     2  /* Allocate any page whose number is <= the parameter */
52 
53 /*
54 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not
55 ** defined, or 0 if it is. For example:
56 **
57 **   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
58 */
#ifdef SQLITE_OMIT_AUTOVACUUM
# define IfNotOmitAV(expr) 0        /* Auto-vacuum omitted: expr is discarded */
#else
# define IfNotOmitAV(expr) (expr)   /* Auto-vacuum available: pass expr through */
#endif
64 
65 #ifndef SQLITE_OMIT_SHARED_CACHE
66 /*
67 ** A list of BtShared objects that are eligible for participation
68 ** in shared cache.  This variable has file scope during normal builds,
69 ** but the test harness needs to access it so we make it global for
70 ** test builds.
71 **
72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MAIN.
73 */
74 #ifdef SQLITE_TEST
75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
76 #else
77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
78 #endif
79 #endif /* SQLITE_OMIT_SHARED_CACHE */
80 
81 #ifndef SQLITE_OMIT_SHARED_CACHE
82 /*
83 ** Enable or disable the shared pager and schema features.
84 **
85 ** This routine has no effect on existing database connections.
** The shared cache setting affects only future calls to
87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
88 */
89 int sqlite3_enable_shared_cache(int enable){
90   sqlite3GlobalConfig.sharedCacheEnabled = enable;
91   return SQLITE_OK;
92 }
93 #endif
94 
95 
96 
#ifdef SQLITE_OMIT_SHARED_CACHE
/*
** The routines named below maintain the BtShared.pLock linked list of
** shared-cache table-level locks.  When the library is built without the
** shared-cache feature there is only ever one user of each BtShared
** structure, so no such locking is needed and the routines are replaced
** by macros that do nothing:
**
**   *  lock queries and lock requests always yield SQLITE_OK,
**   *  releasing and downgrading locks expand to nothing,
**   *  hasSharedCacheTableLock() is always true, and
**   *  hasReadConflicts() is always false.
*/
# define hasSharedCacheTableLock(a,b,c,d) 1
# define hasReadConflicts(a, b) 0
# define querySharedCacheTableLock(a,b,c) SQLITE_OK
# define setSharedCacheTableLock(a,b,c) SQLITE_OK
# define clearAllSharedCacheTableLocks(a)
# define downgradeAllSharedCacheTableLocks(a)
#endif
114 
115 #ifdef SQLITE_DEBUG
116 /*
117 ** Return and reset the seek counter for a Btree object.
118 */
119 sqlite3_uint64 sqlite3BtreeSeekCount(Btree *pBt){
120   u64 n =  pBt->nSeek;
121   pBt->nSeek = 0;
122   return n;
123 }
124 #endif
125 
126 /*
127 ** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single
128 ** (MemPage*) as an argument. The (MemPage*) must not be NULL.
129 **
130 ** If SQLITE_DEBUG is not defined, then this macro is equivalent to
131 ** SQLITE_CORRUPT_BKPT. Or, if SQLITE_DEBUG is set, then the log message
132 ** normally produced as a side-effect of SQLITE_CORRUPT_BKPT is augmented
133 ** with the page number and filename associated with the (MemPage*).
134 */
#ifdef SQLITE_DEBUG
/*
** Report corruption detected on page p.  A message naming the page number
** and the database filename is formatted and handed to
** sqlite3ReportError() together with the caller's line number.  The
** message is optional: if it cannot be allocated the report is skipped.
** The return value is always SQLITE_CORRUPT_BKPT.
**
** The page number is formatted with %u so that large page numbers are not
** shown as negative values (this assumes Pgno is an unsigned type).
*/
int corruptPageError(int lineno, MemPage *p){
  char *zMsg;
  /* The allocation is bracketed as benign because losing the message
  ** does not change the result of this function. */
  sqlite3BeginBenignMalloc();
  zMsg = sqlite3_mprintf("database corruption page %u of %s",
      (unsigned int)p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0)
  );
  sqlite3EndBenignMalloc();
  if( zMsg ){
    sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg);
  }
  sqlite3_free(zMsg);
  return SQLITE_CORRUPT_BKPT;
}
/* The macro argument is parenthesized so that any pointer-valued
** expression may be passed, not just a plain identifier. */
# define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, (pMemPage))
#else
# define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO((pMemPage)->pgno)
#endif
153 
154 #ifndef SQLITE_OMIT_SHARED_CACHE
155 
156 #ifdef SQLITE_DEBUG
157 /*
158 **** This function is only used as part of an assert() statement. ***
159 **
160 ** Check to see if pBtree holds the required locks to read or write to the
161 ** table with root page iRoot.   Return 1 if it does and 0 if not.
162 **
163 ** For example, when writing to a table with root-page iRoot via
164 ** Btree connection pBtree:
165 **
166 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
167 **
168 ** When writing to an index that resides in a sharable database, the
169 ** caller should have first obtained a lock specifying the root page of
170 ** the corresponding table. This makes things a bit more complicated,
171 ** as this module treats each table as a separate structure. To determine
172 ** the table corresponding to the index being written, this
173 ** function has to search through the database schema.
174 **
175 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
176 ** hold a write-lock on the schema table (root page 1). This is also
177 ** acceptable.
178 */
179 static int hasSharedCacheTableLock(
180   Btree *pBtree,         /* Handle that must hold lock */
181   Pgno iRoot,            /* Root page of b-tree */
182   int isIndex,           /* True if iRoot is the root of an index b-tree */
183   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
184 ){
185   Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
186   Pgno iTab = 0;
187   BtLock *pLock;
188 
189   /* If this database is not shareable, or if the client is reading
190   ** and has the read-uncommitted flag set, then no lock is required.
191   ** Return true immediately.
192   */
193   if( (pBtree->sharable==0)
194    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit))
195   ){
196     return 1;
197   }
198 
199   /* If the client is reading  or writing an index and the schema is
200   ** not loaded, then it is too difficult to actually check to see if
201   ** the correct locks are held.  So do not bother - just return true.
202   ** This case does not come up very often anyhow.
203   */
204   if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
205     return 1;
206   }
207 
208   /* Figure out the root-page that the lock should be held on. For table
209   ** b-trees, this is just the root page of the b-tree being read or
210   ** written. For index b-trees, it is the root page of the associated
211   ** table.  */
212   if( isIndex ){
213     HashElem *p;
214     int bSeen = 0;
215     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
216       Index *pIdx = (Index *)sqliteHashData(p);
217       if( pIdx->tnum==(int)iRoot ){
218         if( bSeen ){
219           /* Two or more indexes share the same root page.  There must
220           ** be imposter tables.  So just return true.  The assert is not
221           ** useful in that case. */
222           return 1;
223         }
224         iTab = pIdx->pTable->tnum;
225         bSeen = 1;
226       }
227     }
228   }else{
229     iTab = iRoot;
230   }
231 
232   /* Search for the required lock. Either a write-lock on root-page iTab, a
233   ** write-lock on the schema table, or (if the client is reading) a
234   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
235   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
236     if( pLock->pBtree==pBtree
237      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
238      && pLock->eLock>=eLockType
239     ){
240       return 1;
241     }
242   }
243 
244   /* Failed to find the required lock. */
245   return 0;
246 }
247 #endif /* SQLITE_DEBUG */
248 
249 #ifdef SQLITE_DEBUG
250 /*
251 **** This function may be used as part of assert() statements only. ****
252 **
253 ** Return true if it would be illegal for pBtree to write into the
254 ** table or index rooted at iRoot because other shared connections are
255 ** simultaneously reading that same table or index.
256 **
257 ** It is illegal for pBtree to write if some other Btree object that
258 ** shares the same BtShared object is currently reading or writing
259 ** the iRoot table.  Except, if the other Btree object has the
260 ** read-uncommitted flag set, then it is OK for the other object to
261 ** have a read cursor.
262 **
263 ** For example, before writing to any part of the table or index
264 ** rooted at page iRoot, one should call:
265 **
266 **    assert( !hasReadConflicts(pBtree, iRoot) );
267 */
268 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
269   BtCursor *p;
270   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
271     if( p->pgnoRoot==iRoot
272      && p->pBtree!=pBtree
273      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit)
274     ){
275       return 1;
276     }
277   }
278   return 0;
279 }
280 #endif    /* #ifdef SQLITE_DEBUG */
281 
282 /*
283 ** Query to see if Btree handle p may obtain a lock of type eLock
284 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
285 ** SQLITE_OK if the lock may be obtained (by calling
286 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
287 */
288 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
289   BtShared *pBt = p->pBt;
290   BtLock *pIter;
291 
292   assert( sqlite3BtreeHoldsMutex(p) );
293   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
294   assert( p->db!=0 );
295   assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 );
296 
297   /* If requesting a write-lock, then the Btree must have an open write
298   ** transaction on this file. And, obviously, for this to be so there
299   ** must be an open write transaction on the file itself.
300   */
301   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
302   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
303 
304   /* This routine is a no-op if the shared-cache is not enabled */
305   if( !p->sharable ){
306     return SQLITE_OK;
307   }
308 
309   /* If some other connection is holding an exclusive lock, the
310   ** requested lock may not be obtained.
311   */
312   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
313     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
314     return SQLITE_LOCKED_SHAREDCACHE;
315   }
316 
317   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
318     /* The condition (pIter->eLock!=eLock) in the following if(...)
319     ** statement is a simplification of:
320     **
321     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
322     **
323     ** since we know that if eLock==WRITE_LOCK, then no other connection
324     ** may hold a WRITE_LOCK on any table in this file (since there can
325     ** only be a single writer).
326     */
327     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
328     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
329     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
330       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
331       if( eLock==WRITE_LOCK ){
332         assert( p==pBt->pWriter );
333         pBt->btsFlags |= BTS_PENDING;
334       }
335       return SQLITE_LOCKED_SHAREDCACHE;
336     }
337   }
338   return SQLITE_OK;
339 }
340 #endif /* !SQLITE_OMIT_SHARED_CACHE */
341 
342 #ifndef SQLITE_OMIT_SHARED_CACHE
343 /*
344 ** Add a lock on the table with root-page iTable to the shared-btree used
345 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
346 ** WRITE_LOCK.
347 **
348 ** This function assumes the following:
349 **
**   (a) The specified Btree object p is connected to a sharable
**       database (one with the Btree.sharable flag set), and
352 **
353 **   (b) No other Btree objects hold a lock that conflicts
354 **       with the requested lock (i.e. querySharedCacheTableLock() has
355 **       already been called and returned SQLITE_OK).
356 **
357 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
358 ** is returned if a malloc attempt fails.
359 */
360 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
361   BtShared *pBt = p->pBt;
362   BtLock *pLock = 0;
363   BtLock *pIter;
364 
365   assert( sqlite3BtreeHoldsMutex(p) );
366   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
367   assert( p->db!=0 );
368 
369   /* A connection with the read-uncommitted flag set will never try to
370   ** obtain a read-lock using this function. The only read-lock obtained
371   ** by a connection in read-uncommitted mode is on the sqlite_schema
372   ** table, and that lock is obtained in BtreeBeginTrans().  */
373   assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK );
374 
375   /* This function should only be called on a sharable b-tree after it
376   ** has been determined that no other b-tree holds a conflicting lock.  */
377   assert( p->sharable );
378   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
379 
380   /* First search the list for an existing lock on this table. */
381   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
382     if( pIter->iTable==iTable && pIter->pBtree==p ){
383       pLock = pIter;
384       break;
385     }
386   }
387 
388   /* If the above search did not find a BtLock struct associating Btree p
389   ** with table iTable, allocate one and link it into the list.
390   */
391   if( !pLock ){
392     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
393     if( !pLock ){
394       return SQLITE_NOMEM_BKPT;
395     }
396     pLock->iTable = iTable;
397     pLock->pBtree = p;
398     pLock->pNext = pBt->pLock;
399     pBt->pLock = pLock;
400   }
401 
402   /* Set the BtLock.eLock variable to the maximum of the current lock
403   ** and the requested lock. This means if a write-lock was already held
404   ** and a read-lock requested, we don't incorrectly downgrade the lock.
405   */
406   assert( WRITE_LOCK>READ_LOCK );
407   if( eLock>pLock->eLock ){
408     pLock->eLock = eLock;
409   }
410 
411   return SQLITE_OK;
412 }
413 #endif /* !SQLITE_OMIT_SHARED_CACHE */
414 
415 #ifndef SQLITE_OMIT_SHARED_CACHE
416 /*
417 ** Release all the table locks (locks obtained via calls to
418 ** the setSharedCacheTableLock() procedure) held by Btree object p.
419 **
420 ** This function assumes that Btree p has an open read or write
421 ** transaction. If it does not, then the BTS_PENDING flag
422 ** may be incorrectly cleared.
423 */
424 static void clearAllSharedCacheTableLocks(Btree *p){
425   BtShared *pBt = p->pBt;
426   BtLock **ppIter = &pBt->pLock;
427 
428   assert( sqlite3BtreeHoldsMutex(p) );
429   assert( p->sharable || 0==*ppIter );
430   assert( p->inTrans>0 );
431 
432   while( *ppIter ){
433     BtLock *pLock = *ppIter;
434     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
435     assert( pLock->pBtree->inTrans>=pLock->eLock );
436     if( pLock->pBtree==p ){
437       *ppIter = pLock->pNext;
438       assert( pLock->iTable!=1 || pLock==&p->lock );
439       if( pLock->iTable!=1 ){
440         sqlite3_free(pLock);
441       }
442     }else{
443       ppIter = &pLock->pNext;
444     }
445   }
446 
447   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
448   if( pBt->pWriter==p ){
449     pBt->pWriter = 0;
450     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
451   }else if( pBt->nTransaction==2 ){
452     /* This function is called when Btree p is concluding its
453     ** transaction. If there currently exists a writer, and p is not
454     ** that writer, then the number of locks held by connections other
455     ** than the writer must be about to drop to zero. In this case
456     ** set the BTS_PENDING flag to 0.
457     **
458     ** If there is not currently a writer, then BTS_PENDING must
459     ** be zero already. So this next line is harmless in that case.
460     */
461     pBt->btsFlags &= ~BTS_PENDING;
462   }
463 }
464 
465 /*
466 ** This function changes all write-locks held by Btree p into read-locks.
467 */
468 static void downgradeAllSharedCacheTableLocks(Btree *p){
469   BtShared *pBt = p->pBt;
470   if( pBt->pWriter==p ){
471     BtLock *pLock;
472     pBt->pWriter = 0;
473     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
474     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
475       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
476       pLock->eLock = READ_LOCK;
477     }
478   }
479 }
480 
481 #endif /* SQLITE_OMIT_SHARED_CACHE */
482 
/* Forward declarations of the page-release routines.  For example,
** releasePageNotNull() is called by btreeReleaseAllCursorPages() before
** any definition of it has been seen. */
static void releasePage(MemPage *pPage);         /* Forward reference */
static void releasePageOne(MemPage *pPage);      /* Forward reference */
static void releasePageNotNull(MemPage *pPage);  /* Forward reference */
486 
487 /*
488 ***** This routine is used inside of assert() only ****
489 **
490 ** Verify that the cursor holds the mutex on its BtShared
491 */
492 #ifdef SQLITE_DEBUG
493 static int cursorHoldsMutex(BtCursor *p){
494   return sqlite3_mutex_held(p->pBt->mutex);
495 }
496 
/* Verify that the cursor and the BtShared agree about what is the current
** database connection. This is important in shared-cache mode. If the database
** connection pointers get out-of-sync, it is possible for routines like
** btreeInitPage() to reference a stale connection pointer that references
** a connection that has already closed.  This routine is used inside assert()
** statements only and for the purpose of double-checking that the btree code
** does keep the database connection pointers up-to-date.
*/
505 static int cursorOwnsBtShared(BtCursor *p){
506   assert( cursorHoldsMutex(p) );
507   return (p->pBtree->db==p->pBt->db);
508 }
509 #endif
510 
/*
** Invalidate the overflow page-list cache of the cursor passed as the
** only argument.
*/
/* Clears BTCF_ValidOvfl in pCur->curFlags.  The argument is parenthesized
** so that any pointer-valued expression (for example "p+1") can be
** passed, not just a plain identifier. */
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
516 
517 /*
518 ** Invalidate the overflow page-list cache for all cursors opened
519 ** on the shared btree structure pBt.
520 */
521 static void invalidateAllOverflowCache(BtShared *pBt){
522   BtCursor *p;
523   assert( sqlite3_mutex_held(pBt->mutex) );
524   for(p=pBt->pCursor; p; p=p->pNext){
525     invalidateOverflowCache(p);
526   }
527 }
528 
529 #ifndef SQLITE_OMIT_INCRBLOB
530 /*
531 ** This function is called before modifying the contents of a table
532 ** to invalidate any incrblob cursors that are open on the
533 ** row or one of the rows being modified.
534 **
535 ** If argument isClearTable is true, then the entire contents of the
536 ** table is about to be deleted. In this case invalidate all incrblob
537 ** cursors open on any row within the table with root-page pgnoRoot.
538 **
539 ** Otherwise, if argument isClearTable is false, then the row with
540 ** rowid iRow is being replaced or deleted. In this case invalidate
541 ** only those incrblob cursors open on that specific row.
542 */
543 static void invalidateIncrblobCursors(
544   Btree *pBtree,          /* The database file to check */
545   Pgno pgnoRoot,          /* The table that might be changing */
546   i64 iRow,               /* The rowid that might be changing */
547   int isClearTable        /* True if all rows are being deleted */
548 ){
549   BtCursor *p;
550   assert( pBtree->hasIncrblobCur );
551   assert( sqlite3BtreeHoldsMutex(pBtree) );
552   pBtree->hasIncrblobCur = 0;
553   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
554     if( (p->curFlags & BTCF_Incrblob)!=0 ){
555       pBtree->hasIncrblobCur = 1;
556       if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){
557         p->eState = CURSOR_INVALID;
558       }
559     }
560   }
561 }
562 
563 #else
564   /* Stub function when INCRBLOB is omitted */
  #define invalidateIncrblobCursors(w,x,y,z)  /* Expands to nothing; arguments are not evaluated */
566 #endif /* SQLITE_OMIT_INCRBLOB */
567 
568 /*
569 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
570 ** when a page that previously contained data becomes a free-list leaf
571 ** page.
572 **
573 ** The BtShared.pHasContent bitvec exists to work around an obscure
574 ** bug caused by the interaction of two useful IO optimizations surrounding
575 ** free-list leaf pages:
576 **
577 **   1) When all data is deleted from a page and the page becomes
578 **      a free-list leaf page, the page is not written to the database
579 **      (as free-list leaf pages contain no meaningful data). Sometimes
580 **      such a page is not even journalled (as it will not be modified,
581 **      why bother journalling it?).
582 **
583 **   2) When a free-list leaf page is reused, its content is not read
584 **      from the database or written to the journal file (why should it
585 **      be, if it is not at all meaningful?).
586 **
587 ** By themselves, these optimizations work fine and provide a handy
588 ** performance boost to bulk delete or insert operations. However, if
589 ** a page is moved to the free-list and then reused within the same
590 ** transaction, a problem comes up. If the page is not journalled when
591 ** it is moved to the free-list and it is also not journalled when it
592 ** is extracted from the free-list and reused, then the original data
593 ** may be lost. In the event of a rollback, it may not be possible
594 ** to restore the database to its original configuration.
595 **
596 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
597 ** moved to become a free-list leaf page, the corresponding bit is
598 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
599 ** optimization 2 above is omitted if the corresponding bit is already
600 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
601 ** at the end of every transaction.
602 */
603 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
604   int rc = SQLITE_OK;
605   if( !pBt->pHasContent ){
606     assert( pgno<=pBt->nPage );
607     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
608     if( !pBt->pHasContent ){
609       rc = SQLITE_NOMEM_BKPT;
610     }
611   }
612   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
613     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
614   }
615   return rc;
616 }
617 
618 /*
619 ** Query the BtShared.pHasContent vector.
620 **
621 ** This function is called when a free-list leaf page is removed from the
622 ** free-list for reuse. It returns false if it is safe to retrieve the
623 ** page from the pager layer with the 'no-content' flag set. True otherwise.
624 */
625 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
626   Bitvec *p = pBt->pHasContent;
627   return p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTestNotNull(p, pgno));
628 }
629 
630 /*
631 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
632 ** invoked at the conclusion of each write-transaction.
633 */
634 static void btreeClearHasContent(BtShared *pBt){
635   sqlite3BitvecDestroy(pBt->pHasContent);
636   pBt->pHasContent = 0;
637 }
638 
639 /*
640 ** Release all of the apPage[] pages for a cursor.
641 */
642 static void btreeReleaseAllCursorPages(BtCursor *pCur){
643   int i;
644   if( pCur->iPage>=0 ){
645     for(i=0; i<pCur->iPage; i++){
646       releasePageNotNull(pCur->apPage[i]);
647     }
648     releasePageNotNull(pCur->pPage);
649     pCur->iPage = -1;
650   }
651 }
652 
653 /*
654 ** The cursor passed as the only argument must point to a valid entry
655 ** when this function is called (i.e. have eState==CURSOR_VALID). This
656 ** function saves the current cursor key in variables pCur->nKey and
657 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error
658 ** code otherwise.
659 **
660 ** If the cursor is open on an intkey table, then the integer key
661 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
662 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is
663 ** set to point to a malloced buffer pCur->nKey bytes in size containing
664 ** the key.
665 */
666 static int saveCursorKey(BtCursor *pCur){
667   int rc = SQLITE_OK;
668   assert( CURSOR_VALID==pCur->eState );
669   assert( 0==pCur->pKey );
670   assert( cursorHoldsMutex(pCur) );
671 
672   if( pCur->curIntKey ){
673     /* Only the rowid is required for a table btree */
674     pCur->nKey = sqlite3BtreeIntegerKey(pCur);
675   }else{
676     /* For an index btree, save the complete key content. It is possible
677     ** that the current key is corrupt. In that case, it is possible that
678     ** the sqlite3VdbeRecordUnpack() function may overread the buffer by
679     ** up to the size of 1 varint plus 1 8-byte value when the cursor
680     ** position is restored. Hence the 17 bytes of padding allocated
681     ** below. */
682     void *pKey;
683     pCur->nKey = sqlite3BtreePayloadSize(pCur);
684     pKey = sqlite3Malloc( pCur->nKey + 9 + 8 );
685     if( pKey ){
686       rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
687       if( rc==SQLITE_OK ){
688         memset(((u8*)pKey)+pCur->nKey, 0, 9+8);
689         pCur->pKey = pKey;
690       }else{
691         sqlite3_free(pKey);
692       }
693     }else{
694       rc = SQLITE_NOMEM_BKPT;
695     }
696   }
697   assert( !pCur->curIntKey || !pCur->pKey );
698   return rc;
699 }
700 
701 /*
702 ** Save the current cursor position in the variables BtCursor.nKey
703 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
704 **
705 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
706 ** prior to calling this routine.
707 */
708 static int saveCursorPosition(BtCursor *pCur){
709   int rc;
710 
711   assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
712   assert( 0==pCur->pKey );
713   assert( cursorHoldsMutex(pCur) );
714 
715   if( pCur->curFlags & BTCF_Pinned ){
716     return SQLITE_CONSTRAINT_PINNED;
717   }
718   if( pCur->eState==CURSOR_SKIPNEXT ){
719     pCur->eState = CURSOR_VALID;
720   }else{
721     pCur->skipNext = 0;
722   }
723 
724   rc = saveCursorKey(pCur);
725   if( rc==SQLITE_OK ){
726     btreeReleaseAllCursorPages(pCur);
727     pCur->eState = CURSOR_REQUIRESEEK;
728   }
729 
730   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
731   return rc;
732 }
733 
734 /* Forward reference */
static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);  /* Called by saveAllCursors() ahead of its definition */
736 
737 /*
738 ** Save the positions of all cursors (except pExcept) that are open on
739 ** the table with root-page iRoot.  "Saving the cursor position" means that
740 ** the location in the btree is remembered in such a way that it can be
741 ** moved back to the same spot after the btree has been modified.  This
742 ** routine is called just before cursor pExcept is used to modify the
743 ** table, for example in BtreeDelete() or BtreeInsert().
744 **
745 ** If there are two or more cursors on the same btree, then all such
746 ** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
747 ** routine enforces that rule.  This routine only needs to be called in
** the uncommon case when pExcept has the BTCF_Multiple flag set.
**
** If pExcept!=NULL and if no other cursors are found on the same root-page,
** then the BTCF_Multiple flag on pExcept is cleared, to avoid another
** pointless call to this routine.
753 **
** Implementation note:  This routine merely checks to see if any cursors
** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
** event that cursors are in need of being saved.
757 */
758 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
759   BtCursor *p;
760   assert( sqlite3_mutex_held(pBt->mutex) );
761   assert( pExcept==0 || pExcept->pBt==pBt );
762   for(p=pBt->pCursor; p; p=p->pNext){
763     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
764   }
765   if( p ) return saveCursorsOnList(p, iRoot, pExcept);
766   if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
767   return SQLITE_OK;
768 }
769 
770 /* This helper routine to saveAllCursors does the actual work of saving
771 ** the cursors if and when a cursor is found that actually requires saving.
772 ** The common case is that no cursors need to be saved, so this routine is
773 ** broken out from its caller to avoid unnecessary stack pointer movement.
774 */
775 static int SQLITE_NOINLINE saveCursorsOnList(
776   BtCursor *p,         /* The first cursor that needs saving */
777   Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
778   BtCursor *pExcept    /* Do not save this cursor */
779 ){
780   do{
781     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
782       if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
783         int rc = saveCursorPosition(p);
784         if( SQLITE_OK!=rc ){
785           return rc;
786         }
787       }else{
788         testcase( p->iPage>=0 );
789         btreeReleaseAllCursorPages(p);
790       }
791     }
792     p = p->pNext;
793   }while( p );
794   return SQLITE_OK;
795 }
796 
797 /*
798 ** Clear the current cursor position.
799 */
800 void sqlite3BtreeClearCursor(BtCursor *pCur){
801   assert( cursorHoldsMutex(pCur) );
802   sqlite3_free(pCur->pKey);
803   pCur->pKey = 0;
804   pCur->eState = CURSOR_INVALID;
805 }
806 
807 /*
** In this version of BtreeMoveto, pKey is a packed index record
** such as is generated by the OP_MakeRecord opcode.  Unpack the
** record and then call sqlite3BtreeIndexMoveto() to do the work.
** If pKey is NULL, nKey is an integer key and the search is done by
** sqlite3BtreeTableMoveto() instead.
811 */
812 static int btreeMoveto(
813   BtCursor *pCur,     /* Cursor open on the btree to be searched */
814   const void *pKey,   /* Packed key if the btree is an index */
815   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
816   int bias,           /* Bias search to the high end */
817   int *pRes           /* Write search results here */
818 ){
819   int rc;                    /* Status code */
820   UnpackedRecord *pIdxKey;   /* Unpacked index key */
821 
822   if( pKey ){
823     KeyInfo *pKeyInfo = pCur->pKeyInfo;
824     assert( nKey==(i64)(int)nKey );
825     pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo);
826     if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
827     sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey);
828     if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){
829       rc = SQLITE_CORRUPT_BKPT;
830     }else{
831       rc = sqlite3BtreeIndexMoveto(pCur, pIdxKey, pRes);
832     }
833     sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
834   }else{
835     pIdxKey = 0;
836     rc = sqlite3BtreeTableMoveto(pCur, nKey, bias, pRes);
837   }
838   return rc;
839 }
840 
841 /*
842 ** Restore the cursor to the position it was in (or as close to as possible)
843 ** when saveCursorPosition() was called. Note that this call deletes the
844 ** saved position info stored by saveCursorPosition(), so there can be
845 ** at most one effective restoreCursorPosition() call after each
846 ** saveCursorPosition().
847 */
848 static int btreeRestoreCursorPosition(BtCursor *pCur){
849   int rc;
850   int skipNext = 0;
851   assert( cursorOwnsBtShared(pCur) );
852   assert( pCur->eState>=CURSOR_REQUIRESEEK );
853   if( pCur->eState==CURSOR_FAULT ){
854     return pCur->skipNext;
855   }
856   pCur->eState = CURSOR_INVALID;
857   if( sqlite3FaultSim(410) ){
858     rc = SQLITE_IOERR;
859   }else{
860     rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
861   }
862   if( rc==SQLITE_OK ){
863     sqlite3_free(pCur->pKey);
864     pCur->pKey = 0;
865     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
866     if( skipNext ) pCur->skipNext = skipNext;
867     if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
868       pCur->eState = CURSOR_SKIPNEXT;
869     }
870   }
871   return rc;
872 }
873 
/*
** Restore cursor p to its saved position, but only if its state is
** CURSOR_REQUIRESEEK or greater.  Evaluates to SQLITE_OK when no
** restore is needed, otherwise to the result of
** btreeRestoreCursorPosition().
**
** The argument is parenthesized so that any pointer expression may be
** passed.  It is evaluated twice, so it must not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
878 
879 /*
880 ** Determine whether or not a cursor has moved from the position where
881 ** it was last placed, or has been invalidated for any other reason.
882 ** Cursors can move when the row they are pointing at is deleted out
883 ** from under them, for example.  Cursor might also move if a btree
884 ** is rebalanced.
885 **
886 ** Calling this routine with a NULL cursor pointer returns false.
887 **
888 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
889 ** back to where it ought to be if this routine returns true.
890 */
891 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
892   assert( EIGHT_BYTE_ALIGNMENT(pCur)
893        || pCur==sqlite3BtreeFakeValidCursor() );
894   assert( offsetof(BtCursor, eState)==0 );
895   assert( sizeof(pCur->eState)==1 );
896   return CURSOR_VALID != *(u8*)pCur;
897 }
898 
899 /*
900 ** Return a pointer to a fake BtCursor object that will always answer
901 ** false to the sqlite3BtreeCursorHasMoved() routine above.  The fake
902 ** cursor returned must not be used with any other Btree interface.
903 */
904 BtCursor *sqlite3BtreeFakeValidCursor(void){
905   static u8 fakeCursor = CURSOR_VALID;
906   assert( offsetof(BtCursor, eState)==0 );
907   return (BtCursor*)&fakeCursor;
908 }
909 
910 /*
911 ** This routine restores a cursor back to its original position after it
912 ** has been moved by some outside activity (such as a btree rebalance or
913 ** a row having been deleted out from under the cursor).
914 **
915 ** On success, the *pDifferentRow parameter is false if the cursor is left
916 ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
917 ** was pointing to has been deleted, forcing the cursor to point to some
918 ** nearby row.
919 **
920 ** This routine should only be called for a cursor that just returned
921 ** TRUE from sqlite3BtreeCursorHasMoved().
922 */
923 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
924   int rc;
925 
926   assert( pCur!=0 );
927   assert( pCur->eState!=CURSOR_VALID );
928   rc = restoreCursorPosition(pCur);
929   if( rc ){
930     *pDifferentRow = 1;
931     return rc;
932   }
933   if( pCur->eState!=CURSOR_VALID ){
934     *pDifferentRow = 1;
935   }else{
936     *pDifferentRow = 0;
937   }
938   return SQLITE_OK;
939 }
940 
#ifdef SQLITE_ENABLE_CURSOR_HINTS
/*
** Provide hints to the cursor.  The particular hint given (and the type
** and number of the varargs parameters) is determined by the eHintType
** parameter.  See the definitions of the BTREE_HINT_* macros for details.
**
** This implementation ignores all of its arguments and does nothing.
** The routine is compiled only when SQLITE_ENABLE_CURSOR_HINTS is defined.
*/
void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
  /* Used only by systems that substitute their own storage engine */
}
#endif
951 
/*
** Provide flag hints to the cursor.
**
** The value x replaces whatever was previously stored in pCur->hints.
** The only values accepted are BTREE_SEEK_EQ, BTREE_BULKLOAD and 0;
** anything else trips the assert().
*/
void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
  assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
  pCur->hints = x;
}
959 
960 
961 #ifndef SQLITE_OMIT_AUTOVACUUM
962 /*
963 ** Given a page number of a regular database page, return the page
964 ** number for the pointer-map page that contains the entry for the
965 ** input page number.
966 **
967 ** Return 0 (not a valid page) for pgno==1 since there is
968 ** no pointer map associated with page 1.  The integrity_check logic
969 ** requires that ptrmapPageno(*,1)!=1.
970 */
971 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
972   int nPagesPerMapPage;
973   Pgno iPtrMap, ret;
974   assert( sqlite3_mutex_held(pBt->mutex) );
975   if( pgno<2 ) return 0;
976   nPagesPerMapPage = (pBt->usableSize/5)+1;
977   iPtrMap = (pgno-2)/nPagesPerMapPage;
978   ret = (iPtrMap*nPagesPerMapPage) + 2;
979   if( ret==PENDING_BYTE_PAGE(pBt) ){
980     ret++;
981   }
982   return ret;
983 }
984 
985 /*
986 ** Write an entry into the pointer map.
987 **
988 ** This routine updates the pointer map entry for page number 'key'
989 ** so that it maps to type 'eType' and parent page number 'pgno'.
990 **
991 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
992 ** a no-op.  If an error occurs, the appropriate error code is written
993 ** into *pRC.
994 */
995 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
996   DbPage *pDbPage;  /* The pointer map page */
997   u8 *pPtrmap;      /* The pointer map data */
998   Pgno iPtrmap;     /* The pointer map page number */
999   int offset;       /* Offset in pointer map page */
1000   int rc;           /* Return code from subfunctions */
1001 
1002   if( *pRC ) return;
1003 
1004   assert( sqlite3_mutex_held(pBt->mutex) );
1005   /* The super-journal page number must never be used as a pointer map page */
1006   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
1007 
1008   assert( pBt->autoVacuum );
1009   if( key==0 ){
1010     *pRC = SQLITE_CORRUPT_BKPT;
1011     return;
1012   }
1013   iPtrmap = PTRMAP_PAGENO(pBt, key);
1014   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
1015   if( rc!=SQLITE_OK ){
1016     *pRC = rc;
1017     return;
1018   }
1019   if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){
1020     /* The first byte of the extra data is the MemPage.isInit byte.
1021     ** If that byte is set, it means this page is also being used
1022     ** as a btree page. */
1023     *pRC = SQLITE_CORRUPT_BKPT;
1024     goto ptrmap_exit;
1025   }
1026   offset = PTRMAP_PTROFFSET(iPtrmap, key);
1027   if( offset<0 ){
1028     *pRC = SQLITE_CORRUPT_BKPT;
1029     goto ptrmap_exit;
1030   }
1031   assert( offset <= (int)pBt->usableSize-5 );
1032   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1033 
1034   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
1035     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
1036     *pRC= rc = sqlite3PagerWrite(pDbPage);
1037     if( rc==SQLITE_OK ){
1038       pPtrmap[offset] = eType;
1039       put4byte(&pPtrmap[offset+1], parent);
1040     }
1041   }
1042 
1043 ptrmap_exit:
1044   sqlite3PagerUnref(pDbPage);
1045 }
1046 
1047 /*
1048 ** Read an entry from the pointer map.
1049 **
1050 ** This routine retrieves the pointer map entry for page 'key', writing
1051 ** the type and parent page number to *pEType and *pPgno respectively.
1052 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
1053 */
1054 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
1055   DbPage *pDbPage;   /* The pointer map page */
1056   int iPtrmap;       /* Pointer map page index */
1057   u8 *pPtrmap;       /* Pointer map page data */
1058   int offset;        /* Offset of entry in pointer map */
1059   int rc;
1060 
1061   assert( sqlite3_mutex_held(pBt->mutex) );
1062 
1063   iPtrmap = PTRMAP_PAGENO(pBt, key);
1064   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
1065   if( rc!=0 ){
1066     return rc;
1067   }
1068   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1069 
1070   offset = PTRMAP_PTROFFSET(iPtrmap, key);
1071   if( offset<0 ){
1072     sqlite3PagerUnref(pDbPage);
1073     return SQLITE_CORRUPT_BKPT;
1074   }
1075   assert( offset <= (int)pBt->usableSize-5 );
1076   assert( pEType!=0 );
1077   *pEType = pPtrmap[offset];
1078   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
1079 
1080   sqlite3PagerUnref(pDbPage);
1081   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap);
1082   return SQLITE_OK;
1083 }
1084 
#else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* When SQLITE_OMIT_AUTOVACUUM is defined the pointer-map routines are
  ** replaced by macros: ptrmapPut() and ptrmapPutOvflPtr() expand to
  ** nothing and ptrmapGet() evaluates to SQLITE_OK.  None of the macro
  ** arguments are evaluated. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, z, rc)
#endif
1090 
/*
** Given a btree page and a cell index (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** findCell(P,I) reads the 2-byte entry at P->aCellIdx[2*I], masks it
** with P->maskPage, and adds the result to P->aData.
**
** findCellPastPtr() does the same except it skips past the initial
** 4-byte child pointer found on interior pages, if there is one.  It
** does this by using P->aDataOfst as the base address in place of
** P->aData.
**
** Both macros evaluate P more than once, so P must not have side effects.
**
** This routine works only for pages that do not contain overflow cells.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
#define findCellPastPtr(P,I) \
  ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
1105 
1106 
1107 /*
1108 ** This is common tail processing for btreeParseCellPtr() and
1109 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
1110 ** on a single B-tree page.  Make necessary adjustments to the CellInfo
1111 ** structure.
1112 */
1113 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
1114   MemPage *pPage,         /* Page containing the cell */
1115   u8 *pCell,              /* Pointer to the cell text. */
1116   CellInfo *pInfo         /* Fill in this structure */
1117 ){
1118   /* If the payload will not fit completely on the local page, we have
1119   ** to decide how much to store locally and how much to spill onto
1120   ** overflow pages.  The strategy is to minimize the amount of unused
1121   ** space on overflow pages while keeping the amount of local storage
1122   ** in between minLocal and maxLocal.
1123   **
1124   ** Warning:  changing the way overflow payload is distributed in any
1125   ** way will result in an incompatible file format.
1126   */
1127   int minLocal;  /* Minimum amount of payload held locally */
1128   int maxLocal;  /* Maximum amount of payload held locally */
1129   int surplus;   /* Overflow payload available for local storage */
1130 
1131   minLocal = pPage->minLocal;
1132   maxLocal = pPage->maxLocal;
1133   surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
1134   testcase( surplus==maxLocal );
1135   testcase( surplus==maxLocal+1 );
1136   if( surplus <= maxLocal ){
1137     pInfo->nLocal = (u16)surplus;
1138   }else{
1139     pInfo->nLocal = (u16)minLocal;
1140   }
1141   pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
1142 }
1143 
1144 /*
1145 ** Given a record with nPayload bytes of payload stored within btree
1146 ** page pPage, return the number of bytes of payload stored locally.
1147 */
1148 static int btreePayloadToLocal(MemPage *pPage, i64 nPayload){
1149   int maxLocal;  /* Maximum amount of payload held locally */
1150   maxLocal = pPage->maxLocal;
1151   if( nPayload<=maxLocal ){
1152     return nPayload;
1153   }else{
1154     int minLocal;  /* Minimum amount of payload held locally */
1155     int surplus;   /* Overflow payload available for local storage */
1156     minLocal = pPage->minLocal;
1157     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize-4);
1158     return ( surplus <= maxLocal ) ? surplus : minLocal;
1159   }
1160 }
1161 
1162 /*
1163 ** The following routines are implementations of the MemPage.xParseCell()
1164 ** method.
1165 **
1166 ** Parse a cell content block and fill in the CellInfo structure.
1167 **
1168 ** btreeParseCellPtr()        =>   table btree leaf nodes
1169 ** btreeParseCellNoPayload()  =>   table btree internal nodes
1170 ** btreeParseCellPtrIndex()   =>   index btree nodes
1171 **
1172 ** There is also a wrapper function btreeParseCell() that works for
1173 ** all MemPage types and that references the cell by index rather than
1174 ** by pointer.
1175 */
1176 static void btreeParseCellPtrNoPayload(
1177   MemPage *pPage,         /* Page containing the cell */
1178   u8 *pCell,              /* Pointer to the cell text. */
1179   CellInfo *pInfo         /* Fill in this structure */
1180 ){
1181   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1182   assert( pPage->leaf==0 );
1183   assert( pPage->childPtrSize==4 );
1184 #ifndef SQLITE_DEBUG
1185   UNUSED_PARAMETER(pPage);
1186 #endif
1187   pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
1188   pInfo->nPayload = 0;
1189   pInfo->nLocal = 0;
1190   pInfo->pPayload = 0;
1191   return;
1192 }
/*
** xParseCell implementation for pages where pPage->intKeyLeaf is set and
** cells have no child pointer (childPtrSize==0).  The cell begins with a
** varint payload size, followed by a varint integer key, followed by the
** payload.  On return pInfo->nKey, nPayload, pPayload, nLocal and nSize
** are all filled in.
*/
static void btreeParseCellPtr(
  MemPage *pPage,         /* Page containing the cell */
  u8 *pCell,              /* Pointer to the cell text. */
  CellInfo *pInfo         /* Fill in this structure */
){
  u8 *pIter;              /* For scanning through pCell */
  u32 nPayload;           /* Number of bytes of cell payload */
  u64 iKey;               /* Extracted Key value */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->leaf==0 || pPage->leaf==1 );
  assert( pPage->intKeyLeaf );
  assert( pPage->childPtrSize==0 );
  pIter = pCell;

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint32(pIter, nPayload);
  **
  ** The code is inlined to avoid a function call.
  **
  ** Each continuation byte contributes its low 7 bits.  The loop stops at
  ** the first byte whose high bit is clear, or after 8 bytes beyond the
  ** first have been consumed, whichever comes first.
  */
  nPayload = *pIter;
  if( nPayload>=0x80 ){
    u8 *pEnd = &pIter[8];
    nPayload &= 0x7f;
    do{
      nPayload = (nPayload<<7) | (*++pIter & 0x7f);
    }while( (*pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
  **
  ** The code is inlined to avoid a function call.
  **
  ** The first eight bytes each contribute 7 bits.  If all eight have the
  ** high bit set, a ninth byte follows and contributes all 8 of its bits.
  */
  iKey = *pIter;
  if( iKey>=0x80 ){
    u8 *pEnd = &pIter[7];
    iKey &= 0x7f;
    while(1){
      iKey = (iKey<<7) | (*++pIter & 0x7f);
      if( (*pIter)<0x80 ) break;
      if( pIter>=pEnd ){
        iKey = (iKey<<8) | *++pIter;
        break;
      }
    }
  }
  pIter++;

  /* Reinterpret the unsigned 64-bit value as a signed key */
  pInfo->nKey = *(i64*)&iKey;
  pInfo->nPayload = nPayload;
  pInfo->pPayload = pIter;
  testcase( nPayload==pPage->maxLocal );
  testcase( nPayload==pPage->maxLocal+1 );
  if( nPayload<=pPage->maxLocal ){
    /* This is the (easy) common case where the entire payload fits
    ** on the local page.  No overflow is required.  The reported cell
    ** size is never less than 4 bytes.
    */
    pInfo->nSize = nPayload + (u16)(pIter - pCell);
    if( pInfo->nSize<4 ) pInfo->nSize = 4;
    pInfo->nLocal = (u16)nPayload;
  }else{
    btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
  }
}
/*
** xParseCell implementation for pages where pPage->intKeyLeaf is zero.
** The cell begins with pPage->childPtrSize bytes of child pointer (which
** are skipped), followed by a varint payload size, followed by the
** payload.  There is no separate integer key, so pInfo->nKey is set to
** the payload size.
*/
static void btreeParseCellPtrIndex(
  MemPage *pPage,         /* Page containing the cell */
  u8 *pCell,              /* Pointer to the cell text. */
  CellInfo *pInfo         /* Fill in this structure */
){
  u8 *pIter;              /* For scanning through pCell */
  u32 nPayload;           /* Number of bytes of cell payload */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->leaf==0 || pPage->leaf==1 );
  assert( pPage->intKeyLeaf==0 );
  pIter = pCell + pPage->childPtrSize;
  /* Inlined varint decode of the payload size.  Each continuation byte
  ** contributes its low 7 bits and at most 8 bytes beyond the first are
  ** consumed. */
  nPayload = *pIter;
  if( nPayload>=0x80 ){
    u8 *pEnd = &pIter[8];
    nPayload &= 0x7f;
    do{
      nPayload = (nPayload<<7) | (*++pIter & 0x7f);
    }while( *(pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;
  pInfo->nKey = nPayload;
  pInfo->nPayload = nPayload;
  pInfo->pPayload = pIter;
  testcase( nPayload==pPage->maxLocal );
  testcase( nPayload==pPage->maxLocal+1 );
  if( nPayload<=pPage->maxLocal ){
    /* This is the (easy) common case where the entire payload fits
    ** on the local page.  No overflow is required.  The reported cell
    ** size is never less than 4 bytes.
    */
    pInfo->nSize = nPayload + (u16)(pIter - pCell);
    if( pInfo->nSize<4 ) pInfo->nSize = 4;
    pInfo->nLocal = (u16)nPayload;
  }else{
    btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
  }
}
/*
** Parse the iCell-th cell of page pPage into *pInfo.  The cell is located
** with findCell() and then handed to the page's own xParseCell method.
*/
static void btreeParseCell(
  MemPage *pPage,         /* Page containing the cell */
  int iCell,              /* The cell index.  First cell is 0 */
  CellInfo *pInfo         /* Fill in this structure */
){
  pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
}
1305 
/*
** The following routines are implementations of the MemPage.xCellSize
** method.
**
** Compute the total number of bytes that a Cell needs in the cell
** data area of the btree-page.  The return number includes the cell
** data header and the local payload, but not any overflow page or
** the space used by the cell pointer.
**
** cellSizePtrNoPayload()    =>   table internal nodes
** cellSizePtr()             =>   all index nodes & table leaf nodes
**
** cellSizePtr() skips pPage->childPtrSize bytes of child pointer, decodes
** the payload-size varint, skips the integer-key varint if pPage->intKey
** is set, and then adds the local payload size (plus 4 bytes for the
** overflow page number when the payload overflows).  The result is
** truncated to 16 bits on return.
*/
static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
  u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
  u8 *pEnd;                                /* End mark for a varint */
  u32 nSize;                               /* Size value to return */

#ifdef SQLITE_DEBUG
  /* The value returned by this function should always be the same as
  ** the (CellInfo.nSize) value found by doing a full parse of the
  ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
  ** this function verifies that this invariant is not violated. */
  CellInfo debuginfo;
  pPage->xParseCell(pPage, pCell, &debuginfo);
#endif

  /* Inlined varint decode of the payload size: 7 bits per continuation
  ** byte, at most 8 bytes beyond the first. */
  nSize = *pIter;
  if( nSize>=0x80 ){
    pEnd = &pIter[8];
    nSize &= 0x7f;
    do{
      nSize = (nSize<<7) | (*++pIter & 0x7f);
    }while( *(pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;
  if( pPage->intKey ){
    /* pIter now points at the 64-bit integer key value, a variable length
    ** integer. The following block moves pIter to point at the first byte
    ** past the end of the key value. */
    pEnd = &pIter[9];
    while( (*pIter++)&0x80 && pIter<pEnd );
  }
  testcase( nSize==pPage->maxLocal );
  testcase( nSize==pPage->maxLocal+1 );
  if( nSize<=pPage->maxLocal ){
    /* Entire payload is local.  Minimum reported cell size is 4 bytes. */
    nSize += (u32)(pIter - pCell);
    if( nSize<4 ) nSize = 4;
  }else{
    /* Payload overflows: same local/overflow split as
    ** btreeParseCellAdjustSizeForOverflow(), plus 4 bytes for the overflow
    ** page number and the size of the cell header. */
    int minLocal = pPage->minLocal;
    nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
    testcase( nSize==pPage->maxLocal );
    testcase( nSize==pPage->maxLocal+1 );
    if( nSize>pPage->maxLocal ){
      nSize = minLocal;
    }
    nSize += 4 + (u16)(pIter - pCell);
  }
  assert( nSize==debuginfo.nSize || CORRUPT_DB );
  return (u16)nSize;
}
/*
** xCellSize implementation for cells that consist of a 4-byte child
** pointer followed by a varint key and nothing else.  The size returned
** is 4 plus the number of bytes in the varint, where the varint scan
** stops after at most 9 bytes.
*/
static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
  u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
  u8 *pEnd;              /* End mark for a varint */

#ifdef SQLITE_DEBUG
  /* The value returned by this function should always be the same as
  ** the (CellInfo.nSize) value found by doing a full parse of the
  ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
  ** this function verifies that this invariant is not violated. */
  CellInfo debuginfo;
  pPage->xParseCell(pPage, pCell, &debuginfo);
#else
  UNUSED_PARAMETER(pPage);
#endif

  assert( pPage->childPtrSize==4 );
  pEnd = pIter + 9;
  /* Advance past every byte that has its high bit set, plus one more */
  while( (*pIter++)&0x80 && pIter<pEnd );
  assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
  return (u16)(pIter - pCell);
}
1387 
1388 
#ifdef SQLITE_DEBUG
/* This variation on cellSizePtr() is used inside of assert() statements
** only.  It takes a cell index rather than a cell pointer: the cell is
** located with findCell() and its size is computed by the page's own
** xCellSize method. */
static u16 cellSize(MemPage *pPage, int iCell){
  return pPage->xCellSize(pPage, findCell(pPage, iCell));
}
#endif
1396 
1397 #ifndef SQLITE_OMIT_AUTOVACUUM
1398 /*
1399 ** The cell pCell is currently part of page pSrc but will ultimately be part
1400 ** of pPage.  (pSrc and pPager are often the same.)  If pCell contains a
1401 ** pointer to an overflow page, insert an entry into the pointer-map for
1402 ** the overflow page that will be valid after pCell has been moved to pPage.
1403 */
1404 static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){
1405   CellInfo info;
1406   if( *pRC ) return;
1407   assert( pCell!=0 );
1408   pPage->xParseCell(pPage, pCell, &info);
1409   if( info.nLocal<info.nPayload ){
1410     Pgno ovfl;
1411     if( SQLITE_WITHIN(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){
1412       testcase( pSrc!=pPage );
1413       *pRC = SQLITE_CORRUPT_BKPT;
1414       return;
1415     }
1416     ovfl = get4byte(&pCell[info.nSize-4]);
1417     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1418   }
1419 }
1420 #endif
1421 
1422 
/*
** Defragment the page given. This routine reorganizes cells within the
** page so that there are no free-blocks on the free-block list.
**
** Parameter nMaxFrag is the maximum amount of fragmented space that may be
** present in the page after this routine returns.
**
** Two strategies are used.  If the page has at most two freeblocks and
** no more than nMaxFrag fragmented bytes, the cell content lying below
** the freeblocks is slid upward with memmove() and the affected cell
** pointers are adjusted.  Otherwise every cell is copied, in cell-pointer
** order, to the end of the usable area, and the fragment count is reset
** to zero.
**
** Returns SQLITE_OK on success, or SQLITE_CORRUPT if the freeblock list,
** the cell pointers, or the resulting free-space total are inconsistent
** with the page header and pPage->nFree.
**
** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
** b-tree page so that there are no freeblocks or fragment bytes, all
** unused bytes are contained in the unallocated space region, and all
** cells are packed tightly at the end of the page.
*/
static int defragmentPage(MemPage *pPage, int nMaxFrag){
  int i;                     /* Loop counter */
  int pc;                    /* Address of the i-th cell */
  int hdr;                   /* Offset to the page header */
  int size;                  /* Size of a cell */
  int usableSize;            /* Number of usable bytes on a page */
  int cellOffset;            /* Offset to the cell pointer array */
  int cbrk;                  /* Offset to the cell content area */
  int nCell;                 /* Number of cells on the page */
  unsigned char *data;       /* The page data */
  unsigned char *temp;       /* Temp area for cell content */
  unsigned char *src;        /* Source of content */
  int iCellFirst;            /* First allowable cell index */
  int iCellLast;             /* Last possible cell index */
  int iCellStart;            /* First cell offset in input */

  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  temp = 0;
  src = data = pPage->aData;
  hdr = pPage->hdrOffset;
  cellOffset = pPage->cellOffset;
  nCell = pPage->nCell;
  assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB );
  iCellFirst = cellOffset + 2*nCell;
  usableSize = pPage->pBt->usableSize;

  /* This block handles pages with two or fewer free blocks and nMaxFrag
  ** or fewer fragmented bytes. In this case it is faster to move the
  ** two (or one) blocks of cells using memmove() and add the required
  ** offsets to each pointer in the cell-pointer array than it is to
  ** reconstruct the entire page.  */
  if( (int)data[hdr+7]<=nMaxFrag ){
    int iFree = get2byte(&data[hdr+1]);   /* Offset of first freeblock */
    if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
    if( iFree ){
      int iFree2 = get2byte(&data[iFree]);  /* Offset of second freeblock */
      if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
      /* Proceed only if there is no third freeblock */
      if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){
        u8 *pEnd = &data[cellOffset + nCell*2];
        u8 *pAddr;
        int sz2 = 0;
        int sz = get2byte(&data[iFree+2]);
        int top = get2byte(&data[hdr+5]);
        if( top>=iFree ){
          return SQLITE_CORRUPT_PAGE(pPage);
        }
        if( iFree2 ){
          if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage);
          sz2 = get2byte(&data[iFree2+2]);
          if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage);
          /* Slide the content between the two freeblocks up by sz2 bytes,
          ** which merges the two freeblocks into one region of sz+sz2 */
          memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz));
          sz += sz2;
        }else if( iFree+sz>usableSize ){
          return SQLITE_CORRUPT_PAGE(pPage);
        }

        /* Slide the content below the first freeblock up by sz bytes */
        cbrk = top+sz;
        assert( cbrk+(iFree-top) <= usableSize );
        memmove(&data[cbrk], &data[top], iFree-top);
        /* Adjust each cell pointer by the distance its cell was moved */
        for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){
          pc = get2byte(pAddr);
          if( pc<iFree ){ put2byte(pAddr, pc+sz); }
          else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); }
        }
        goto defragment_out;
      }
    }
  }

  /* General case: repack every cell, working downward from the end of
  ** the usable area. */
  cbrk = usableSize;
  iCellLast = usableSize - 4;
  iCellStart = get2byte(&data[hdr+5]);
  for(i=0; i<nCell; i++){
    u8 *pAddr;     /* The i-th cell pointer */
    pAddr = &data[cellOffset + i*2];
    pc = get2byte(pAddr);
    testcase( pc==iCellFirst );
    testcase( pc==iCellLast );
    /* These conditions have already been verified in btreeInitPage()
    ** if PRAGMA cell_size_check=ON.
    */
    if( pc<iCellStart || pc>iCellLast ){
      return SQLITE_CORRUPT_PAGE(pPage);
    }
    assert( pc>=iCellStart && pc<=iCellLast );
    size = pPage->xCellSize(pPage, &src[pc]);
    cbrk -= size;
    if( cbrk<iCellStart || pc+size>usableSize ){
      return SQLITE_CORRUPT_PAGE(pPage);
    }
    assert( cbrk+size<=usableSize && cbrk>=iCellStart );
    testcase( cbrk+size==usableSize );
    testcase( pc+size==usableSize );
    put2byte(pAddr, cbrk);
    if( temp==0 ){
      /* Nothing has been moved yet.  A cell that is already in its final
      ** position needs no copy.  Otherwise snapshot the original cell
      ** content into the pager's temp space and read from there from now
      ** on, so that copies into data[] cannot clobber unread cells. */
      if( cbrk==pc ) continue;
      temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
      memcpy(&temp[iCellStart], &data[iCellStart], usableSize - iCellStart);
      src = temp;
    }
    memcpy(&data[cbrk], &src[pc], size);
  }
  data[hdr+7] = 0;

 defragment_out:
  assert( pPage->nFree>=0 );
  /* Fragment bytes plus the gap between the cell pointer array and the
  ** cell content must equal the free space recorded for the page. */
  if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  assert( cbrk>=iCellFirst );
  put2byte(&data[hdr+5], cbrk);
  /* The freeblock list is now empty */
  data[hdr+1] = 0;
  data[hdr+2] = 0;
  /* Zero the unallocated region between the pointer array and content */
  memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  return SQLITE_OK;
}
1556 
/*
** Search the free-list on page pPg for space to store a cell nByte bytes in
** size. If one can be found, return a pointer to the space and remove it
** from the free-list.
**
** The first freeblock that is at least nByte bytes in size is used.  If
** it exceeds nByte by 4 or more bytes it stays on the free-list with its
** size reduced, and the allocation is taken from its tail.  Otherwise
** the whole block is unlinked and the 0 to 3 excess bytes are added to
** the page's fragmented-byte count.
**
** If no suitable space can be found on the free-list, return NULL.
**
** This function may detect corruption within pPg.  If corruption is
** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
** *pRc is left unchanged in all other cases.
**
** If the first suitable slot is less than 4 bytes larger than nByte and
** the fragmentation count already exceeds 57, NULL is returned so that
** the fragmentation count cannot exceed 60.
*/
static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
  const int hdr = pPg->hdrOffset;            /* Offset to page header */
  u8 * const aData = pPg->aData;             /* Page data */
  int iAddr = hdr + 1;                       /* Address of ptr to pc */
  int pc = get2byte(&aData[iAddr]);          /* Address of a free slot */
  int x;                                     /* Excess size of the slot */
  int maxPC = pPg->pBt->usableSize - nByte;  /* Max address for a usable slot */
  int size;                                  /* Size of the free slot */

  assert( pc>0 );
  while( pc<=maxPC ){
    /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
    ** freeblock form a big-endian integer which is the size of the freeblock
    ** in bytes, including the 4-byte header. */
    size = get2byte(&aData[pc+2]);
    if( (x = size - nByte)>=0 ){
      testcase( x==4 );
      testcase( x==3 );
      if( x<4 ){
        /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
        ** number of bytes in fragments may not exceed 60. */
        if( aData[hdr+7]>57 ) return 0;

        /* Remove the slot from the free-list. Update the number of
        ** fragmented bytes within the page. */
        memcpy(&aData[iAddr], &aData[pc], 2);
        aData[hdr+7] += (u8)x;
      }else if( x+pc > maxPC ){
        /* This slot extends off the end of the usable part of the page */
        *pRc = SQLITE_CORRUPT_PAGE(pPg);
        return 0;
      }else{
        /* The slot remains on the free-list. Reduce its size to account
        ** for the portion used by the new allocation. */
        put2byte(&aData[pc+2], x);
      }
      /* The allocation is the last nByte bytes of the slot */
      return &aData[pc + x];
    }
    /* Slot too small: advance to the next freeblock in the chain */
    iAddr = pc;
    pc = get2byte(&aData[pc]);
    if( pc<=iAddr+size ){
      if( pc ){
        /* The next slot in the chain is not past the end of the current slot */
        *pRc = SQLITE_CORRUPT_PAGE(pPg);
      }
      /* pc==0 marks the end of the chain: no slot found */
      return 0;
    }
  }
  if( pc>maxPC+nByte-4 ){
    /* The free slot chain extends off the end of the page */
    *pRc = SQLITE_CORRUPT_PAGE(pPg);
  }
  return 0;
}
1625 
1626 /*
1627 ** Allocate nByte bytes of space from within the B-Tree page passed
1628 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1629 ** of the first byte of allocated space. Return either SQLITE_OK or
1630 ** an error code (usually SQLITE_CORRUPT).
1631 **
1632 ** The caller guarantees that there is sufficient space to make the
1633 ** allocation.  This routine might need to defragment in order to bring
1634 ** all the space together, however.  This routine will avoid using
1635 ** the first two bytes past the cell pointer area since presumably this
1636 ** allocation is being made in order to insert a new cell, so we will
1637 ** also end up needing a new cell pointer.
1638 */
1639 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1640   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
1641   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
1642   int top;                             /* First byte of cell content area */
1643   int rc = SQLITE_OK;                  /* Integer return code */
1644   int gap;        /* First byte of gap between cell pointers and cell content */
1645 
1646   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1647   assert( pPage->pBt );
1648   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1649   assert( nByte>=0 );  /* Minimum cell size is 4 */
1650   assert( pPage->nFree>=nByte );
1651   assert( pPage->nOverflow==0 );
1652   assert( nByte < (int)(pPage->pBt->usableSize-8) );
1653 
1654   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1655   gap = pPage->cellOffset + 2*pPage->nCell;
1656   assert( gap<=65536 );
1657   /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
1658   ** and the reserved space is zero (the usual value for reserved space)
1659   ** then the cell content offset of an empty page wants to be 65536.
1660   ** However, that integer is too large to be stored in a 2-byte unsigned
1661   ** integer, so a value of 0 is used in its place. */
1662   top = get2byte(&data[hdr+5]);
1663   assert( top<=(int)pPage->pBt->usableSize ); /* by btreeComputeFreeSpace() */
1664   if( gap>top ){
1665     if( top==0 && pPage->pBt->usableSize==65536 ){
1666       top = 65536;
1667     }else{
1668       return SQLITE_CORRUPT_PAGE(pPage);
1669     }
1670   }
1671 
1672   /* If there is enough space between gap and top for one more cell pointer,
1673   ** and if the freelist is not empty, then search the
1674   ** freelist looking for a slot big enough to satisfy the request.
1675   */
1676   testcase( gap+2==top );
1677   testcase( gap+1==top );
1678   testcase( gap==top );
1679   if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
1680     u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
1681     if( pSpace ){
1682       int g2;
1683       assert( pSpace+nByte<=data+pPage->pBt->usableSize );
1684       *pIdx = g2 = (int)(pSpace-data);
1685       if( g2<=gap ){
1686         return SQLITE_CORRUPT_PAGE(pPage);
1687       }else{
1688         return SQLITE_OK;
1689       }
1690     }else if( rc ){
1691       return rc;
1692     }
1693   }
1694 
1695   /* The request could not be fulfilled using a freelist slot.  Check
1696   ** to see if defragmentation is necessary.
1697   */
1698   testcase( gap+2+nByte==top );
1699   if( gap+2+nByte>top ){
1700     assert( pPage->nCell>0 || CORRUPT_DB );
1701     assert( pPage->nFree>=0 );
1702     rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
1703     if( rc ) return rc;
1704     top = get2byteNotZero(&data[hdr+5]);
1705     assert( gap+2+nByte<=top );
1706   }
1707 
1708 
1709   /* Allocate memory from the gap in between the cell pointer array
1710   ** and the cell content area.  The btreeComputeFreeSpace() call has already
1711   ** validated the freelist.  Given that the freelist is valid, there
1712   ** is no way that the allocation can extend off the end of the page.
1713   ** The assert() below verifies the previous sentence.
1714   */
1715   top -= nByte;
1716   put2byte(&data[hdr+5], top);
1717   assert( top+nByte <= (int)pPage->pBt->usableSize );
1718   *pIdx = top;
1719   return SQLITE_OK;
1720 }
1721 
1722 /*
1723 ** Return a section of the pPage->aData to the freelist.
1724 ** The first byte of the new free block is pPage->aData[iStart]
1725 ** and the size of the block is iSize bytes.
1726 **
1727 ** Adjacent freeblocks are coalesced.
1728 **
1729 ** Even though the freeblock list was checked by btreeComputeFreeSpace(),
1730 ** that routine will not detect overlap between cells or freeblocks.  Nor
1731 ** does it detect cells or freeblocks that encrouch into the reserved bytes
1732 ** at the end of the page.  So do additional corruption checks inside this
1733 ** routine and return SQLITE_CORRUPT if any problems are found.
1734 */
1735 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
1736   u16 iPtr;                             /* Address of ptr to next freeblock */
1737   u16 iFreeBlk;                         /* Address of the next freeblock */
1738   u8 hdr;                               /* Page header size.  0 or 100 */
1739   u8 nFrag = 0;                         /* Reduction in fragmentation */
1740   u16 iOrigSize = iSize;                /* Original value of iSize */
1741   u16 x;                                /* Offset to cell content area */
1742   u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
1743   unsigned char *data = pPage->aData;   /* Page content */
1744 
1745   assert( pPage->pBt!=0 );
1746   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1747   assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
1748   assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
1749   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1750   assert( iSize>=4 );   /* Minimum cell size is 4 */
1751   assert( iStart<=pPage->pBt->usableSize-4 );
1752 
1753   /* The list of freeblocks must be in ascending order.  Find the
1754   ** spot on the list where iStart should be inserted.
1755   */
1756   hdr = pPage->hdrOffset;
1757   iPtr = hdr + 1;
1758   if( data[iPtr+1]==0 && data[iPtr]==0 ){
1759     iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
1760   }else{
1761     while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
1762       if( iFreeBlk<iPtr+4 ){
1763         if( iFreeBlk==0 ) break; /* TH3: corrupt082.100 */
1764         return SQLITE_CORRUPT_PAGE(pPage);
1765       }
1766       iPtr = iFreeBlk;
1767     }
1768     if( iFreeBlk>pPage->pBt->usableSize-4 ){ /* TH3: corrupt081.100 */
1769       return SQLITE_CORRUPT_PAGE(pPage);
1770     }
1771     assert( iFreeBlk>iPtr || iFreeBlk==0 );
1772 
1773     /* At this point:
1774     **    iFreeBlk:   First freeblock after iStart, or zero if none
1775     **    iPtr:       The address of a pointer to iFreeBlk
1776     **
1777     ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
1778     */
1779     if( iFreeBlk && iEnd+3>=iFreeBlk ){
1780       nFrag = iFreeBlk - iEnd;
1781       if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage);
1782       iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
1783       if( iEnd > pPage->pBt->usableSize ){
1784         return SQLITE_CORRUPT_PAGE(pPage);
1785       }
1786       iSize = iEnd - iStart;
1787       iFreeBlk = get2byte(&data[iFreeBlk]);
1788     }
1789 
1790     /* If iPtr is another freeblock (that is, if iPtr is not the freelist
1791     ** pointer in the page header) then check to see if iStart should be
1792     ** coalesced onto the end of iPtr.
1793     */
1794     if( iPtr>hdr+1 ){
1795       int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
1796       if( iPtrEnd+3>=iStart ){
1797         if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage);
1798         nFrag += iStart - iPtrEnd;
1799         iSize = iEnd - iPtr;
1800         iStart = iPtr;
1801       }
1802     }
1803     if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage);
1804     data[hdr+7] -= nFrag;
1805   }
1806   x = get2byte(&data[hdr+5]);
1807   if( iStart<=x ){
1808     /* The new freeblock is at the beginning of the cell content area,
1809     ** so just extend the cell content area rather than create another
1810     ** freelist entry */
1811     if( iStart<x ) return SQLITE_CORRUPT_PAGE(pPage);
1812     if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage);
1813     put2byte(&data[hdr+1], iFreeBlk);
1814     put2byte(&data[hdr+5], iEnd);
1815   }else{
1816     /* Insert the new freeblock into the freelist */
1817     put2byte(&data[iPtr], iStart);
1818   }
1819   if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){
1820     /* Overwrite deleted information with zeros when the secure_delete
1821     ** option is enabled */
1822     memset(&data[iStart], 0, iSize);
1823   }
1824   put2byte(&data[iStart], iFreeBlk);
1825   put2byte(&data[iStart+2], iSize);
1826   pPage->nFree += iOrigSize;
1827   return SQLITE_OK;
1828 }
1829 
1830 /*
1831 ** Decode the flags byte (the first byte of the header) for a page
1832 ** and initialize fields of the MemPage structure accordingly.
1833 **
1834 ** Only the following combinations are supported.  Anything different
1835 ** indicates a corrupt database files:
1836 **
1837 **         PTF_ZERODATA
1838 **         PTF_ZERODATA | PTF_LEAF
1839 **         PTF_LEAFDATA | PTF_INTKEY
1840 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1841 */
1842 static int decodeFlags(MemPage *pPage, int flagByte){
1843   BtShared *pBt;     /* A copy of pPage->pBt */
1844 
1845   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1846   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1847   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1848   flagByte &= ~PTF_LEAF;
1849   pPage->childPtrSize = 4-4*pPage->leaf;
1850   pPage->xCellSize = cellSizePtr;
1851   pBt = pPage->pBt;
1852   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1853     /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an
1854     ** interior table b-tree page. */
1855     assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
1856     /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a
1857     ** leaf table b-tree page. */
1858     assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
1859     pPage->intKey = 1;
1860     if( pPage->leaf ){
1861       pPage->intKeyLeaf = 1;
1862       pPage->xParseCell = btreeParseCellPtr;
1863     }else{
1864       pPage->intKeyLeaf = 0;
1865       pPage->xCellSize = cellSizePtrNoPayload;
1866       pPage->xParseCell = btreeParseCellPtrNoPayload;
1867     }
1868     pPage->maxLocal = pBt->maxLeaf;
1869     pPage->minLocal = pBt->minLeaf;
1870   }else if( flagByte==PTF_ZERODATA ){
1871     /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an
1872     ** interior index b-tree page. */
1873     assert( (PTF_ZERODATA)==2 );
1874     /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a
1875     ** leaf index b-tree page. */
1876     assert( (PTF_ZERODATA|PTF_LEAF)==10 );
1877     pPage->intKey = 0;
1878     pPage->intKeyLeaf = 0;
1879     pPage->xParseCell = btreeParseCellPtrIndex;
1880     pPage->maxLocal = pBt->maxLocal;
1881     pPage->minLocal = pBt->minLocal;
1882   }else{
1883     /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
1884     ** an error. */
1885     return SQLITE_CORRUPT_PAGE(pPage);
1886   }
1887   pPage->max1bytePayload = pBt->max1bytePayload;
1888   return SQLITE_OK;
1889 }
1890 
1891 /*
1892 ** Compute the amount of freespace on the page.  In other words, fill
1893 ** in the pPage->nFree field.
1894 */
1895 static int btreeComputeFreeSpace(MemPage *pPage){
1896   int pc;            /* Address of a freeblock within pPage->aData[] */
1897   u8 hdr;            /* Offset to beginning of page header */
1898   u8 *data;          /* Equal to pPage->aData */
1899   int usableSize;    /* Amount of usable space on each page */
1900   int nFree;         /* Number of unused bytes on the page */
1901   int top;           /* First byte of the cell content area */
1902   int iCellFirst;    /* First allowable cell or freeblock offset */
1903   int iCellLast;     /* Last possible cell or freeblock offset */
1904 
1905   assert( pPage->pBt!=0 );
1906   assert( pPage->pBt->db!=0 );
1907   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1908   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1909   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1910   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1911   assert( pPage->isInit==1 );
1912   assert( pPage->nFree<0 );
1913 
1914   usableSize = pPage->pBt->usableSize;
1915   hdr = pPage->hdrOffset;
1916   data = pPage->aData;
1917   /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
1918   ** the start of the cell content area. A zero value for this integer is
1919   ** interpreted as 65536. */
1920   top = get2byteNotZero(&data[hdr+5]);
1921   iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell;
1922   iCellLast = usableSize - 4;
1923 
1924   /* Compute the total free space on the page
1925   ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
1926   ** start of the first freeblock on the page, or is zero if there are no
1927   ** freeblocks. */
1928   pc = get2byte(&data[hdr+1]);
1929   nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
1930   if( pc>0 ){
1931     u32 next, size;
1932     if( pc<top ){
1933       /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
1934       ** always be at least one cell before the first freeblock.
1935       */
1936       return SQLITE_CORRUPT_PAGE(pPage);
1937     }
1938     while( 1 ){
1939       if( pc>iCellLast ){
1940         /* Freeblock off the end of the page */
1941         return SQLITE_CORRUPT_PAGE(pPage);
1942       }
1943       next = get2byte(&data[pc]);
1944       size = get2byte(&data[pc+2]);
1945       nFree = nFree + size;
1946       if( next<=pc+size+3 ) break;
1947       pc = next;
1948     }
1949     if( next>0 ){
1950       /* Freeblock not in ascending order */
1951       return SQLITE_CORRUPT_PAGE(pPage);
1952     }
1953     if( pc+size>(unsigned int)usableSize ){
1954       /* Last freeblock extends past page end */
1955       return SQLITE_CORRUPT_PAGE(pPage);
1956     }
1957   }
1958 
1959   /* At this point, nFree contains the sum of the offset to the start
1960   ** of the cell-content area plus the number of free bytes within
1961   ** the cell-content area. If this is greater than the usable-size
1962   ** of the page, then the page must be corrupted. This check also
1963   ** serves to verify that the offset to the start of the cell-content
1964   ** area, according to the page header, lies within the page.
1965   */
1966   if( nFree>usableSize || nFree<iCellFirst ){
1967     return SQLITE_CORRUPT_PAGE(pPage);
1968   }
1969   pPage->nFree = (u16)(nFree - iCellFirst);
1970   return SQLITE_OK;
1971 }
1972 
1973 /*
1974 ** Do additional sanity check after btreeInitPage() if
1975 ** PRAGMA cell_size_check=ON
1976 */
1977 static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){
1978   int iCellFirst;    /* First allowable cell or freeblock offset */
1979   int iCellLast;     /* Last possible cell or freeblock offset */
1980   int i;             /* Index into the cell pointer array */
1981   int sz;            /* Size of a cell */
1982   int pc;            /* Address of a freeblock within pPage->aData[] */
1983   u8 *data;          /* Equal to pPage->aData */
1984   int usableSize;    /* Maximum usable space on the page */
1985   int cellOffset;    /* Start of cell content area */
1986 
1987   iCellFirst = pPage->cellOffset + 2*pPage->nCell;
1988   usableSize = pPage->pBt->usableSize;
1989   iCellLast = usableSize - 4;
1990   data = pPage->aData;
1991   cellOffset = pPage->cellOffset;
1992   if( !pPage->leaf ) iCellLast--;
1993   for(i=0; i<pPage->nCell; i++){
1994     pc = get2byteAligned(&data[cellOffset+i*2]);
1995     testcase( pc==iCellFirst );
1996     testcase( pc==iCellLast );
1997     if( pc<iCellFirst || pc>iCellLast ){
1998       return SQLITE_CORRUPT_PAGE(pPage);
1999     }
2000     sz = pPage->xCellSize(pPage, &data[pc]);
2001     testcase( pc+sz==usableSize );
2002     if( pc+sz>usableSize ){
2003       return SQLITE_CORRUPT_PAGE(pPage);
2004     }
2005   }
2006   return SQLITE_OK;
2007 }
2008 
2009 /*
2010 ** Initialize the auxiliary information for a disk block.
2011 **
2012 ** Return SQLITE_OK on success.  If we see that the page does
2013 ** not contain a well-formed database page, then return
2014 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
2015 ** guarantee that the page is well-formed.  It only shows that
2016 ** we failed to detect any corruption.
2017 */
2018 static int btreeInitPage(MemPage *pPage){
2019   u8 *data;          /* Equal to pPage->aData */
2020   BtShared *pBt;        /* The main btree structure */
2021 
2022   assert( pPage->pBt!=0 );
2023   assert( pPage->pBt->db!=0 );
2024   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2025   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
2026   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
2027   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
2028   assert( pPage->isInit==0 );
2029 
2030   pBt = pPage->pBt;
2031   data = pPage->aData + pPage->hdrOffset;
2032   /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
2033   ** the b-tree page type. */
2034   if( decodeFlags(pPage, data[0]) ){
2035     return SQLITE_CORRUPT_PAGE(pPage);
2036   }
2037   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
2038   pPage->maskPage = (u16)(pBt->pageSize - 1);
2039   pPage->nOverflow = 0;
2040   pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize;
2041   pPage->aCellIdx = data + pPage->childPtrSize + 8;
2042   pPage->aDataEnd = pPage->aData + pBt->usableSize;
2043   pPage->aDataOfst = pPage->aData + pPage->childPtrSize;
2044   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
2045   ** number of cells on the page. */
2046   pPage->nCell = get2byte(&data[3]);
2047   if( pPage->nCell>MX_CELL(pBt) ){
2048     /* To many cells for a single page.  The page must be corrupt */
2049     return SQLITE_CORRUPT_PAGE(pPage);
2050   }
2051   testcase( pPage->nCell==MX_CELL(pBt) );
2052   /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
2053   ** possible for a root page of a table that contains no rows) then the
2054   ** offset to the cell content area will equal the page size minus the
2055   ** bytes of reserved space. */
2056   assert( pPage->nCell>0
2057        || get2byteNotZero(&data[5])==(int)pBt->usableSize
2058        || CORRUPT_DB );
2059   pPage->nFree = -1;  /* Indicate that this value is yet uncomputed */
2060   pPage->isInit = 1;
2061   if( pBt->db->flags & SQLITE_CellSizeCk ){
2062     return btreeCellSizeCheck(pPage);
2063   }
2064   return SQLITE_OK;
2065 }
2066 
2067 /*
2068 ** Set up a raw page so that it looks like a database page holding
2069 ** no entries.
2070 */
2071 static void zeroPage(MemPage *pPage, int flags){
2072   unsigned char *data = pPage->aData;
2073   BtShared *pBt = pPage->pBt;
2074   u8 hdr = pPage->hdrOffset;
2075   u16 first;
2076 
2077   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
2078   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2079   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
2080   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2081   assert( sqlite3_mutex_held(pBt->mutex) );
2082   if( pBt->btsFlags & BTS_FAST_SECURE ){
2083     memset(&data[hdr], 0, pBt->usableSize - hdr);
2084   }
2085   data[hdr] = (char)flags;
2086   first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
2087   memset(&data[hdr+1], 0, 4);
2088   data[hdr+7] = 0;
2089   put2byte(&data[hdr+5], pBt->usableSize);
2090   pPage->nFree = (u16)(pBt->usableSize - first);
2091   decodeFlags(pPage, flags);
2092   pPage->cellOffset = first;
2093   pPage->aDataEnd = &data[pBt->usableSize];
2094   pPage->aCellIdx = &data[first];
2095   pPage->aDataOfst = &data[pPage->childPtrSize];
2096   pPage->nOverflow = 0;
2097   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
2098   pPage->maskPage = (u16)(pBt->pageSize - 1);
2099   pPage->nCell = 0;
2100   pPage->isInit = 1;
2101 }
2102 
2103 
2104 /*
2105 ** Convert a DbPage obtained from the pager into a MemPage used by
2106 ** the btree layer.
2107 */
2108 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
2109   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2110   if( pgno!=pPage->pgno ){
2111     pPage->aData = sqlite3PagerGetData(pDbPage);
2112     pPage->pDbPage = pDbPage;
2113     pPage->pBt = pBt;
2114     pPage->pgno = pgno;
2115     pPage->hdrOffset = pgno==1 ? 100 : 0;
2116   }
2117   assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
2118   return pPage;
2119 }
2120 
2121 /*
2122 ** Get a page from the pager.  Initialize the MemPage.pBt and
2123 ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
2124 **
2125 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
2126 ** about the content of the page at this time.  So do not go to the disk
2127 ** to fetch the content.  Just fill in the content with zeros for now.
2128 ** If in the future we call sqlite3PagerWrite() on this page, that
2129 ** means we have started to be concerned about content and the disk
2130 ** read should occur at that point.
2131 */
2132 static int btreeGetPage(
2133   BtShared *pBt,       /* The btree */
2134   Pgno pgno,           /* Number of the page to fetch */
2135   MemPage **ppPage,    /* Return the page in this parameter */
2136   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2137 ){
2138   int rc;
2139   DbPage *pDbPage;
2140 
2141   assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
2142   assert( sqlite3_mutex_held(pBt->mutex) );
2143   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
2144   if( rc ) return rc;
2145   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
2146   return SQLITE_OK;
2147 }
2148 
2149 /*
2150 ** Retrieve a page from the pager cache. If the requested page is not
2151 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
2152 ** MemPage.aData elements if needed.
2153 */
2154 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
2155   DbPage *pDbPage;
2156   assert( sqlite3_mutex_held(pBt->mutex) );
2157   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
2158   if( pDbPage ){
2159     return btreePageFromDbPage(pDbPage, pgno, pBt);
2160   }
2161   return 0;
2162 }
2163 
2164 /*
2165 ** Return the size of the database file in pages. If there is any kind of
2166 ** error, return ((unsigned int)-1).
2167 */
2168 static Pgno btreePagecount(BtShared *pBt){
2169   return pBt->nPage;
2170 }
2171 Pgno sqlite3BtreeLastPage(Btree *p){
2172   assert( sqlite3BtreeHoldsMutex(p) );
2173   return btreePagecount(p->pBt);
2174 }
2175 
2176 /*
2177 ** Get a page from the pager and initialize it.
2178 **
2179 ** If pCur!=0 then the page is being fetched as part of a moveToChild()
2180 ** call.  Do additional sanity checking on the page in this case.
2181 ** And if the fetch fails, this routine must decrement pCur->iPage.
2182 **
2183 ** The page is fetched as read-write unless pCur is not NULL and is
2184 ** a read-only cursor.
2185 **
2186 ** If an error occurs, then *ppPage is undefined. It
2187 ** may remain unchanged, or it may be set to an invalid value.
2188 */
2189 static int getAndInitPage(
2190   BtShared *pBt,                  /* The database file */
2191   Pgno pgno,                      /* Number of the page to get */
2192   MemPage **ppPage,               /* Write the page pointer here */
2193   BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
2194   int bReadOnly                   /* True for a read-only page */
2195 ){
2196   int rc;
2197   DbPage *pDbPage;
2198   assert( sqlite3_mutex_held(pBt->mutex) );
2199   assert( pCur==0 || ppPage==&pCur->pPage );
2200   assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
2201   assert( pCur==0 || pCur->iPage>0 );
2202 
2203   if( pgno>btreePagecount(pBt) ){
2204     rc = SQLITE_CORRUPT_BKPT;
2205     goto getAndInitPage_error1;
2206   }
2207   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
2208   if( rc ){
2209     goto getAndInitPage_error1;
2210   }
2211   *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2212   if( (*ppPage)->isInit==0 ){
2213     btreePageFromDbPage(pDbPage, pgno, pBt);
2214     rc = btreeInitPage(*ppPage);
2215     if( rc!=SQLITE_OK ){
2216       goto getAndInitPage_error2;
2217     }
2218   }
2219   assert( (*ppPage)->pgno==pgno );
2220   assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
2221 
2222   /* If obtaining a child page for a cursor, we must verify that the page is
2223   ** compatible with the root page. */
2224   if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
2225     rc = SQLITE_CORRUPT_PGNO(pgno);
2226     goto getAndInitPage_error2;
2227   }
2228   return SQLITE_OK;
2229 
2230 getAndInitPage_error2:
2231   releasePage(*ppPage);
2232 getAndInitPage_error1:
2233   if( pCur ){
2234     pCur->iPage--;
2235     pCur->pPage = pCur->apPage[pCur->iPage];
2236   }
2237   testcase( pgno==0 );
2238   assert( pgno!=0 || rc==SQLITE_CORRUPT );
2239   return rc;
2240 }
2241 
2242 /*
2243 ** Release a MemPage.  This should be called once for each prior
2244 ** call to btreeGetPage.
2245 **
2246 ** Page1 is a special case and must be released using releasePageOne().
2247 */
2248 static void releasePageNotNull(MemPage *pPage){
2249   assert( pPage->aData );
2250   assert( pPage->pBt );
2251   assert( pPage->pDbPage!=0 );
2252   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2253   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2254   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2255   sqlite3PagerUnrefNotNull(pPage->pDbPage);
2256 }
2257 static void releasePage(MemPage *pPage){
2258   if( pPage ) releasePageNotNull(pPage);
2259 }
2260 static void releasePageOne(MemPage *pPage){
2261   assert( pPage!=0 );
2262   assert( pPage->aData );
2263   assert( pPage->pBt );
2264   assert( pPage->pDbPage!=0 );
2265   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2266   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2267   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2268   sqlite3PagerUnrefPageOne(pPage->pDbPage);
2269 }
2270 
2271 /*
2272 ** Get an unused page.
2273 **
2274 ** This works just like btreeGetPage() with the addition:
2275 **
2276 **   *  If the page is already in use for some other purpose, immediately
2277 **      release it and return an SQLITE_CURRUPT error.
2278 **   *  Make sure the isInit flag is clear
2279 */
2280 static int btreeGetUnusedPage(
2281   BtShared *pBt,       /* The btree */
2282   Pgno pgno,           /* Number of the page to fetch */
2283   MemPage **ppPage,    /* Return the page in this parameter */
2284   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2285 ){
2286   int rc = btreeGetPage(pBt, pgno, ppPage, flags);
2287   if( rc==SQLITE_OK ){
2288     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
2289       releasePage(*ppPage);
2290       *ppPage = 0;
2291       return SQLITE_CORRUPT_BKPT;
2292     }
2293     (*ppPage)->isInit = 0;
2294   }else{
2295     *ppPage = 0;
2296   }
2297   return rc;
2298 }
2299 
2300 
2301 /*
2302 ** During a rollback, when the pager reloads information into the cache
2303 ** so that the cache is restored to its original state at the start of
2304 ** the transaction, for each page restored this routine is called.
2305 **
2306 ** This routine needs to reset the extra data section at the end of the
2307 ** page to agree with the restored data.
2308 */
2309 static void pageReinit(DbPage *pData){
2310   MemPage *pPage;
2311   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
2312   assert( sqlite3PagerPageRefcount(pData)>0 );
2313   if( pPage->isInit ){
2314     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2315     pPage->isInit = 0;
2316     if( sqlite3PagerPageRefcount(pData)>1 ){
2317       /* pPage might not be a btree page;  it might be an overflow page
2318       ** or ptrmap page or a free page.  In those cases, the following
2319       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
2320       ** But no harm is done by this.  And it is very important that
2321       ** btreeInitPage() be called on every btree page so we make
2322       ** the call for every page that comes in for re-initing. */
2323       btreeInitPage(pPage);
2324     }
2325   }
2326 }
2327 
2328 /*
2329 ** Invoke the busy handler for a btree.
2330 */
2331 static int btreeInvokeBusyHandler(void *pArg){
2332   BtShared *pBt = (BtShared*)pArg;
2333   assert( pBt->db );
2334   assert( sqlite3_mutex_held(pBt->db->mutex) );
2335   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
2336 }
2337 
2338 /*
2339 ** Open a database file.
2340 **
2341 ** zFilename is the name of the database file.  If zFilename is NULL
2342 ** then an ephemeral database is created.  The ephemeral database might
2343 ** be exclusively in memory, or it might use a disk-based memory cache.
2344 ** Either way, the ephemeral database will be automatically deleted
2345 ** when sqlite3BtreeClose() is called.
2346 **
2347 ** If zFilename is ":memory:" then an in-memory database is created
2348 ** that is automatically destroyed when it is closed.
2349 **
2350 ** The "flags" parameter is a bitmask that might contain bits like
2351 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
2352 **
2353 ** If the database is already opened in the same database connection
2354 ** and we are in shared cache mode, then the open will fail with an
2355 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
2356 ** objects in the same database connection since doing so will lead
2357 ** to problems with locking.
2358 */
2359 int sqlite3BtreeOpen(
2360   sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
2361   const char *zFilename,  /* Name of the file containing the BTree database */
2362   sqlite3 *db,            /* Associated database handle */
2363   Btree **ppBtree,        /* Pointer to new Btree object written here */
2364   int flags,              /* Options */
2365   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
2366 ){
2367   BtShared *pBt = 0;             /* Shared part of btree structure */
2368   Btree *p;                      /* Handle to return */
2369   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
2370   int rc = SQLITE_OK;            /* Result code from this function */
2371   u8 nReserve;                   /* Byte of unused space on each page */
2372   unsigned char zDbHeader[100];  /* Database header content */
2373 
2374   /* True if opening an ephemeral, temporary database */
2375   const int isTempDb = zFilename==0 || zFilename[0]==0;
2376 
2377   /* Set the variable isMemdb to true for an in-memory database, or
2378   ** false for a file-based database.
2379   */
2380 #ifdef SQLITE_OMIT_MEMORYDB
2381   const int isMemdb = 0;
2382 #else
2383   const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
2384                        || (isTempDb && sqlite3TempInMemory(db))
2385                        || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
2386 #endif
2387 
2388   assert( db!=0 );
2389   assert( pVfs!=0 );
2390   assert( sqlite3_mutex_held(db->mutex) );
2391   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
2392 
2393   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
2394   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
2395 
2396   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
2397   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
2398 
2399   if( isMemdb ){
2400     flags |= BTREE_MEMORY;
2401   }
2402   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
2403     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
2404   }
2405   p = sqlite3MallocZero(sizeof(Btree));
2406   if( !p ){
2407     return SQLITE_NOMEM_BKPT;
2408   }
2409   p->inTrans = TRANS_NONE;
2410   p->db = db;
2411 #ifndef SQLITE_OMIT_SHARED_CACHE
2412   p->lock.pBtree = p;
2413   p->lock.iTable = 1;
2414 #endif
2415 
2416 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2417   /*
2418   ** If this Btree is a candidate for shared cache, try to find an
2419   ** existing BtShared object that we can share with
2420   */
2421   if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
2422     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
2423       int nFilename = sqlite3Strlen30(zFilename)+1;
2424       int nFullPathname = pVfs->mxPathname+1;
2425       char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
2426       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2427 
2428       p->sharable = 1;
2429       if( !zFullPathname ){
2430         sqlite3_free(p);
2431         return SQLITE_NOMEM_BKPT;
2432       }
2433       if( isMemdb ){
2434         memcpy(zFullPathname, zFilename, nFilename);
2435       }else{
2436         rc = sqlite3OsFullPathname(pVfs, zFilename,
2437                                    nFullPathname, zFullPathname);
2438         if( rc ){
2439           if( rc==SQLITE_OK_SYMLINK ){
2440             rc = SQLITE_OK;
2441           }else{
2442             sqlite3_free(zFullPathname);
2443             sqlite3_free(p);
2444             return rc;
2445           }
2446         }
2447       }
2448 #if SQLITE_THREADSAFE
2449       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
2450       sqlite3_mutex_enter(mutexOpen);
2451       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);
2452       sqlite3_mutex_enter(mutexShared);
2453 #endif
2454       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
2455         assert( pBt->nRef>0 );
2456         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
2457                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
2458           int iDb;
2459           for(iDb=db->nDb-1; iDb>=0; iDb--){
2460             Btree *pExisting = db->aDb[iDb].pBt;
2461             if( pExisting && pExisting->pBt==pBt ){
2462               sqlite3_mutex_leave(mutexShared);
2463               sqlite3_mutex_leave(mutexOpen);
2464               sqlite3_free(zFullPathname);
2465               sqlite3_free(p);
2466               return SQLITE_CONSTRAINT;
2467             }
2468           }
2469           p->pBt = pBt;
2470           pBt->nRef++;
2471           break;
2472         }
2473       }
2474       sqlite3_mutex_leave(mutexShared);
2475       sqlite3_free(zFullPathname);
2476     }
2477 #ifdef SQLITE_DEBUG
2478     else{
2479       /* In debug mode, we mark all persistent databases as sharable
2480       ** even when they are not.  This exercises the locking code and
2481       ** gives more opportunity for asserts(sqlite3_mutex_held())
2482       ** statements to find locking problems.
2483       */
2484       p->sharable = 1;
2485     }
2486 #endif
2487   }
2488 #endif
2489   if( pBt==0 ){
2490     /*
2491     ** The following asserts make sure that structures used by the btree are
2492     ** the right size.  This is to guard against size changes that result
2493     ** when compiling on a different architecture.
2494     */
2495     assert( sizeof(i64)==8 );
2496     assert( sizeof(u64)==8 );
2497     assert( sizeof(u32)==4 );
2498     assert( sizeof(u16)==2 );
2499     assert( sizeof(Pgno)==4 );
2500 
2501     pBt = sqlite3MallocZero( sizeof(*pBt) );
2502     if( pBt==0 ){
2503       rc = SQLITE_NOMEM_BKPT;
2504       goto btree_open_out;
2505     }
2506     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
2507                           sizeof(MemPage), flags, vfsFlags, pageReinit);
2508     if( rc==SQLITE_OK ){
2509       sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
2510       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
2511     }
2512     if( rc!=SQLITE_OK ){
2513       goto btree_open_out;
2514     }
2515     pBt->openFlags = (u8)flags;
2516     pBt->db = db;
2517     sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
2518     p->pBt = pBt;
2519 
2520     pBt->pCursor = 0;
2521     pBt->pPage1 = 0;
2522     if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
2523 #if defined(SQLITE_SECURE_DELETE)
2524     pBt->btsFlags |= BTS_SECURE_DELETE;
2525 #elif defined(SQLITE_FAST_SECURE_DELETE)
2526     pBt->btsFlags |= BTS_OVERWRITE;
2527 #endif
2528     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2529     ** determined by the 2-byte integer located at an offset of 16 bytes from
2530     ** the beginning of the database file. */
2531     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
2532     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
2533          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
2534       pBt->pageSize = 0;
2535 #ifndef SQLITE_OMIT_AUTOVACUUM
2536       /* If the magic name ":memory:" will create an in-memory database, then
2537       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
2538       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
2539       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
2540       ** regular file-name. In this case the auto-vacuum applies as per normal.
2541       */
2542       if( zFilename && !isMemdb ){
2543         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
2544         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
2545       }
2546 #endif
2547       nReserve = 0;
2548     }else{
2549       /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2550       ** determined by the one-byte unsigned integer found at an offset of 20
2551       ** into the database file header. */
2552       nReserve = zDbHeader[20];
2553       pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2554 #ifndef SQLITE_OMIT_AUTOVACUUM
2555       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
2556       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
2557 #endif
2558     }
2559     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2560     if( rc ) goto btree_open_out;
2561     pBt->usableSize = pBt->pageSize - nReserve;
2562     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
2563 
2564 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2565     /* Add the new BtShared object to the linked list sharable BtShareds.
2566     */
2567     pBt->nRef = 1;
2568     if( p->sharable ){
2569       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2570       MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);)
2571       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
2572         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
2573         if( pBt->mutex==0 ){
2574           rc = SQLITE_NOMEM_BKPT;
2575           goto btree_open_out;
2576         }
2577       }
2578       sqlite3_mutex_enter(mutexShared);
2579       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
2580       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
2581       sqlite3_mutex_leave(mutexShared);
2582     }
2583 #endif
2584   }
2585 
2586 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2587   /* If the new Btree uses a sharable pBtShared, then link the new
2588   ** Btree into the list of all sharable Btrees for the same connection.
2589   ** The list is kept in ascending order by pBt address.
2590   */
2591   if( p->sharable ){
2592     int i;
2593     Btree *pSib;
2594     for(i=0; i<db->nDb; i++){
2595       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
2596         while( pSib->pPrev ){ pSib = pSib->pPrev; }
2597         if( (uptr)p->pBt<(uptr)pSib->pBt ){
2598           p->pNext = pSib;
2599           p->pPrev = 0;
2600           pSib->pPrev = p;
2601         }else{
2602           while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
2603             pSib = pSib->pNext;
2604           }
2605           p->pNext = pSib->pNext;
2606           p->pPrev = pSib;
2607           if( p->pNext ){
2608             p->pNext->pPrev = p;
2609           }
2610           pSib->pNext = p;
2611         }
2612         break;
2613       }
2614     }
2615   }
2616 #endif
2617   *ppBtree = p;
2618 
2619 btree_open_out:
2620   if( rc!=SQLITE_OK ){
2621     if( pBt && pBt->pPager ){
2622       sqlite3PagerClose(pBt->pPager, 0);
2623     }
2624     sqlite3_free(pBt);
2625     sqlite3_free(p);
2626     *ppBtree = 0;
2627   }else{
2628     sqlite3_file *pFile;
2629 
2630     /* If the B-Tree was successfully opened, set the pager-cache size to the
2631     ** default value. Except, when opening on an existing shared pager-cache,
2632     ** do not change the pager-cache size.
2633     */
2634     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
2635       sqlite3BtreeSetCacheSize(p, SQLITE_DEFAULT_CACHE_SIZE);
2636     }
2637 
2638     pFile = sqlite3PagerFile(pBt->pPager);
2639     if( pFile->pMethods ){
2640       sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
2641     }
2642   }
2643   if( mutexOpen ){
2644     assert( sqlite3_mutex_held(mutexOpen) );
2645     sqlite3_mutex_leave(mutexOpen);
2646   }
2647   assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
2648   return rc;
2649 }
2650 
2651 /*
2652 ** Decrement the BtShared.nRef counter.  When it reaches zero,
2653 ** remove the BtShared structure from the sharing list.  Return
2654 ** true if the BtShared.nRef counter reaches zero and return
2655 ** false if it is still positive.
2656 */
2657 static int removeFromSharingList(BtShared *pBt){
2658 #ifndef SQLITE_OMIT_SHARED_CACHE
2659   MUTEX_LOGIC( sqlite3_mutex *pMainMtx; )
2660   BtShared *pList;
2661   int removed = 0;
2662 
2663   assert( sqlite3_mutex_notheld(pBt->mutex) );
2664   MUTEX_LOGIC( pMainMtx = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); )
2665   sqlite3_mutex_enter(pMainMtx);
2666   pBt->nRef--;
2667   if( pBt->nRef<=0 ){
2668     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
2669       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
2670     }else{
2671       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
2672       while( ALWAYS(pList) && pList->pNext!=pBt ){
2673         pList=pList->pNext;
2674       }
2675       if( ALWAYS(pList) ){
2676         pList->pNext = pBt->pNext;
2677       }
2678     }
2679     if( SQLITE_THREADSAFE ){
2680       sqlite3_mutex_free(pBt->mutex);
2681     }
2682     removed = 1;
2683   }
2684   sqlite3_mutex_leave(pMainMtx);
2685   return removed;
2686 #else
2687   return 1;
2688 #endif
2689 }
2690 
2691 /*
2692 ** Make sure pBt->pTmpSpace points to an allocation of
2693 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
2694 ** pointer.
2695 */
2696 static void allocateTempSpace(BtShared *pBt){
2697   if( !pBt->pTmpSpace ){
2698     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2699 
2700     /* One of the uses of pBt->pTmpSpace is to format cells before
2701     ** inserting them into a leaf page (function fillInCell()). If
2702     ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
2703     ** by the various routines that manipulate binary cells. Which
2704     ** can mean that fillInCell() only initializes the first 2 or 3
2705     ** bytes of pTmpSpace, but that the first 4 bytes are copied from
2706     ** it into a database page. This is not actually a problem, but it
2707     ** does cause a valgrind error when the 1 or 2 bytes of unitialized
2708     ** data is passed to system call write(). So to avoid this error,
2709     ** zero the first 4 bytes of temp space here.
2710     **
2711     ** Also:  Provide four bytes of initialized space before the
2712     ** beginning of pTmpSpace as an area available to prepend the
2713     ** left-child pointer to the beginning of a cell.
2714     */
2715     if( pBt->pTmpSpace ){
2716       memset(pBt->pTmpSpace, 0, 8);
2717       pBt->pTmpSpace += 4;
2718     }
2719   }
2720 }
2721 
2722 /*
2723 ** Free the pBt->pTmpSpace allocation
2724 */
2725 static void freeTempSpace(BtShared *pBt){
2726   if( pBt->pTmpSpace ){
2727     pBt->pTmpSpace -= 4;
2728     sqlite3PageFree(pBt->pTmpSpace);
2729     pBt->pTmpSpace = 0;
2730   }
2731 }
2732 
2733 /*
2734 ** Close an open database and invalidate all cursors.
2735 */
2736 int sqlite3BtreeClose(Btree *p){
2737   BtShared *pBt = p->pBt;
2738 
2739   /* Close all cursors opened via this handle.  */
2740   assert( sqlite3_mutex_held(p->db->mutex) );
2741   sqlite3BtreeEnter(p);
2742 
2743   /* Verify that no other cursors have this Btree open */
2744 #ifdef SQLITE_DEBUG
2745   {
2746     BtCursor *pCur = pBt->pCursor;
2747     while( pCur ){
2748       BtCursor *pTmp = pCur;
2749       pCur = pCur->pNext;
2750       assert( pTmp->pBtree!=p );
2751 
2752     }
2753   }
2754 #endif
2755 
2756   /* Rollback any active transaction and free the handle structure.
2757   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2758   ** this handle.
2759   */
2760   sqlite3BtreeRollback(p, SQLITE_OK, 0);
2761   sqlite3BtreeLeave(p);
2762 
2763   /* If there are still other outstanding references to the shared-btree
2764   ** structure, return now. The remainder of this procedure cleans
2765   ** up the shared-btree.
2766   */
2767   assert( p->wantToLock==0 && p->locked==0 );
2768   if( !p->sharable || removeFromSharingList(pBt) ){
2769     /* The pBt is no longer on the sharing list, so we can access
2770     ** it without having to hold the mutex.
2771     **
2772     ** Clean out and delete the BtShared object.
2773     */
2774     assert( !pBt->pCursor );
2775     sqlite3PagerClose(pBt->pPager, p->db);
2776     if( pBt->xFreeSchema && pBt->pSchema ){
2777       pBt->xFreeSchema(pBt->pSchema);
2778     }
2779     sqlite3DbFree(0, pBt->pSchema);
2780     freeTempSpace(pBt);
2781     sqlite3_free(pBt);
2782   }
2783 
2784 #ifndef SQLITE_OMIT_SHARED_CACHE
2785   assert( p->wantToLock==0 );
2786   assert( p->locked==0 );
2787   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2788   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2789 #endif
2790 
2791   sqlite3_free(p);
2792   return SQLITE_OK;
2793 }
2794 
2795 /*
2796 ** Change the "soft" limit on the number of pages in the cache.
2797 ** Unused and unmodified pages will be recycled when the number of
2798 ** pages in the cache exceeds this soft limit.  But the size of the
2799 ** cache is allowed to grow larger than this limit if it contains
2800 ** dirty pages or pages still in active use.
2801 */
2802 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2803   BtShared *pBt = p->pBt;
2804   assert( sqlite3_mutex_held(p->db->mutex) );
2805   sqlite3BtreeEnter(p);
2806   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2807   sqlite3BtreeLeave(p);
2808   return SQLITE_OK;
2809 }
2810 
2811 /*
2812 ** Change the "spill" limit on the number of pages in the cache.
2813 ** If the number of pages exceeds this limit during a write transaction,
2814 ** the pager might attempt to "spill" pages to the journal early in
2815 ** order to free up memory.
2816 **
2817 ** The value returned is the current spill size.  If zero is passed
2818 ** as an argument, no changes are made to the spill size setting, so
2819 ** using mxPage of 0 is a way to query the current spill size.
2820 */
2821 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
2822   BtShared *pBt = p->pBt;
2823   int res;
2824   assert( sqlite3_mutex_held(p->db->mutex) );
2825   sqlite3BtreeEnter(p);
2826   res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
2827   sqlite3BtreeLeave(p);
2828   return res;
2829 }
2830 
#if SQLITE_MAX_MMAP_SIZE>0
/*
** Set the limit on how much of the database file may be memory mapped
** by passing szMmap through to sqlite3PagerSetMmapLimit().
**
** The caller must hold p->db->mutex.  Always returns SQLITE_OK.
*/
int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
  assert( sqlite3_mutex_held(p->db->mutex) );
  sqlite3BtreeEnter(p);
  sqlite3PagerSetMmapLimit(p->pBt->pPager, szMmap);
  sqlite3BtreeLeave(p);
  return SQLITE_OK;
}
#endif /* SQLITE_MAX_MMAP_SIZE>0 */
2845 
2846 /*
2847 ** Change the way data is synced to disk in order to increase or decrease
2848 ** how well the database resists damage due to OS crashes and power
2849 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2850 ** there is a high probability of damage)  Level 2 is the default.  There
2851 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2852 ** probability of damage to near zero but with a write performance reduction.
2853 */
2854 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2855 int sqlite3BtreeSetPagerFlags(
2856   Btree *p,              /* The btree to set the safety level on */
2857   unsigned pgFlags       /* Various PAGER_* flags */
2858 ){
2859   BtShared *pBt = p->pBt;
2860   assert( sqlite3_mutex_held(p->db->mutex) );
2861   sqlite3BtreeEnter(p);
2862   sqlite3PagerSetFlags(pBt->pPager, pgFlags);
2863   sqlite3BtreeLeave(p);
2864   return SQLITE_OK;
2865 }
2866 #endif
2867 
2868 /*
2869 ** Change the default pages size and the number of reserved bytes per page.
2870 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2871 ** without changing anything.
2872 **
2873 ** The page size must be a power of 2 between 512 and 65536.  If the page
2874 ** size supplied does not meet this constraint then the page size is not
2875 ** changed.
2876 **
2877 ** Page sizes are constrained to be a power of two so that the region
2878 ** of the database file used for locking (beginning at PENDING_BYTE,
2879 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2880 ** at the beginning of a page.
2881 **
2882 ** If parameter nReserve is less than zero, then the number of reserved
2883 ** bytes per page is left unchanged.
2884 **
2885 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2886 ** and autovacuum mode can no longer be changed.
2887 */
2888 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2889   int rc = SQLITE_OK;
2890   int x;
2891   BtShared *pBt = p->pBt;
2892   assert( nReserve>=0 && nReserve<=255 );
2893   sqlite3BtreeEnter(p);
2894   pBt->nReserveWanted = nReserve;
2895   x = pBt->pageSize - pBt->usableSize;
2896   if( nReserve<x ) nReserve = x;
2897   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2898     sqlite3BtreeLeave(p);
2899     return SQLITE_READONLY;
2900   }
2901   assert( nReserve>=0 && nReserve<=255 );
2902   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2903         ((pageSize-1)&pageSize)==0 ){
2904     assert( (pageSize & 7)==0 );
2905     assert( !pBt->pCursor );
2906     if( nReserve>32 && pageSize==512 ) pageSize = 1024;
2907     pBt->pageSize = (u32)pageSize;
2908     freeTempSpace(pBt);
2909   }
2910   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2911   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2912   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2913   sqlite3BtreeLeave(p);
2914   return rc;
2915 }
2916 
2917 /*
2918 ** Return the currently defined page size
2919 */
2920 int sqlite3BtreeGetPageSize(Btree *p){
2921   return p->pBt->pageSize;
2922 }
2923 
2924 /*
2925 ** This function is similar to sqlite3BtreeGetReserve(), except that it
2926 ** may only be called if it is guaranteed that the b-tree mutex is already
2927 ** held.
2928 **
2929 ** This is useful in one special case in the backup API code where it is
2930 ** known that the shared b-tree mutex is held, but the mutex on the
2931 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
2932 ** were to be called, it might collide with some other operation on the
2933 ** database handle that owns *p, causing undefined behavior.
2934 */
2935 int sqlite3BtreeGetReserveNoMutex(Btree *p){
2936   int n;
2937   assert( sqlite3_mutex_held(p->pBt->mutex) );
2938   n = p->pBt->pageSize - p->pBt->usableSize;
2939   return n;
2940 }
2941 
2942 /*
2943 ** Return the number of bytes of space at the end of every page that
2944 ** are intentually left unused.  This is the "reserved" space that is
2945 ** sometimes used by extensions.
2946 **
2947 ** The value returned is the larger of the current reserve size and
2948 ** the latest reserve size requested by SQLITE_FILECTRL_RESERVE_BYTES.
2949 ** The amount of reserve can only grow - never shrink.
2950 */
2951 int sqlite3BtreeGetRequestedReserve(Btree *p){
2952   int n1, n2;
2953   sqlite3BtreeEnter(p);
2954   n1 = (int)p->pBt->nReserveWanted;
2955   n2 = sqlite3BtreeGetReserveNoMutex(p);
2956   sqlite3BtreeLeave(p);
2957   return n1>n2 ? n1 : n2;
2958 }
2959 
2960 
2961 /*
2962 ** Set the maximum page count for a database if mxPage is positive.
2963 ** No changes are made if mxPage is 0 or negative.
2964 ** Regardless of the value of mxPage, return the maximum page count.
2965 */
2966 Pgno sqlite3BtreeMaxPageCount(Btree *p, Pgno mxPage){
2967   Pgno n;
2968   sqlite3BtreeEnter(p);
2969   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2970   sqlite3BtreeLeave(p);
2971   return n;
2972 }
2973 
2974 /*
2975 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags:
2976 **
2977 **    newFlag==0       Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared
2978 **    newFlag==1       BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared
2979 **    newFlag==2       BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set
2980 **    newFlag==(-1)    No changes
2981 **
2982 ** This routine acts as a query if newFlag is less than zero
2983 **
2984 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but
2985 ** freelist leaf pages are not written back to the database.  Thus in-page
2986 ** deleted content is cleared, but freelist deleted content is not.
2987 **
2988 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition
2989 ** that freelist leaf pages are written back into the database, increasing
2990 ** the amount of disk I/O.
2991 */
2992 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2993   int b;
2994   if( p==0 ) return 0;
2995   sqlite3BtreeEnter(p);
2996   assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 );
2997   assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) );
2998   if( newFlag>=0 ){
2999     p->pBt->btsFlags &= ~BTS_FAST_SECURE;
3000     p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag;
3001   }
3002   b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE;
3003   sqlite3BtreeLeave(p);
3004   return b;
3005 }
3006 
3007 /*
3008 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
3009 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
3010 ** is disabled. The default value for the auto-vacuum property is
3011 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
3012 */
3013 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
3014 #ifdef SQLITE_OMIT_AUTOVACUUM
3015   return SQLITE_READONLY;
3016 #else
3017   BtShared *pBt = p->pBt;
3018   int rc = SQLITE_OK;
3019   u8 av = (u8)autoVacuum;
3020 
3021   sqlite3BtreeEnter(p);
3022   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
3023     rc = SQLITE_READONLY;
3024   }else{
3025     pBt->autoVacuum = av ?1:0;
3026     pBt->incrVacuum = av==2 ?1:0;
3027   }
3028   sqlite3BtreeLeave(p);
3029   return rc;
3030 #endif
3031 }
3032 
3033 /*
3034 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
3035 ** enabled 1 is returned. Otherwise 0.
3036 */
3037 int sqlite3BtreeGetAutoVacuum(Btree *p){
3038 #ifdef SQLITE_OMIT_AUTOVACUUM
3039   return BTREE_AUTOVACUUM_NONE;
3040 #else
3041   int rc;
3042   sqlite3BtreeEnter(p);
3043   rc = (
3044     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
3045     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
3046     BTREE_AUTOVACUUM_INCR
3047   );
3048   sqlite3BtreeLeave(p);
3049   return rc;
3050 #endif
3051 }
3052 
/*
** Apply safety_level as the default synchronous setting for the
** database backed by pBt, unless the user has already chosen one with
** "PRAGMA synchronous" (Db.bSyncSet) or the level is already in
** effect.  The entry at index 1 of db->aDb is never modified
** (NOTE(review): presumably the TEMP database - confirm).
**
** Nothing happens if pBt->db or db->aDb is NULL.  The search through
** db->aDb has no upper bound, so it relies on pBt being attached to
** one of the entries.
**
** This routine only exists when the rollback-mode and WAL-mode default
** synchronous levels differ and WAL is not omitted.  Otherwise it is
** a no-op macro.
*/
#if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \
    && !defined(SQLITE_OMIT_WAL)
static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){
  sqlite3 *db = pBt->db;
  Db *pDb;

  if( db==0 ) return;
  pDb = db->aDb;
  if( pDb==0 ) return;

  /* Locate the Db entry whose Btree handle refers to pBt */
  while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){
    pDb++;
  }

  if( pDb->bSyncSet!=0 ) return;
  if( pDb->safety_level==safety_level ) return;
  if( pDb==&db->aDb[1] ) return;

  pDb->safety_level = safety_level;
  sqlite3PagerSetFlags(pBt->pPager,
      pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
}
#else
# define setDefaultSyncFlag(pBt,safety_level)
#endif
3079 
3080 /* Forward declaration */
3081 static int newDatabase(BtShared*);
3082 
3083 
/*
** Get a reference to pPage1 of the database file.  This will
** also acquire a readlock on that file.
**
** On entry the caller must hold pBt->mutex and pBt->pPage1 must be 0.
**
** Return values:
**
**   SQLITE_OK       Normally this means pBt->pPage1 and pBt->nPage have
**                   been set and the payload limits (maxLocal, minLocal,
**                   maxLeaf, minLeaf, max1bytePayload) recomputed.
**                   SQLITE_OK can also be returned with pBt->pPage1
**                   still 0, either because the WAL was only just
**                   opened or because the page size recorded in the
**                   header differs from pBt->pageSize.  In both cases
**                   the caller is expected to call this routine again.
**
**   SQLITE_NOTADB   The header of page 1 failed one of the format checks.
**
**   SQLITE_CORRUPT  The header claims more pages than the file holds
**                   and sqlite3WritableSchema() reports zero.
**
** Any error returned by sqlite3PagerSharedLock(), btreeGetPage(),
** sqlite3PagerOpenWal() or sqlite3PagerSetPagesize() is passed back to
** the caller unchanged (for example SQLITE_BUSY if the database is
** locked or SQLITE_NOMEM if we run out of memory).
*/
static int lockBtree(BtShared *pBt){
  int rc;              /* Result code from subfunctions */
  MemPage *pPage1;     /* Page 1 of the database file */
  u32 nPage;           /* Number of pages in the database */
  u32 nPageFile = 0;   /* Number of pages in the database file */

  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( pBt->pPage1==0 );
  rc = sqlite3PagerSharedLock(pBt->pPager);
  if( rc!=SQLITE_OK ) return rc;
  rc = btreeGetPage(pBt, 1, &pPage1, 0);
  if( rc!=SQLITE_OK ) return rc;

  /* Do some checking to help ensure the file we opened really is
  ** a valid database file.
  **
  ** The page count stored at offset 28 of the header is used unless it
  ** is zero or the 4 bytes at offset 24 differ from the 4 bytes at
  ** offset 92, in which case the page count reported by the pager is
  ** used instead.  (NOTE(review): offsets 24 and 92 are presumably the
  ** file change counter and the version-valid-for number - confirm
  ** against the file format description.)
  */
  nPage = get4byte(28+(u8*)pPage1->aData);
  sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile);
  if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
    nPage = nPageFile;
  }
  /* With SQLITE_ResetDatabase set the file is treated as empty, which
  ** skips all of the header validation below. */
  if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){
    nPage = 0;
  }
  if( nPage>0 ){
    u32 pageSize;
    u32 usableSize;
    u8 *page1 = pPage1->aData;
    /* Default result for every "goto page1_init_failed" below that does
    ** not set rc explicitly. */
    rc = SQLITE_NOTADB;
    /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
    ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
    ** 61 74 20 33 00. */
    if( memcmp(page1, zMagicHeader, 16)!=0 ){
      goto page1_init_failed;
    }

    /* Bytes 18 and 19 of the header are version numbers.  If byte 18 is
    ** larger than this build supports the database is flagged read-only.
    ** If byte 19 is too large the file is rejected. */
#ifdef SQLITE_OMIT_WAL
    if( page1[18]>1 ){
      pBt->btsFlags |= BTS_READ_ONLY;
    }
    if( page1[19]>1 ){
      goto page1_init_failed;
    }
#else
    if( page1[18]>2 ){
      pBt->btsFlags |= BTS_READ_ONLY;
    }
    if( page1[19]>2 ){
      goto page1_init_failed;
    }

    /* If the read version is set to 2, this database should be accessed
    ** in WAL mode. If the log is not already open, open it now. Then
    ** return SQLITE_OK and return without populating BtShared.pPage1.
    ** The caller detects this and calls this function again. This is
    ** required as the version of page 1 currently in the page1 buffer
    ** may not be the latest version - there may be a newer one in the log
    ** file.
    */
    if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
      int isOpen = 0;
      rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
      if( rc!=SQLITE_OK ){
        goto page1_init_failed;
      }else{
        setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1);
        if( isOpen==0 ){
          releasePageOne(pPage1);
          return SQLITE_OK;
        }
      }
      /* rc was overwritten by the call above, so restore the default
      ** failure code for the remaining checks. */
      rc = SQLITE_NOTADB;
    }else{
      setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1);
    }
#endif

    /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
    ** fractions and the leaf payload fraction values must be 64, 32, and 32.
    **
    ** The original design allowed these amounts to vary, but as of
    ** version 3.6.0, we require them to be fixed.
    */
    if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
      goto page1_init_failed;
    }
    /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
    ** determined by the 2-byte integer located at an offset of 16 bytes from
    ** the beginning of the database file.
    **
    ** Byte 16 is the high-order byte.  Byte 17 is shifted left by 16
    ** rather than 0, so the stored 2-byte value 0x0001 decodes to 65536.
    ** Any other non-zero byte 17 produces a value that is rejected by
    ** the checks that follow. */
    pageSize = (page1[16]<<8) | (page1[17]<<16);
    /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
    ** between 512 and 65536 inclusive. */
    if( ((pageSize-1)&pageSize)!=0
     || pageSize>SQLITE_MAX_PAGE_SIZE
     || pageSize<=256
    ){
      goto page1_init_failed;
    }
    pBt->btsFlags |= BTS_PAGESIZE_FIXED;
    assert( (pageSize & 7)==0 );
    /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
    ** integer at offset 20 is the number of bytes of space at the end of
    ** each page to reserve for extensions.
    **
    ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
    ** determined by the one-byte unsigned integer found at an offset of 20
    ** into the database file header. */
    usableSize = pageSize - page1[20];
    if( (u32)pageSize!=pBt->pageSize ){
      /* After reading the first page of the database assuming a page size
      ** of BtShared.pageSize, we have discovered that the page-size is
      ** actually pageSize. Unlock the database, leave pBt->pPage1 at
      ** zero and return SQLITE_OK. The caller will call this function
      ** again with the correct page-size.
      **
      ** The value actually returned is the result of
      ** sqlite3PagerSetPagesize(), so an error from the pager is
      ** reported here rather than SQLITE_OK.
      */
      releasePageOne(pPage1);
      pBt->usableSize = usableSize;
      pBt->pageSize = pageSize;
      freeTempSpace(pBt);
      rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
                                   pageSize-usableSize);
      return rc;
    }
    /* The header must not claim more pages than the file contains,
    ** unless sqlite3WritableSchema() returns non-zero for this
    ** connection. */
    if( sqlite3WritableSchema(pBt->db)==0 && nPage>nPageFile ){
      rc = SQLITE_CORRUPT_BKPT;
      goto page1_init_failed;
    }
    /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
    ** be less than 480. In other words, if the page size is 512, then the
    ** reserved space size cannot exceed 32. */
    if( usableSize<480 ){
      goto page1_init_failed;
    }
    pBt->pageSize = pageSize;
    pBt->usableSize = usableSize;
#ifndef SQLITE_OMIT_AUTOVACUUM
    pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
    pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
#endif
  }

  /* maxLocal is the maximum amount of payload to store locally for
  ** a cell.  Make sure it is small enough so that at least minFanout
  ** cells will fit on one page.  We assume a 10-byte page header.
  ** Besides the payload, the cell must store:
  **     2-byte pointer to the cell
  **     4-byte child pointer
  **     9-byte nKey value
  **     4-byte nData value
  **     4-byte overflow page pointer
  ** So a cell consists of a 2-byte pointer, a header which is as much as
  ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
  ** page pointer.
  */
  pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
  pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
  pBt->maxLeaf = (u16)(pBt->usableSize - 35);
  pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
  /* max1bytePayload is maxLocal capped at 127 */
  if( pBt->maxLocal>127 ){
    pBt->max1bytePayload = 127;
  }else{
    pBt->max1bytePayload = (u8)pBt->maxLocal;
  }
  assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
  pBt->pPage1 = pPage1;
  pBt->nPage = nPage;
  return SQLITE_OK;

  /* Common failure exit: drop the reference to page 1 and report rc */
page1_init_failed:
  releasePageOne(pPage1);
  pBt->pPage1 = 0;
  return rc;
}
3266 
3267 #ifndef NDEBUG
3268 /*
3269 ** Return the number of cursors open on pBt. This is for use
3270 ** in assert() expressions, so it is only compiled if NDEBUG is not
3271 ** defined.
3272 **
3273 ** Only write cursors are counted if wrOnly is true.  If wrOnly is
3274 ** false then all cursors are counted.
3275 **
3276 ** For the purposes of this routine, a cursor is any cursor that
3277 ** is capable of reading or writing to the database.  Cursors that
3278 ** have been tripped into the CURSOR_FAULT state are not counted.
3279 */
3280 static int countValidCursors(BtShared *pBt, int wrOnly){
3281   BtCursor *pCur;
3282   int r = 0;
3283   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3284     if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
3285      && pCur->eState!=CURSOR_FAULT ) r++;
3286   }
3287   return r;
3288 }
3289 #endif
3290 
3291 /*
3292 ** If there are no outstanding cursors and we are not in the middle
3293 ** of a transaction but there is a read lock on the database, then
3294 ** this routine unrefs the first page of the database file which
3295 ** has the effect of releasing the read lock.
3296 **
3297 ** If there is a transaction in progress, this routine is a no-op.
3298 */
3299 static void unlockBtreeIfUnused(BtShared *pBt){
3300   assert( sqlite3_mutex_held(pBt->mutex) );
3301   assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
3302   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
3303     MemPage *pPage1 = pBt->pPage1;
3304     assert( pPage1->aData );
3305     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
3306     pBt->pPage1 = 0;
3307     releasePageOne(pPage1);
3308   }
3309 }
3310 
3311 /*
3312 ** If pBt points to an empty file then convert that empty file
3313 ** into a new empty database by initializing the first page of
3314 ** the database.
3315 */
3316 static int newDatabase(BtShared *pBt){
3317   MemPage *pP1;
3318   unsigned char *data;
3319   int rc;
3320 
3321   assert( sqlite3_mutex_held(pBt->mutex) );
3322   if( pBt->nPage>0 ){
3323     return SQLITE_OK;
3324   }
3325   pP1 = pBt->pPage1;
3326   assert( pP1!=0 );
3327   data = pP1->aData;
3328   rc = sqlite3PagerWrite(pP1->pDbPage);
3329   if( rc ) return rc;
3330   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
3331   assert( sizeof(zMagicHeader)==16 );
3332   data[16] = (u8)((pBt->pageSize>>8)&0xff);
3333   data[17] = (u8)((pBt->pageSize>>16)&0xff);
3334   data[18] = 1;
3335   data[19] = 1;
3336   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
3337   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
3338   data[21] = 64;
3339   data[22] = 32;
3340   data[23] = 32;
3341   memset(&data[24], 0, 100-24);
3342   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
3343   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3344 #ifndef SQLITE_OMIT_AUTOVACUUM
3345   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
3346   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
3347   put4byte(&data[36 + 4*4], pBt->autoVacuum);
3348   put4byte(&data[36 + 7*4], pBt->incrVacuum);
3349 #endif
3350   pBt->nPage = 1;
3351   data[31] = 1;
3352   return SQLITE_OK;
3353 }
3354 
3355 /*
3356 ** Initialize the first page of the database file (creating a database
3357 ** consisting of a single page and no schema objects). Return SQLITE_OK
3358 ** if successful, or an SQLite error code otherwise.
3359 */
3360 int sqlite3BtreeNewDb(Btree *p){
3361   int rc;
3362   sqlite3BtreeEnter(p);
3363   p->pBt->nPage = 0;
3364   rc = newDatabase(p->pBt);
3365   sqlite3BtreeLeave(p);
3366   return rc;
3367 }
3368 
3369 /*
3370 ** Attempt to start a new transaction. A write-transaction
3371 ** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more an exclusive
3373 ** transaction is started, meaning that no other process is allowed
3374 ** to access the database.  A preexisting transaction may not be
3375 ** upgraded to exclusive by calling this routine a second time - the
3376 ** exclusivity flag only works for a new transaction.
3377 **
3378 ** A write-transaction must be started before attempting any
3379 ** changes to the database.  None of the following routines
3380 ** will work unless a transaction is started first:
3381 **
3382 **      sqlite3BtreeCreateTable()
3383 **      sqlite3BtreeCreateIndex()
3384 **      sqlite3BtreeClearTable()
3385 **      sqlite3BtreeDropTable()
3386 **      sqlite3BtreeInsert()
3387 **      sqlite3BtreeDelete()
3388 **      sqlite3BtreeUpdateMeta()
3389 **
3390 ** If an initial attempt to acquire the lock fails because of lock contention
3391 ** and the database was previously unlocked, then invoke the busy handler
3392 ** if there is one.  But if there was previously a read-lock, do not
3393 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
3394 ** returned when there is already a read-lock in order to avoid a deadlock.
3395 **
3396 ** Suppose there are two processes A and B.  A has a read lock and B has
3397 ** a reserved lock.  B tries to promote to exclusive but is blocked because
3398 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
3399 ** One or the other of the two processes must give way or there can be
3400 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
3401 ** when A already has a read lock, we encourage A to give up and let B
3402 ** proceed.
3403 */
int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){
  BtShared *pBt = p->pBt;        /* Shared b-tree state behind handle p */
  Pager *pPager = pBt->pPager;   /* Pager that belongs to pBt */
  int rc = SQLITE_OK;            /* Return code */

  /* Parameter summary, as used by the code below:
  **   wrflag==0       request a read transaction
  **   wrflag!=0       request a write transaction
  **   wrflag>1        request an exclusive write transaction
  **   pSchemaVersion  if not NULL and the call succeeds, receives the
  **                   4-byte value stored at offset 40 of page 1
  */
  sqlite3BtreeEnter(p);
  btreeIntegrity(p);

  /* If the btree is already in a write-transaction, or it
  ** is already in a read-transaction and a read-transaction
  ** is requested, this is a no-op.
  */
  if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
    goto trans_begun;
  }
  assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );

  /* When the connection has the SQLITE_ResetDatabase flag set and the
  ** pager itself is not read-only, clear the b-tree level read-only flag
  ** so that a write transaction can be attempted. */
  if( (p->db->flags & SQLITE_ResetDatabase)
   && sqlite3PagerIsreadonly(pPager)==0
  ){
    pBt->btsFlags &= ~BTS_READ_ONLY;
  }

  /* Write transactions are not possible on a read-only database */
  if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
    rc = SQLITE_READONLY;
    goto trans_begun;
  }

#ifndef SQLITE_OMIT_SHARED_CACHE
  {
    sqlite3 *pBlock = 0;   /* Connection that blocks this request, if any */
    /* If another database handle has already opened a write transaction
    ** on this shared-btree structure and a second write transaction is
    ** requested, return SQLITE_LOCKED.
    */
    if( (wrflag && pBt->inTransaction==TRANS_WRITE)
     || (pBt->btsFlags & BTS_PENDING)!=0
    ){
      pBlock = pBt->pWriter->db;
    }else if( wrflag>1 ){
      /* An exclusive transaction is blocked by any lock that is held by
      ** a different Btree handle on the same shared b-tree. */
      BtLock *pIter;
      for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
        if( pIter->pBtree!=p ){
          pBlock = pIter->pBtree->db;
          break;
        }
      }
    }
    if( pBlock ){
      sqlite3ConnectionBlocked(p->db, pBlock);
      rc = SQLITE_LOCKED_SHAREDCACHE;
      goto trans_begun;
    }
  }
#endif

  /* Any read-only or read-write transaction implies a read-lock on
  ** page 1. So if some other shared-cache client already has a write-lock
  ** on page 1, the transaction cannot be opened. */
  rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK);
  if( SQLITE_OK!=rc ) goto trans_begun;

  /* Remember whether the database had zero pages when this transaction
  ** was started. */
  pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
  if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;

  /* Lock-acquisition loop.  The body is repeated for as long as the low
  ** 8 bits of rc equal SQLITE_BUSY, no transaction is open on the shared
  ** b-tree, and btreeInvokeBusyHandler() returns non-zero. */
  do {
    sqlite3PagerWalDb(pPager, p->db);

#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
    /* If transitioning from no transaction directly to a write transaction,
    ** block for the WRITER lock first if possible. */
    if( pBt->pPage1==0 && wrflag ){
      assert( pBt->inTransaction==TRANS_NONE );
      rc = sqlite3PagerWalWriteLock(pPager, 1);
      if( rc!=SQLITE_BUSY && rc!=SQLITE_OK ) break;
    }
#endif

    /* Call lockBtree() until either pBt->pPage1 is populated or
    ** lockBtree() returns something other than SQLITE_OK. lockBtree()
    ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
    ** reading page 1 it discovers that the page-size of the database
    ** file is not pBt->pageSize. In this case lockBtree() will update
    ** pBt->pageSize to the page-size of the file on disk.
    */
    while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

    if( rc==SQLITE_OK && wrflag ){
      /* The read-only flag is tested again here, after page 1 has been
      ** loaded.  NOTE(review): presumably lockBtree() can set
      ** BTS_READ_ONLY - confirm against its definition. */
      if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
        rc = SQLITE_READONLY;
      }else{
        rc = sqlite3PagerBegin(pPager, wrflag>1, sqlite3TempInMemory(p->db));
        if( rc==SQLITE_OK ){
          /* Initializes page 1 if the file is empty; otherwise a no-op. */
          rc = newDatabase(pBt);
        }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){
          /* if there was no transaction opened when this function was
          ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error
          ** code to SQLITE_BUSY. */
          rc = SQLITE_BUSY;
        }
      }
    }

    if( rc!=SQLITE_OK ){
      /* The attempt failed.  NOTE(review): the call with a zero second
      ** argument presumably releases any WAL write lock taken above -
      ** confirm.  Page 1 is dropped again if no transaction is open. */
      (void)sqlite3PagerWalWriteLock(pPager, 0);
      unlockBtreeIfUnused(pBt);
    }
  }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
          btreeInvokeBusyHandler(pBt) );
  sqlite3PagerWalDb(pPager, 0);
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
  if( rc==SQLITE_BUSY_TIMEOUT ) rc = SQLITE_BUSY;
#endif

  if( rc==SQLITE_OK ){
    if( p->inTrans==TRANS_NONE ){
      /* This handle had no transaction open: count it as a new
      ** transaction on the shared b-tree and, for a sharable handle,
      ** link its lock on table 1 into the list of locks. */
      pBt->nTransaction++;
#ifndef SQLITE_OMIT_SHARED_CACHE
      if( p->sharable ){
        assert( p->lock.pBtree==p && p->lock.iTable==1 );
        p->lock.eLock = READ_LOCK;
        p->lock.pNext = pBt->pLock;
        pBt->pLock = &p->lock;
      }
#endif
    }
    p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
    /* The shared transaction state is only ever raised here. */
    if( p->inTrans>pBt->inTransaction ){
      pBt->inTransaction = p->inTrans;
    }
    if( wrflag ){
      MemPage *pPage1 = pBt->pPage1;
#ifndef SQLITE_OMIT_SHARED_CACHE
      assert( !pBt->pWriter );
      pBt->pWriter = p;
      pBt->btsFlags &= ~BTS_EXCLUSIVE;
      if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
#endif

      /* If the db-size header field is incorrect (as it may be if an old
      ** client has been writing the database file), update it now. Doing
      ** this sooner rather than later means the database size can safely
      ** be re-read from page 1 if a savepoint or transaction rollback
      ** occurs within the transaction.
      */
      if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
        rc = sqlite3PagerWrite(pPage1->pDbPage);
        if( rc==SQLITE_OK ){
          put4byte(&pPage1->aData[28], pBt->nPage);
        }
      }
    }
  }

  /* Common exit.  This label is also reached directly by the early
  ** "goto" statements above, with rc holding either SQLITE_OK (the
  ** requested transaction was already open) or an error code. */
trans_begun:
  if( rc==SQLITE_OK ){
    if( pSchemaVersion ){
      /* Hand back the 4-byte value stored at offset 40 of page 1. */
      *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]);
    }
    if( wrflag ){
      /* This call makes sure that the pager has the correct number of
      ** open savepoints. If the second parameter is greater than 0 and
      ** the sub-journal is not already open, then it will be opened here.
      */
      rc = sqlite3PagerOpenSavepoint(pPager, p->db->nSavepoint);
    }
  }

  btreeIntegrity(p);
  sqlite3BtreeLeave(p);
  return rc;
}
3576 
3577 #ifndef SQLITE_OMIT_AUTOVACUUM
3578 
3579 /*
3580 ** Set the pointer-map entries for all children of page pPage. Also, if
3581 ** pPage contains cells that point to overflow pages, set the pointer
3582 ** map entries for the overflow pages as well.
3583 */
3584 static int setChildPtrmaps(MemPage *pPage){
3585   int i;                             /* Counter variable */
3586   int nCell;                         /* Number of cells in page pPage */
3587   int rc;                            /* Return code */
3588   BtShared *pBt = pPage->pBt;
3589   Pgno pgno = pPage->pgno;
3590 
3591   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3592   rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3593   if( rc!=SQLITE_OK ) return rc;
3594   nCell = pPage->nCell;
3595 
3596   for(i=0; i<nCell; i++){
3597     u8 *pCell = findCell(pPage, i);
3598 
3599     ptrmapPutOvflPtr(pPage, pPage, pCell, &rc);
3600 
3601     if( !pPage->leaf ){
3602       Pgno childPgno = get4byte(pCell);
3603       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3604     }
3605   }
3606 
3607   if( !pPage->leaf ){
3608     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3609     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3610   }
3611 
3612   return rc;
3613 }
3614 
3615 /*
3616 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
3617 ** that it points to iTo. Parameter eType describes the type of pointer to
3618 ** be modified, as  follows:
3619 **
3620 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
3621 **                   page of pPage.
3622 **
3623 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
3624 **                   page pointed to by one of the cells on pPage.
3625 **
3626 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
3627 **                   overflow page in the list.
3628 */
3629 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
3630   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3631   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
3632   if( eType==PTRMAP_OVERFLOW2 ){
3633     /* The pointer is always the first 4 bytes of the page in this case.  */
3634     if( get4byte(pPage->aData)!=iFrom ){
3635       return SQLITE_CORRUPT_PAGE(pPage);
3636     }
3637     put4byte(pPage->aData, iTo);
3638   }else{
3639     int i;
3640     int nCell;
3641     int rc;
3642 
3643     rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3644     if( rc ) return rc;
3645     nCell = pPage->nCell;
3646 
3647     for(i=0; i<nCell; i++){
3648       u8 *pCell = findCell(pPage, i);
3649       if( eType==PTRMAP_OVERFLOW1 ){
3650         CellInfo info;
3651         pPage->xParseCell(pPage, pCell, &info);
3652         if( info.nLocal<info.nPayload ){
3653           if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
3654             return SQLITE_CORRUPT_PAGE(pPage);
3655           }
3656           if( iFrom==get4byte(pCell+info.nSize-4) ){
3657             put4byte(pCell+info.nSize-4, iTo);
3658             break;
3659           }
3660         }
3661       }else{
3662         if( get4byte(pCell)==iFrom ){
3663           put4byte(pCell, iTo);
3664           break;
3665         }
3666       }
3667     }
3668 
3669     if( i==nCell ){
3670       if( eType!=PTRMAP_BTREE ||
3671           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
3672         return SQLITE_CORRUPT_PAGE(pPage);
3673       }
3674       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
3675     }
3676   }
3677   return SQLITE_OK;
3678 }
3679 
3680 
3681 /*
3682 ** Move the open database page pDbPage to location iFreePage in the
3683 ** database. The pDbPage reference remains valid.
3684 **
3685 ** The isCommit flag indicates that there is no need to remember that
3686 ** the journal needs to be sync()ed before database page pDbPage->pgno
3687 ** can be written to. The caller has already promised not to write to that
3688 ** page.
3689 */
3690 static int relocatePage(
3691   BtShared *pBt,           /* Btree */
3692   MemPage *pDbPage,        /* Open page to move */
3693   u8 eType,                /* Pointer map 'type' entry for pDbPage */
3694   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
3695   Pgno iFreePage,          /* The location to move pDbPage to */
3696   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
3697 ){
3698   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
3699   Pgno iDbPage = pDbPage->pgno;
3700   Pager *pPager = pBt->pPager;
3701   int rc;
3702 
3703   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
3704       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
3705   assert( sqlite3_mutex_held(pBt->mutex) );
3706   assert( pDbPage->pBt==pBt );
3707   if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT;
3708 
3709   /* Move page iDbPage from its current location to page number iFreePage */
3710   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
3711       iDbPage, iFreePage, iPtrPage, eType));
3712   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
3713   if( rc!=SQLITE_OK ){
3714     return rc;
3715   }
3716   pDbPage->pgno = iFreePage;
3717 
3718   /* If pDbPage was a btree-page, then it may have child pages and/or cells
3719   ** that point to overflow pages. The pointer map entries for all these
3720   ** pages need to be changed.
3721   **
3722   ** If pDbPage is an overflow page, then the first 4 bytes may store a
3723   ** pointer to a subsequent overflow page. If this is the case, then
3724   ** the pointer map needs to be updated for the subsequent overflow page.
3725   */
3726   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
3727     rc = setChildPtrmaps(pDbPage);
3728     if( rc!=SQLITE_OK ){
3729       return rc;
3730     }
3731   }else{
3732     Pgno nextOvfl = get4byte(pDbPage->aData);
3733     if( nextOvfl!=0 ){
3734       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
3735       if( rc!=SQLITE_OK ){
3736         return rc;
3737       }
3738     }
3739   }
3740 
3741   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
3742   ** that it points at iFreePage. Also fix the pointer map entry for
3743   ** iPtrPage.
3744   */
3745   if( eType!=PTRMAP_ROOTPAGE ){
3746     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
3747     if( rc!=SQLITE_OK ){
3748       return rc;
3749     }
3750     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
3751     if( rc!=SQLITE_OK ){
3752       releasePage(pPtrPage);
3753       return rc;
3754     }
3755     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
3756     releasePage(pPtrPage);
3757     if( rc==SQLITE_OK ){
3758       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
3759     }
3760   }
3761   return rc;
3762 }
3763 
3764 /* Forward declaration required by incrVacuumStep(). */
3765 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
3766 
3767 /*
3768 ** Perform a single step of an incremental-vacuum. If successful, return
3769 ** SQLITE_OK. If there is no work to do (and therefore no point in
3770 ** calling this function again), return SQLITE_DONE. Or, if an error
3771 ** occurs, return some other error code.
3772 **
3773 ** More specifically, this function attempts to re-organize the database so
3774 ** that the last page of the file currently in use is no longer in use.
3775 **
3776 ** Parameter nFin is the number of pages that this database would contain
3777 ** were this function called until it returns SQLITE_DONE.
3778 **
3779 ** If the bCommit parameter is non-zero, this function assumes that the
3780 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
3781 ** or an error. bCommit is passed true for an auto-vacuum-on-commit
3782 ** operation, or false for an incremental vacuum.
3783 */
3784 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
3785   Pgno nFreeList;           /* Number of pages still on the free-list */
3786   int rc;
3787 
3788   assert( sqlite3_mutex_held(pBt->mutex) );
3789   assert( iLastPg>nFin );
3790 
3791   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
3792     u8 eType;
3793     Pgno iPtrPage;
3794 
3795     nFreeList = get4byte(&pBt->pPage1->aData[36]);
3796     if( nFreeList==0 ){
3797       return SQLITE_DONE;
3798     }
3799 
3800     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
3801     if( rc!=SQLITE_OK ){
3802       return rc;
3803     }
3804     if( eType==PTRMAP_ROOTPAGE ){
3805       return SQLITE_CORRUPT_BKPT;
3806     }
3807 
3808     if( eType==PTRMAP_FREEPAGE ){
3809       if( bCommit==0 ){
3810         /* Remove the page from the files free-list. This is not required
3811         ** if bCommit is non-zero. In that case, the free-list will be
3812         ** truncated to zero after this function returns, so it doesn't
3813         ** matter if it still contains some garbage entries.
3814         */
3815         Pgno iFreePg;
3816         MemPage *pFreePg;
3817         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
3818         if( rc!=SQLITE_OK ){
3819           return rc;
3820         }
3821         assert( iFreePg==iLastPg );
3822         releasePage(pFreePg);
3823       }
3824     } else {
3825       Pgno iFreePg;             /* Index of free page to move pLastPg to */
3826       MemPage *pLastPg;
3827       u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
3828       Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
3829 
3830       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
3831       if( rc!=SQLITE_OK ){
3832         return rc;
3833       }
3834 
3835       /* If bCommit is zero, this loop runs exactly once and page pLastPg
3836       ** is swapped with the first free page pulled off the free list.
3837       **
3838       ** On the other hand, if bCommit is greater than zero, then keep
3839       ** looping until a free-page located within the first nFin pages
3840       ** of the file is found.
3841       */
3842       if( bCommit==0 ){
3843         eMode = BTALLOC_LE;
3844         iNear = nFin;
3845       }
3846       do {
3847         MemPage *pFreePg;
3848         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
3849         if( rc!=SQLITE_OK ){
3850           releasePage(pLastPg);
3851           return rc;
3852         }
3853         releasePage(pFreePg);
3854       }while( bCommit && iFreePg>nFin );
3855       assert( iFreePg<iLastPg );
3856 
3857       rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
3858       releasePage(pLastPg);
3859       if( rc!=SQLITE_OK ){
3860         return rc;
3861       }
3862     }
3863   }
3864 
3865   if( bCommit==0 ){
3866     do {
3867       iLastPg--;
3868     }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
3869     pBt->bDoTruncate = 1;
3870     pBt->nPage = iLastPg;
3871   }
3872   return SQLITE_OK;
3873 }
3874 
3875 /*
3876 ** The database opened by the first argument is an auto-vacuum database
3877 ** nOrig pages in size containing nFree free pages. Return the expected
3878 ** size of the database in pages following an auto-vacuum operation.
3879 */
3880 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
3881   int nEntry;                     /* Number of entries on one ptrmap page */
3882   Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
3883   Pgno nFin;                      /* Return value */
3884 
3885   nEntry = pBt->usableSize/5;
3886   nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3887   nFin = nOrig - nFree - nPtrmap;
3888   if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3889     nFin--;
3890   }
3891   while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3892     nFin--;
3893   }
3894 
3895   return nFin;
3896 }
3897 
3898 /*
3899 ** A write-transaction must be opened before calling this function.
3900 ** It performs a single unit of work towards an incremental vacuum.
3901 **
3902 ** If the incremental vacuum is finished after this function has run,
3903 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3904 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3905 */
3906 int sqlite3BtreeIncrVacuum(Btree *p){
3907   int rc;
3908   BtShared *pBt = p->pBt;
3909 
3910   sqlite3BtreeEnter(p);
3911   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3912   if( !pBt->autoVacuum ){
3913     rc = SQLITE_DONE;
3914   }else{
3915     Pgno nOrig = btreePagecount(pBt);
3916     Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
3917     Pgno nFin = finalDbSize(pBt, nOrig, nFree);
3918 
3919     if( nOrig<nFin || nFree>=nOrig ){
3920       rc = SQLITE_CORRUPT_BKPT;
3921     }else if( nFree>0 ){
3922       rc = saveAllCursors(pBt, 0, 0);
3923       if( rc==SQLITE_OK ){
3924         invalidateAllOverflowCache(pBt);
3925         rc = incrVacuumStep(pBt, nFin, nOrig, 0);
3926       }
3927       if( rc==SQLITE_OK ){
3928         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3929         put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3930       }
3931     }else{
3932       rc = SQLITE_DONE;
3933     }
3934   }
3935   sqlite3BtreeLeave(p);
3936   return rc;
3937 }
3938 
3939 /*
3940 ** This routine is called prior to sqlite3PagerCommit when a transaction
3941 ** is committed for an auto-vacuum database.
3942 **
3943 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3944 ** the database file should be truncated to during the commit process.
3945 ** i.e. the database has been reorganized so that only the first *pnTrunc
3946 ** pages are in use.
3947 */
3948 static int autoVacuumCommit(BtShared *pBt){
3949   int rc = SQLITE_OK;
3950   Pager *pPager = pBt->pPager;
3951   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
3952 
3953   assert( sqlite3_mutex_held(pBt->mutex) );
3954   invalidateAllOverflowCache(pBt);
3955   assert(pBt->autoVacuum);
3956   if( !pBt->incrVacuum ){
3957     Pgno nFin;         /* Number of pages in database after autovacuuming */
3958     Pgno nFree;        /* Number of pages on the freelist initially */
3959     Pgno iFree;        /* The next page to be freed */
3960     Pgno nOrig;        /* Database size before freeing */
3961 
3962     nOrig = btreePagecount(pBt);
3963     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3964       /* It is not possible to create a database for which the final page
3965       ** is either a pointer-map page or the pending-byte page. If one
3966       ** is encountered, this indicates corruption.
3967       */
3968       return SQLITE_CORRUPT_BKPT;
3969     }
3970 
3971     nFree = get4byte(&pBt->pPage1->aData[36]);
3972     nFin = finalDbSize(pBt, nOrig, nFree);
3973     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3974     if( nFin<nOrig ){
3975       rc = saveAllCursors(pBt, 0, 0);
3976     }
3977     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3978       rc = incrVacuumStep(pBt, nFin, iFree, 1);
3979     }
3980     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3981       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3982       put4byte(&pBt->pPage1->aData[32], 0);
3983       put4byte(&pBt->pPage1->aData[36], 0);
3984       put4byte(&pBt->pPage1->aData[28], nFin);
3985       pBt->bDoTruncate = 1;
3986       pBt->nPage = nFin;
3987     }
3988     if( rc!=SQLITE_OK ){
3989       sqlite3PagerRollback(pPager);
3990     }
3991   }
3992 
3993   assert( nRef>=sqlite3PagerRefcount(pPager) );
3994   return rc;
3995 }
3996 
3997 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3998 # define setChildPtrmaps(x) SQLITE_OK
3999 #endif
4000 
4001 /*
4002 ** This routine does the first phase of a two-phase commit.  This routine
4003 ** causes a rollback journal to be created (if it does not already exist)
4004 ** and populated with enough information so that if a power loss occurs
4005 ** the database can be restored to its original state by playing back
4006 ** the journal.  Then the contents of the journal are flushed out to
4007 ** the disk.  After the journal is safely on oxide, the changes to the
4008 ** database are written into the database file and flushed to oxide.
4009 ** At the end of this call, the rollback journal still exists on the
4010 ** disk and we are still holding all locks, so the transaction has not
4011 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
4012 ** commit process.
4013 **
4014 ** This call is a no-op if no write-transaction is currently active on pBt.
4015 **
4016 ** Otherwise, sync the database file for the btree pBt. zSuperJrnl points to
4017 ** the name of a super-journal file that should be written into the
4018 ** individual journal file, or is NULL, indicating no super-journal file
4019 ** (single database transaction).
4020 **
4021 ** When this is called, the super-journal should already have been
4022 ** created, populated with this journal pointer and synced to disk.
4023 **
4024 ** Once this is routine has returned, the only thing required to commit
4025 ** the write-transaction for this database file is to delete the journal.
4026 */
4027 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zSuperJrnl){
4028   int rc = SQLITE_OK;
4029   if( p->inTrans==TRANS_WRITE ){
4030     BtShared *pBt = p->pBt;
4031     sqlite3BtreeEnter(p);
4032 #ifndef SQLITE_OMIT_AUTOVACUUM
4033     if( pBt->autoVacuum ){
4034       rc = autoVacuumCommit(pBt);
4035       if( rc!=SQLITE_OK ){
4036         sqlite3BtreeLeave(p);
4037         return rc;
4038       }
4039     }
4040     if( pBt->bDoTruncate ){
4041       sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
4042     }
4043 #endif
4044     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zSuperJrnl, 0);
4045     sqlite3BtreeLeave(p);
4046   }
4047   return rc;
4048 }
4049 
4050 /*
4051 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
4052 ** at the conclusion of a transaction.
4053 */
4054 static void btreeEndTransaction(Btree *p){
4055   BtShared *pBt = p->pBt;
4056   sqlite3 *db = p->db;
4057   assert( sqlite3BtreeHoldsMutex(p) );
4058 
4059 #ifndef SQLITE_OMIT_AUTOVACUUM
4060   pBt->bDoTruncate = 0;
4061 #endif
4062   if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
4063     /* If there are other active statements that belong to this database
4064     ** handle, downgrade to a read-only transaction. The other statements
4065     ** may still be reading from the database.  */
4066     downgradeAllSharedCacheTableLocks(p);
4067     p->inTrans = TRANS_READ;
4068   }else{
4069     /* If the handle had any kind of transaction open, decrement the
4070     ** transaction count of the shared btree. If the transaction count
4071     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
4072     ** call below will unlock the pager.  */
4073     if( p->inTrans!=TRANS_NONE ){
4074       clearAllSharedCacheTableLocks(p);
4075       pBt->nTransaction--;
4076       if( 0==pBt->nTransaction ){
4077         pBt->inTransaction = TRANS_NONE;
4078       }
4079     }
4080 
4081     /* Set the current transaction state to TRANS_NONE and unlock the
4082     ** pager if this call closed the only read or write transaction.  */
4083     p->inTrans = TRANS_NONE;
4084     unlockBtreeIfUnused(pBt);
4085   }
4086 
4087   btreeIntegrity(p);
4088 }
4089 
4090 /*
4091 ** Commit the transaction currently in progress.
4092 **
4093 ** This routine implements the second phase of a 2-phase commit.  The
4094 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
4095 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
4096 ** routine did all the work of writing information out to disk and flushing the
4097 ** contents so that they are written onto the disk platter.  All this
4098 ** routine has to do is delete or truncate or zero the header in the
4099 ** the rollback journal (which causes the transaction to commit) and
4100 ** drop locks.
4101 **
4102 ** Normally, if an error occurs while the pager layer is attempting to
4103 ** finalize the underlying journal file, this function returns an error and
4104 ** the upper layer will attempt a rollback. However, if the second argument
4105 ** is non-zero then this b-tree transaction is part of a multi-file
4106 ** transaction. In this case, the transaction has already been committed
4107 ** (by deleting a super-journal file) and the caller will ignore this
4108 ** functions return code. So, even if an error occurs in the pager layer,
4109 ** reset the b-tree objects internal state to indicate that the write
4110 ** transaction has been closed. This is quite safe, as the pager will have
4111 ** transitioned to the error state.
4112 **
4113 ** This will release the write lock on the database file.  If there
4114 ** are no active cursors, it also releases the read lock.
4115 */
4116 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
4117 
4118   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
4119   sqlite3BtreeEnter(p);
4120   btreeIntegrity(p);
4121 
4122   /* If the handle has a write-transaction open, commit the shared-btrees
4123   ** transaction and set the shared state to TRANS_READ.
4124   */
4125   if( p->inTrans==TRANS_WRITE ){
4126     int rc;
4127     BtShared *pBt = p->pBt;
4128     assert( pBt->inTransaction==TRANS_WRITE );
4129     assert( pBt->nTransaction>0 );
4130     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
4131     if( rc!=SQLITE_OK && bCleanup==0 ){
4132       sqlite3BtreeLeave(p);
4133       return rc;
4134     }
4135     p->iBDataVersion--;  /* Compensate for pPager->iDataVersion++; */
4136     pBt->inTransaction = TRANS_READ;
4137     btreeClearHasContent(pBt);
4138   }
4139 
4140   btreeEndTransaction(p);
4141   sqlite3BtreeLeave(p);
4142   return SQLITE_OK;
4143 }
4144 
4145 /*
4146 ** Do both phases of a commit.
4147 */
4148 int sqlite3BtreeCommit(Btree *p){
4149   int rc;
4150   sqlite3BtreeEnter(p);
4151   rc = sqlite3BtreeCommitPhaseOne(p, 0);
4152   if( rc==SQLITE_OK ){
4153     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
4154   }
4155   sqlite3BtreeLeave(p);
4156   return rc;
4157 }
4158 
4159 /*
4160 ** This routine sets the state to CURSOR_FAULT and the error
4161 ** code to errCode for every cursor on any BtShared that pBtree
4162 ** references.  Or if the writeOnly flag is set to 1, then only
4163 ** trip write cursors and leave read cursors unchanged.
4164 **
4165 ** Every cursor is a candidate to be tripped, including cursors
4166 ** that belong to other database connections that happen to be
4167 ** sharing the cache with pBtree.
4168 **
4169 ** This routine gets called when a rollback occurs. If the writeOnly
4170 ** flag is true, then only write-cursors need be tripped - read-only
4171 ** cursors save their current positions so that they may continue
4172 ** following the rollback. Or, if writeOnly is false, all cursors are
4173 ** tripped. In general, writeOnly is false if the transaction being
4174 ** rolled back modified the database schema. In this case b-tree root
4175 ** pages may be moved or deleted from the database altogether, making
4176 ** it unsafe for read cursors to continue.
4177 **
4178 ** If the writeOnly flag is true and an error is encountered while
4179 ** saving the current position of a read-only cursor, all cursors,
4180 ** including all read-cursors are tripped.
4181 **
4182 ** SQLITE_OK is returned if successful, or if an error occurs while
4183 ** saving a cursor position, an SQLite error code.
4184 */
4185 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
4186   BtCursor *p;
4187   int rc = SQLITE_OK;
4188 
4189   assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
4190   if( pBtree ){
4191     sqlite3BtreeEnter(pBtree);
4192     for(p=pBtree->pBt->pCursor; p; p=p->pNext){
4193       if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
4194         if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
4195           rc = saveCursorPosition(p);
4196           if( rc!=SQLITE_OK ){
4197             (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
4198             break;
4199           }
4200         }
4201       }else{
4202         sqlite3BtreeClearCursor(p);
4203         p->eState = CURSOR_FAULT;
4204         p->skipNext = errCode;
4205       }
4206       btreeReleaseAllCursorPages(p);
4207     }
4208     sqlite3BtreeLeave(pBtree);
4209   }
4210   return rc;
4211 }
4212 
4213 /*
4214 ** Set the pBt->nPage field correctly, according to the current
4215 ** state of the database.  Assume pBt->pPage1 is valid.
4216 */
4217 static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){
4218   int nPage = get4byte(&pPage1->aData[28]);
4219   testcase( nPage==0 );
4220   if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
4221   testcase( pBt->nPage!=nPage );
4222   pBt->nPage = nPage;
4223 }
4224 
4225 /*
4226 ** Rollback the transaction in progress.
4227 **
4228 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
4229 ** Only write cursors are tripped if writeOnly is true but all cursors are
4230 ** tripped if writeOnly is false.  Any attempt to use
4231 ** a tripped cursor will result in an error.
4232 **
4233 ** This will release the write lock on the database file.  If there
4234 ** are no active cursors, it also releases the read lock.
4235 */
4236 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
4237   int rc;
4238   BtShared *pBt = p->pBt;
4239   MemPage *pPage1;
4240 
4241   assert( writeOnly==1 || writeOnly==0 );
4242   assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
4243   sqlite3BtreeEnter(p);
4244   if( tripCode==SQLITE_OK ){
4245     rc = tripCode = saveAllCursors(pBt, 0, 0);
4246     if( rc ) writeOnly = 0;
4247   }else{
4248     rc = SQLITE_OK;
4249   }
4250   if( tripCode ){
4251     int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
4252     assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
4253     if( rc2!=SQLITE_OK ) rc = rc2;
4254   }
4255   btreeIntegrity(p);
4256 
4257   if( p->inTrans==TRANS_WRITE ){
4258     int rc2;
4259 
4260     assert( TRANS_WRITE==pBt->inTransaction );
4261     rc2 = sqlite3PagerRollback(pBt->pPager);
4262     if( rc2!=SQLITE_OK ){
4263       rc = rc2;
4264     }
4265 
4266     /* The rollback may have destroyed the pPage1->aData value.  So
4267     ** call btreeGetPage() on page 1 again to make
4268     ** sure pPage1->aData is set correctly. */
4269     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
4270       btreeSetNPage(pBt, pPage1);
4271       releasePageOne(pPage1);
4272     }
4273     assert( countValidCursors(pBt, 1)==0 );
4274     pBt->inTransaction = TRANS_READ;
4275     btreeClearHasContent(pBt);
4276   }
4277 
4278   btreeEndTransaction(p);
4279   sqlite3BtreeLeave(p);
4280   return rc;
4281 }
4282 
4283 /*
4284 ** Start a statement subtransaction. The subtransaction can be rolled
4285 ** back independently of the main transaction. You must start a transaction
4286 ** before starting a subtransaction. The subtransaction is ended automatically
4287 ** if the main transaction commits or rolls back.
4288 **
4289 ** Statement subtransactions are used around individual SQL statements
4290 ** that are contained within a BEGIN...COMMIT block.  If a constraint
4291 ** error occurs within the statement, the effect of that one statement
4292 ** can be rolled back without having to rollback the entire transaction.
4293 **
4294 ** A statement sub-transaction is implemented as an anonymous savepoint. The
4295 ** value passed as the second parameter is the total number of savepoints,
4296 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
4297 ** are no active savepoints and no other statement-transactions open,
4298 ** iStatement is 1. This anonymous savepoint can be released or rolled back
4299 ** using the sqlite3BtreeSavepoint() function.
4300 */
4301 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
4302   int rc;
4303   BtShared *pBt = p->pBt;
4304   sqlite3BtreeEnter(p);
4305   assert( p->inTrans==TRANS_WRITE );
4306   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
4307   assert( iStatement>0 );
4308   assert( iStatement>p->db->nSavepoint );
4309   assert( pBt->inTransaction==TRANS_WRITE );
4310   /* At the pager level, a statement transaction is a savepoint with
4311   ** an index greater than all savepoints created explicitly using
4312   ** SQL statements. It is illegal to open, release or rollback any
4313   ** such savepoints while the statement transaction savepoint is active.
4314   */
4315   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
4316   sqlite3BtreeLeave(p);
4317   return rc;
4318 }
4319 
4320 /*
4321 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
4322 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
4323 ** savepoint identified by parameter iSavepoint, depending on the value
4324 ** of op.
4325 **
4326 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
4327 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
4328 ** contents of the entire transaction are rolled back. This is different
4329 ** from a normal transaction rollback, as no locks are released and the
4330 ** transaction remains open.
4331 */
4332 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
4333   int rc = SQLITE_OK;
4334   if( p && p->inTrans==TRANS_WRITE ){
4335     BtShared *pBt = p->pBt;
4336     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
4337     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
4338     sqlite3BtreeEnter(p);
4339     if( op==SAVEPOINT_ROLLBACK ){
4340       rc = saveAllCursors(pBt, 0, 0);
4341     }
4342     if( rc==SQLITE_OK ){
4343       rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
4344     }
4345     if( rc==SQLITE_OK ){
4346       if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
4347         pBt->nPage = 0;
4348       }
4349       rc = newDatabase(pBt);
4350       btreeSetNPage(pBt, pBt->pPage1);
4351 
4352       /* pBt->nPage might be zero if the database was corrupt when
4353       ** the transaction was started. Otherwise, it must be at least 1.  */
4354       assert( CORRUPT_DB || pBt->nPage>0 );
4355     }
4356     sqlite3BtreeLeave(p);
4357   }
4358   return rc;
4359 }
4360 
4361 /*
4362 ** Create a new cursor for the BTree whose root is on the page
4363 ** iTable. If a read-only cursor is requested, it is assumed that
4364 ** the caller already has at least a read-only transaction open
4365 ** on the database already. If a write-cursor is requested, then
4366 ** the caller is assumed to have an open write transaction.
4367 **
4368 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
4369 ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
4370 ** can be used for reading or for writing if other conditions for writing
4371 ** are also met.  These are the conditions that must be met in order
4372 ** for writing to be allowed:
4373 **
4374 ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
4375 **
4376 ** 2:  Other database connections that share the same pager cache
4377 **     but which are not in the READ_UNCOMMITTED state may not have
4378 **     cursors open with wrFlag==0 on the same table.  Otherwise
4379 **     the changes made by this write cursor would be visible to
4380 **     the read cursors in the other database connection.
4381 **
4382 ** 3:  The database must be writable (not on read-only media)
4383 **
4384 ** 4:  There must be an active transaction.
4385 **
4386 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
4387 ** is set.  If FORDELETE is set, that is a hint to the implementation that
4388 ** this cursor will only be used to seek to and delete entries of an index
4389 ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
4390 ** this implementation.  But in a hypothetical alternative storage engine
4391 ** in which index entries are automatically deleted when corresponding table
4392 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
4393 ** operations on this cursor can be no-ops and all READ operations can
4394 ** return a null row (2-bytes: 0x01 0x00).
4395 **
4396 ** No checking is done to make sure that page iTable really is the
4397 ** root page of a b-tree.  If it is not, then the cursor acquired
4398 ** will not work correctly.
4399 **
4400 ** It is assumed that the sqlite3BtreeCursorZero() has been called
4401 ** on pCur to initialize the memory space prior to invoking this routine.
4402 */
4403 static int btreeCursor(
4404   Btree *p,                              /* The btree */
4405   Pgno iTable,                           /* Root page of table to open */
4406   int wrFlag,                            /* 1 to write. 0 read-only */
4407   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
4408   BtCursor *pCur                         /* Space for new cursor */
4409 ){
4410   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
4411   BtCursor *pX;                          /* Looping over other all cursors */
4412 
4413   assert( sqlite3BtreeHoldsMutex(p) );
4414   assert( wrFlag==0
4415        || wrFlag==BTREE_WRCSR
4416        || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE)
4417   );
4418 
4419   /* The following assert statements verify that if this is a sharable
4420   ** b-tree database, the connection is holding the required table locks,
4421   ** and that no other connection has any open cursor that conflicts with
4422   ** this lock.  The iTable<1 term disables the check for corrupt schemas. */
4423   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1))
4424           || iTable<1 );
4425   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
4426 
4427   /* Assert that the caller has opened the required transaction. */
4428   assert( p->inTrans>TRANS_NONE );
4429   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
4430   assert( pBt->pPage1 && pBt->pPage1->aData );
4431   assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
4432 
4433   if( wrFlag ){
4434     allocateTempSpace(pBt);
4435     if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
4436   }
4437   if( iTable<=1 ){
4438     if( iTable<1 ){
4439       return SQLITE_CORRUPT_BKPT;
4440     }else if( btreePagecount(pBt)==0 ){
4441       assert( wrFlag==0 );
4442       iTable = 0;
4443     }
4444   }
4445 
4446   /* Now that no other errors can occur, finish filling in the BtCursor
4447   ** variables and link the cursor into the BtShared list.  */
4448   pCur->pgnoRoot = iTable;
4449   pCur->iPage = -1;
4450   pCur->pKeyInfo = pKeyInfo;
4451   pCur->pBtree = p;
4452   pCur->pBt = pBt;
4453   pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;
4454   pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
4455   /* If there are two or more cursors on the same btree, then all such
4456   ** cursors *must* have the BTCF_Multiple flag set. */
4457   for(pX=pBt->pCursor; pX; pX=pX->pNext){
4458     if( pX->pgnoRoot==iTable ){
4459       pX->curFlags |= BTCF_Multiple;
4460       pCur->curFlags |= BTCF_Multiple;
4461     }
4462   }
4463   pCur->pNext = pBt->pCursor;
4464   pBt->pCursor = pCur;
4465   pCur->eState = CURSOR_INVALID;
4466   return SQLITE_OK;
4467 }
4468 static int btreeCursorWithLock(
4469   Btree *p,                              /* The btree */
4470   Pgno iTable,                           /* Root page of table to open */
4471   int wrFlag,                            /* 1 to write. 0 read-only */
4472   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
4473   BtCursor *pCur                         /* Space for new cursor */
4474 ){
4475   int rc;
4476   sqlite3BtreeEnter(p);
4477   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4478   sqlite3BtreeLeave(p);
4479   return rc;
4480 }
4481 int sqlite3BtreeCursor(
4482   Btree *p,                                   /* The btree */
4483   Pgno iTable,                                /* Root page of table to open */
4484   int wrFlag,                                 /* 1 to write. 0 read-only */
4485   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
4486   BtCursor *pCur                              /* Write new cursor here */
4487 ){
4488   if( p->sharable ){
4489     return btreeCursorWithLock(p, iTable, wrFlag, pKeyInfo, pCur);
4490   }else{
4491     return btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4492   }
4493 }
4494 
4495 /*
4496 ** Return the size of a BtCursor object in bytes.
4497 **
4498 ** This interfaces is needed so that users of cursors can preallocate
4499 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
4500 ** to users so they cannot do the sizeof() themselves - they must call
4501 ** this routine.
4502 */
4503 int sqlite3BtreeCursorSize(void){
4504   return ROUND8(sizeof(BtCursor));
4505 }
4506 
4507 /*
4508 ** Initialize memory that will be converted into a BtCursor object.
4509 **
4510 ** The simple approach here would be to memset() the entire object
4511 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
4512 ** do not need to be zeroed and they are large, so we can save a lot
4513 ** of run-time by skipping the initialization of those elements.
4514 */
4515 void sqlite3BtreeCursorZero(BtCursor *p){
4516   memset(p, 0, offsetof(BtCursor, BTCURSOR_FIRST_UNINIT));
4517 }
4518 
4519 /*
4520 ** Close a cursor.  The read lock on the database file is released
4521 ** when the last cursor is closed.
4522 */
4523 int sqlite3BtreeCloseCursor(BtCursor *pCur){
4524   Btree *pBtree = pCur->pBtree;
4525   if( pBtree ){
4526     BtShared *pBt = pCur->pBt;
4527     sqlite3BtreeEnter(pBtree);
4528     assert( pBt->pCursor!=0 );
4529     if( pBt->pCursor==pCur ){
4530       pBt->pCursor = pCur->pNext;
4531     }else{
4532       BtCursor *pPrev = pBt->pCursor;
4533       do{
4534         if( pPrev->pNext==pCur ){
4535           pPrev->pNext = pCur->pNext;
4536           break;
4537         }
4538         pPrev = pPrev->pNext;
4539       }while( ALWAYS(pPrev) );
4540     }
4541     btreeReleaseAllCursorPages(pCur);
4542     unlockBtreeIfUnused(pBt);
4543     sqlite3_free(pCur->aOverflow);
4544     sqlite3_free(pCur->pKey);
4545     if( (pBt->openFlags & BTREE_SINGLE) && pBt->pCursor==0 ){
4546       /* Since the BtShared is not sharable, there is no need to
4547       ** worry about the missing sqlite3BtreeLeave() call here.  */
4548       assert( pBtree->sharable==0 );
4549       sqlite3BtreeClose(pBtree);
4550     }else{
4551       sqlite3BtreeLeave(pBtree);
4552     }
4553     pCur->pBtree = 0;
4554   }
4555   return SQLITE_OK;
4556 }
4557 
4558 /*
4559 ** Make sure the BtCursor* given in the argument has a valid
4560 ** BtCursor.info structure.  If it is not already valid, call
4561 ** btreeParseCell() to fill it in.
4562 **
4563 ** BtCursor.info is a cache of the information in the current cell.
4564 ** Using this cache reduces the number of calls to btreeParseCell().
4565 */
4566 #ifndef NDEBUG
4567   static int cellInfoEqual(CellInfo *a, CellInfo *b){
4568     if( a->nKey!=b->nKey ) return 0;
4569     if( a->pPayload!=b->pPayload ) return 0;
4570     if( a->nPayload!=b->nPayload ) return 0;
4571     if( a->nLocal!=b->nLocal ) return 0;
4572     if( a->nSize!=b->nSize ) return 0;
4573     return 1;
4574   }
4575   static void assertCellInfo(BtCursor *pCur){
4576     CellInfo info;
4577     memset(&info, 0, sizeof(info));
4578     btreeParseCell(pCur->pPage, pCur->ix, &info);
4579     assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) );
4580   }
4581 #else
4582   #define assertCellInfo(x)
4583 #endif
4584 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
4585   if( pCur->info.nSize==0 ){
4586     pCur->curFlags |= BTCF_ValidNKey;
4587     btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
4588   }else{
4589     assertCellInfo(pCur);
4590   }
4591 }
4592 
4593 #ifndef NDEBUG  /* The next routine used only within assert() statements */
4594 /*
4595 ** Return true if the given BtCursor is valid.  A valid cursor is one
4596 ** that is currently pointing to a row in a (non-empty) table.
4597 ** This is a verification routine is used only within assert() statements.
4598 */
4599 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
4600   return pCur && pCur->eState==CURSOR_VALID;
4601 }
4602 #endif /* NDEBUG */
4603 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
4604   assert( pCur!=0 );
4605   return pCur->eState==CURSOR_VALID;
4606 }
4607 
4608 /*
4609 ** Return the value of the integer key or "rowid" for a table btree.
4610 ** This routine is only valid for a cursor that is pointing into a
4611 ** ordinary table btree.  If the cursor points to an index btree or
4612 ** is invalid, the result of this routine is undefined.
4613 */
4614 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
4615   assert( cursorHoldsMutex(pCur) );
4616   assert( pCur->eState==CURSOR_VALID );
4617   assert( pCur->curIntKey );
4618   getCellInfo(pCur);
4619   return pCur->info.nKey;
4620 }
4621 
4622 /*
4623 ** Pin or unpin a cursor.
4624 */
4625 void sqlite3BtreeCursorPin(BtCursor *pCur){
4626   assert( (pCur->curFlags & BTCF_Pinned)==0 );
4627   pCur->curFlags |= BTCF_Pinned;
4628 }
4629 void sqlite3BtreeCursorUnpin(BtCursor *pCur){
4630   assert( (pCur->curFlags & BTCF_Pinned)!=0 );
4631   pCur->curFlags &= ~BTCF_Pinned;
4632 }
4633 
#ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC
/*
** Return the offset into the database file of the start of the
** payload to which the cursor is pointing.
**
** The result is the file offset of the start of the cursor's page,
** pageSize*(pgno-1), plus the distance of the payload from the start
** of that page's data.
*/
i64 sqlite3BtreeOffset(BtCursor *pCur){
  MemPage *pPg;                  /* Page holding the current entry */
  i64 iPageStart;                /* File offset of the start of pPg */
  i64 iWithinPage;               /* Offset of the payload inside pPg */

  assert( cursorHoldsMutex(pCur) );
  assert( pCur->eState==CURSOR_VALID );
  getCellInfo(pCur);
  pPg = pCur->pPage;
  iPageStart = (i64)pCur->pBt->pageSize*((i64)pPg->pgno - 1);
  iWithinPage = (i64)(pCur->info.pPayload - pPg->aData);
  return iPageStart + iWithinPage;
}
#endif /* SQLITE_ENABLE_OFFSET_SQL_FUNC */
4647 
4648 /*
4649 ** Return the number of bytes of payload for the entry that pCur is
4650 ** currently pointing to.  For table btrees, this will be the amount
4651 ** of data.  For index btrees, this will be the size of the key.
4652 **
4653 ** The caller must guarantee that the cursor is pointing to a non-NULL
4654 ** valid entry.  In other words, the calling procedure must guarantee
4655 ** that the cursor has Cursor.eState==CURSOR_VALID.
4656 */
4657 u32 sqlite3BtreePayloadSize(BtCursor *pCur){
4658   assert( cursorHoldsMutex(pCur) );
4659   assert( pCur->eState==CURSOR_VALID );
4660   getCellInfo(pCur);
4661   return pCur->info.nPayload;
4662 }
4663 
4664 /*
4665 ** Return an upper bound on the size of any record for the table
4666 ** that the cursor is pointing into.
4667 **
4668 ** This is an optimization.  Everything will still work if this
4669 ** routine always returns 2147483647 (which is the largest record
4670 ** that SQLite can handle) or more.  But returning a smaller value might
4671 ** prevent large memory allocations when trying to interpret a
4672 ** corrupt datrabase.
4673 **
4674 ** The current implementation merely returns the size of the underlying
4675 ** database file.
4676 */
4677 sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){
4678   assert( cursorHoldsMutex(pCur) );
4679   assert( pCur->eState==CURSOR_VALID );
4680   return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage;
4681 }
4682 
4683 /*
4684 ** Given the page number of an overflow page in the database (parameter
4685 ** ovfl), this function finds the page number of the next page in the
4686 ** linked list of overflow pages. If possible, it uses the auto-vacuum
4687 ** pointer-map data instead of reading the content of page ovfl to do so.
4688 **
4689 ** If an error occurs an SQLite error code is returned. Otherwise:
4690 **
4691 ** The page number of the next overflow page in the linked list is
4692 ** written to *pPgnoNext. If page ovfl is the last page in its linked
4693 ** list, *pPgnoNext is set to zero.
4694 **
4695 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
4696 ** to page number pOvfl was obtained, then *ppPage is set to point to that
4697 ** reference. It is the responsibility of the caller to call releasePage()
4698 ** on *ppPage to free the reference. In no reference was obtained (because
4699 ** the pointer-map was used to obtain the value for *pPgnoNext), then
4700 ** *ppPage is set to zero.
4701 */
4702 static int getOverflowPage(
4703   BtShared *pBt,               /* The database file */
4704   Pgno ovfl,                   /* Current overflow page number */
4705   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
4706   Pgno *pPgnoNext              /* OUT: Next overflow page number */
4707 ){
4708   Pgno next = 0;
4709   MemPage *pPage = 0;
4710   int rc = SQLITE_OK;
4711 
4712   assert( sqlite3_mutex_held(pBt->mutex) );
4713   assert(pPgnoNext);
4714 
4715 #ifndef SQLITE_OMIT_AUTOVACUUM
4716   /* Try to find the next page in the overflow list using the
4717   ** autovacuum pointer-map pages. Guess that the next page in
4718   ** the overflow list is page number (ovfl+1). If that guess turns
4719   ** out to be wrong, fall back to loading the data of page
4720   ** number ovfl to determine the next page number.
4721   */
4722   if( pBt->autoVacuum ){
4723     Pgno pgno;
4724     Pgno iGuess = ovfl+1;
4725     u8 eType;
4726 
4727     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
4728       iGuess++;
4729     }
4730 
4731     if( iGuess<=btreePagecount(pBt) ){
4732       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
4733       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
4734         next = iGuess;
4735         rc = SQLITE_DONE;
4736       }
4737     }
4738   }
4739 #endif
4740 
4741   assert( next==0 || rc==SQLITE_DONE );
4742   if( rc==SQLITE_OK ){
4743     rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
4744     assert( rc==SQLITE_OK || pPage==0 );
4745     if( rc==SQLITE_OK ){
4746       next = get4byte(pPage->aData);
4747     }
4748   }
4749 
4750   *pPgnoNext = next;
4751   if( ppPage ){
4752     *ppPage = pPage;
4753   }else{
4754     releasePage(pPage);
4755   }
4756   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
4757 }
4758 
4759 /*
4760 ** Copy data from a buffer to a page, or from a page to a buffer.
4761 **
4762 ** pPayload is a pointer to data stored on database page pDbPage.
4763 ** If argument eOp is false, then nByte bytes of data are copied
4764 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
4765 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
4766 ** of data are copied from the buffer pBuf to pPayload.
4767 **
4768 ** SQLITE_OK is returned on success, otherwise an error code.
4769 */
4770 static int copyPayload(
4771   void *pPayload,           /* Pointer to page data */
4772   void *pBuf,               /* Pointer to buffer */
4773   int nByte,                /* Number of bytes to copy */
4774   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
4775   DbPage *pDbPage           /* Page containing pPayload */
4776 ){
4777   if( eOp ){
4778     /* Copy data from buffer to page (a write operation) */
4779     int rc = sqlite3PagerWrite(pDbPage);
4780     if( rc!=SQLITE_OK ){
4781       return rc;
4782     }
4783     memcpy(pPayload, pBuf, nByte);
4784   }else{
4785     /* Copy data from page to buffer (a read operation) */
4786     memcpy(pBuf, pPayload, nByte);
4787   }
4788   return SQLITE_OK;
4789 }
4790 
4791 /*
4792 ** This function is used to read or overwrite payload information
4793 ** for the entry that the pCur cursor is pointing to. The eOp
4794 ** argument is interpreted as follows:
4795 **
4796 **   0: The operation is a read. Populate the overflow cache.
4797 **   1: The operation is a write. Populate the overflow cache.
4798 **
4799 ** A total of "amt" bytes are read or written beginning at "offset".
4800 ** Data is read to or from the buffer pBuf.
4801 **
4802 ** The content being read or written might appear on the main page
4803 ** or be scattered out on multiple overflow pages.
4804 **
4805 ** If the current cursor entry uses one or more overflow pages
4806 ** this function may allocate space for and lazily populate
4807 ** the overflow page-list cache array (BtCursor.aOverflow).
4808 ** Subsequent calls use this cache to make seeking to the supplied offset
4809 ** more efficient.
4810 **
4811 ** Once an overflow page-list cache has been allocated, it must be
4812 ** invalidated if some other cursor writes to the same table, or if
4813 ** the cursor is moved to a different row. Additionally, in auto-vacuum
4814 ** mode, the following events may invalidate an overflow page-list cache.
4815 **
4816 **   * An incremental vacuum,
4817 **   * A commit in auto_vacuum="full" mode,
4818 **   * Creating a table (may require moving an overflow page).
4819 */
4820 static int accessPayload(
4821   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4822   u32 offset,          /* Begin reading this far into payload */
4823   u32 amt,             /* Read this many bytes */
4824   unsigned char *pBuf, /* Write the bytes into this buffer */
4825   int eOp              /* zero to read. non-zero to write. */
4826 ){
4827   unsigned char *aPayload;
4828   int rc = SQLITE_OK;
4829   int iIdx = 0;
4830   MemPage *pPage = pCur->pPage;               /* Btree page of current entry */
4831   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
4832 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4833   unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
4834 #endif
4835 
4836   assert( pPage );
4837   assert( eOp==0 || eOp==1 );
4838   assert( pCur->eState==CURSOR_VALID );
4839   assert( pCur->ix<pPage->nCell );
4840   assert( cursorHoldsMutex(pCur) );
4841 
4842   getCellInfo(pCur);
4843   aPayload = pCur->info.pPayload;
4844   assert( offset+amt <= pCur->info.nPayload );
4845 
4846   assert( aPayload > pPage->aData );
4847   if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
4848     /* Trying to read or write past the end of the data is an error.  The
4849     ** conditional above is really:
4850     **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
4851     ** but is recast into its current form to avoid integer overflow problems
4852     */
4853     return SQLITE_CORRUPT_PAGE(pPage);
4854   }
4855 
4856   /* Check if data must be read/written to/from the btree page itself. */
4857   if( offset<pCur->info.nLocal ){
4858     int a = amt;
4859     if( a+offset>pCur->info.nLocal ){
4860       a = pCur->info.nLocal - offset;
4861     }
4862     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
4863     offset = 0;
4864     pBuf += a;
4865     amt -= a;
4866   }else{
4867     offset -= pCur->info.nLocal;
4868   }
4869 
4870 
4871   if( rc==SQLITE_OK && amt>0 ){
4872     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
4873     Pgno nextPage;
4874 
4875     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
4876 
4877     /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
4878     **
4879     ** The aOverflow[] array is sized at one entry for each overflow page
4880     ** in the overflow chain. The page number of the first overflow page is
4881     ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
4882     ** means "not yet known" (the cache is lazily populated).
4883     */
4884     if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
4885       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
4886       if( pCur->aOverflow==0
4887        || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow)
4888       ){
4889         Pgno *aNew = (Pgno*)sqlite3Realloc(
4890             pCur->aOverflow, nOvfl*2*sizeof(Pgno)
4891         );
4892         if( aNew==0 ){
4893           return SQLITE_NOMEM_BKPT;
4894         }else{
4895           pCur->aOverflow = aNew;
4896         }
4897       }
4898       memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
4899       pCur->curFlags |= BTCF_ValidOvfl;
4900     }else{
4901       /* If the overflow page-list cache has been allocated and the
4902       ** entry for the first required overflow page is valid, skip
4903       ** directly to it.
4904       */
4905       if( pCur->aOverflow[offset/ovflSize] ){
4906         iIdx = (offset/ovflSize);
4907         nextPage = pCur->aOverflow[iIdx];
4908         offset = (offset%ovflSize);
4909       }
4910     }
4911 
4912     assert( rc==SQLITE_OK && amt>0 );
4913     while( nextPage ){
4914       /* If required, populate the overflow page-list cache. */
4915       if( nextPage > pBt->nPage ) return SQLITE_CORRUPT_BKPT;
4916       assert( pCur->aOverflow[iIdx]==0
4917               || pCur->aOverflow[iIdx]==nextPage
4918               || CORRUPT_DB );
4919       pCur->aOverflow[iIdx] = nextPage;
4920 
4921       if( offset>=ovflSize ){
4922         /* The only reason to read this page is to obtain the page
4923         ** number for the next page in the overflow chain. The page
4924         ** data is not required. So first try to lookup the overflow
4925         ** page-list cache, if any, then fall back to the getOverflowPage()
4926         ** function.
4927         */
4928         assert( pCur->curFlags & BTCF_ValidOvfl );
4929         assert( pCur->pBtree->db==pBt->db );
4930         if( pCur->aOverflow[iIdx+1] ){
4931           nextPage = pCur->aOverflow[iIdx+1];
4932         }else{
4933           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
4934         }
4935         offset -= ovflSize;
4936       }else{
4937         /* Need to read this page properly. It contains some of the
4938         ** range of data that is being read (eOp==0) or written (eOp!=0).
4939         */
4940         int a = amt;
4941         if( a + offset > ovflSize ){
4942           a = ovflSize - offset;
4943         }
4944 
4945 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4946         /* If all the following are true:
4947         **
4948         **   1) this is a read operation, and
4949         **   2) data is required from the start of this overflow page, and
4950         **   3) there are no dirty pages in the page-cache
4951         **   4) the database is file-backed, and
4952         **   5) the page is not in the WAL file
4953         **   6) at least 4 bytes have already been read into the output buffer
4954         **
4955         ** then data can be read directly from the database file into the
4956         ** output buffer, bypassing the page-cache altogether. This speeds
4957         ** up loading large records that span many overflow pages.
4958         */
4959         if( eOp==0                                             /* (1) */
4960          && offset==0                                          /* (2) */
4961          && sqlite3PagerDirectReadOk(pBt->pPager, nextPage)    /* (3,4,5) */
4962          && &pBuf[-4]>=pBufStart                               /* (6) */
4963         ){
4964           sqlite3_file *fd = sqlite3PagerFile(pBt->pPager);
4965           u8 aSave[4];
4966           u8 *aWrite = &pBuf[-4];
4967           assert( aWrite>=pBufStart );                         /* due to (6) */
4968           memcpy(aSave, aWrite, 4);
4969           rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
4970           if( rc && nextPage>pBt->nPage ) rc = SQLITE_CORRUPT_BKPT;
4971           nextPage = get4byte(aWrite);
4972           memcpy(aWrite, aSave, 4);
4973         }else
4974 #endif
4975 
4976         {
4977           DbPage *pDbPage;
4978           rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
4979               (eOp==0 ? PAGER_GET_READONLY : 0)
4980           );
4981           if( rc==SQLITE_OK ){
4982             aPayload = sqlite3PagerGetData(pDbPage);
4983             nextPage = get4byte(aPayload);
4984             rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
4985             sqlite3PagerUnref(pDbPage);
4986             offset = 0;
4987           }
4988         }
4989         amt -= a;
4990         if( amt==0 ) return rc;
4991         pBuf += a;
4992       }
4993       if( rc ) break;
4994       iIdx++;
4995     }
4996   }
4997 
4998   if( rc==SQLITE_OK && amt>0 ){
4999     /* Overflow chain ends prematurely */
5000     return SQLITE_CORRUPT_PAGE(pPage);
5001   }
5002   return rc;
5003 }
5004 
5005 /*
5006 ** Read part of the payload for the row at which that cursor pCur is currently
5007 ** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
5008 ** begins at "offset".
5009 **
5010 ** pCur can be pointing to either a table or an index b-tree.
5011 ** If pointing to a table btree, then the content section is read.  If
5012 ** pCur is pointing to an index b-tree then the key section is read.
5013 **
5014 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
5015 ** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
5016 ** cursor might be invalid or might need to be restored before being read.
5017 **
5018 ** Return SQLITE_OK on success or an error code if anything goes
5019 ** wrong.  An error is returned if "offset+amt" is larger than
5020 ** the available payload.
5021 */
5022 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
5023   assert( cursorHoldsMutex(pCur) );
5024   assert( pCur->eState==CURSOR_VALID );
5025   assert( pCur->iPage>=0 && pCur->pPage );
5026   assert( pCur->ix<pCur->pPage->nCell );
5027   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
5028 }
5029 
5030 /*
5031 ** This variant of sqlite3BtreePayload() works even if the cursor has not
5032 ** in the CURSOR_VALID state.  It is only used by the sqlite3_blob_read()
5033 ** interface.
5034 */
5035 #ifndef SQLITE_OMIT_INCRBLOB
5036 static SQLITE_NOINLINE int accessPayloadChecked(
5037   BtCursor *pCur,
5038   u32 offset,
5039   u32 amt,
5040   void *pBuf
5041 ){
5042   int rc;
5043   if ( pCur->eState==CURSOR_INVALID ){
5044     return SQLITE_ABORT;
5045   }
5046   assert( cursorOwnsBtShared(pCur) );
5047   rc = btreeRestoreCursorPosition(pCur);
5048   return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);
5049 }
5050 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
5051   if( pCur->eState==CURSOR_VALID ){
5052     assert( cursorOwnsBtShared(pCur) );
5053     return accessPayload(pCur, offset, amt, pBuf, 0);
5054   }else{
5055     return accessPayloadChecked(pCur, offset, amt, pBuf);
5056   }
5057 }
5058 #endif /* SQLITE_OMIT_INCRBLOB */
5059 
5060 /*
5061 ** Return a pointer to payload information from the entry that the
5062 ** pCur cursor is pointing to.  The pointer is to the beginning of
5063 ** the key if index btrees (pPage->intKey==0) and is the data for
5064 ** table btrees (pPage->intKey==1). The number of bytes of available
5065 ** key/data is written into *pAmt.  If *pAmt==0, then the value
5066 ** returned will not be a valid pointer.
5067 **
5068 ** This routine is an optimization.  It is common for the entire key
5069 ** and data to fit on the local page and for there to be no overflow
5070 ** pages.  When that is so, this routine can be used to access the
5071 ** key and data without making a copy.  If the key and/or data spills
5072 ** onto overflow pages, then accessPayload() must be used to reassemble
5073 ** the key/data and copy it into a preallocated buffer.
5074 **
5075 ** The pointer returned by this routine looks directly into the cached
5076 ** page of the database.  The data might change or move the next time
5077 ** any btree routine is called.
5078 */
5079 static const void *fetchPayload(
5080   BtCursor *pCur,      /* Cursor pointing to entry to read from */
5081   u32 *pAmt            /* Write the number of available bytes here */
5082 ){
5083   int amt;
5084   assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage);
5085   assert( pCur->eState==CURSOR_VALID );
5086   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5087   assert( cursorOwnsBtShared(pCur) );
5088   assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB );
5089   assert( pCur->info.nSize>0 );
5090   assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB );
5091   assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB);
5092   amt = pCur->info.nLocal;
5093   if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){
5094     /* There is too little space on the page for the expected amount
5095     ** of local content. Database must be corrupt. */
5096     assert( CORRUPT_DB );
5097     amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload));
5098   }
5099   *pAmt = (u32)amt;
5100   return (void*)pCur->info.pPayload;
5101 }
5102 
5103 
5104 /*
5105 ** For the entry that cursor pCur is point to, return as
5106 ** many bytes of the key or data as are available on the local
5107 ** b-tree page.  Write the number of available bytes into *pAmt.
5108 **
5109 ** The pointer returned is ephemeral.  The key/data may move
5110 ** or be destroyed on the next call to any Btree routine,
5111 ** including calls from other threads against the same cache.
5112 ** Hence, a mutex on the BtShared should be held prior to calling
5113 ** this routine.
5114 **
5115 ** These routines is used to get quick access to key and data
5116 ** in the common case where no overflow pages are used.
5117 */
5118 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
5119   return fetchPayload(pCur, pAmt);
5120 }
5121 
5122 
5123 /*
5124 ** Move the cursor down to a new child page.  The newPgno argument is the
5125 ** page number of the child page to move to.
5126 **
5127 ** This function returns SQLITE_CORRUPT if the page-header flags field of
5128 ** the new child page does not match the flags field of the parent (i.e.
5129 ** if an intkey page appears to be the parent of a non-intkey page, or
5130 ** vice-versa).
5131 */
5132 static int moveToChild(BtCursor *pCur, u32 newPgno){
5133   BtShared *pBt = pCur->pBt;
5134 
5135   assert( cursorOwnsBtShared(pCur) );
5136   assert( pCur->eState==CURSOR_VALID );
5137   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
5138   assert( pCur->iPage>=0 );
5139   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
5140     return SQLITE_CORRUPT_BKPT;
5141   }
5142   pCur->info.nSize = 0;
5143   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5144   pCur->aiIdx[pCur->iPage] = pCur->ix;
5145   pCur->apPage[pCur->iPage] = pCur->pPage;
5146   pCur->ix = 0;
5147   pCur->iPage++;
5148   return getAndInitPage(pBt, newPgno, &pCur->pPage, pCur, pCur->curPagerFlags);
5149 }
5150 
#ifdef SQLITE_DEBUG
/*
** Page pParent is an internal (non-leaf) tree page.  This function
** asserts that page number iChild is the left-child of the iIdx'th
** cell in page pParent.  Or, if iIdx is equal to the total number of
** cells in pParent, that page number iChild is the right-child of
** the page (the 4-byte value at offset hdrOffset+8).
**
** Nothing is checked when CORRUPT_DB is true.  In builds without
** SQLITE_DEBUG this is a no-op macro.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  if( CORRUPT_DB ){
    /* The conditions tested below might not be true
    ** in a corrupt database */
    return;
  }
  assert( iIdx<=pParent->nCell );
  if( iIdx<pParent->nCell ){
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z)
#endif
5172 
5173 /*
5174 ** Move the cursor up to the parent page.
5175 **
5176 ** pCur->idx is set to the cell index that contains the pointer
5177 ** to the page we are coming from.  If we are coming from the
5178 ** right-most child page then pCur->idx is set to one more than
5179 ** the largest cell index.
5180 */
5181 static void moveToParent(BtCursor *pCur){
5182   MemPage *pLeaf;
5183   assert( cursorOwnsBtShared(pCur) );
5184   assert( pCur->eState==CURSOR_VALID );
5185   assert( pCur->iPage>0 );
5186   assert( pCur->pPage );
5187   assertParentIndex(
5188     pCur->apPage[pCur->iPage-1],
5189     pCur->aiIdx[pCur->iPage-1],
5190     pCur->pPage->pgno
5191   );
5192   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
5193   pCur->info.nSize = 0;
5194   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5195   pCur->ix = pCur->aiIdx[pCur->iPage-1];
5196   pLeaf = pCur->pPage;
5197   pCur->pPage = pCur->apPage[--pCur->iPage];
5198   releasePageNotNull(pLeaf);
5199 }
5200 
5201 /*
5202 ** Move the cursor to point to the root page of its b-tree structure.
5203 **
5204 ** If the table has a virtual root page, then the cursor is moved to point
5205 ** to the virtual root page instead of the actual root page. A table has a
5206 ** virtual root page when the actual root page contains no cells and a
5207 ** single child page. This can only happen with the table rooted at page 1.
5208 **
5209 ** If the b-tree structure is empty, the cursor state is set to
5210 ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise,
5211 ** the cursor is set to point to the first cell located on the root
5212 ** (or virtual root) page and the cursor state is set to CURSOR_VALID.
5213 **
5214 ** If this function returns successfully, it may be assumed that the
5215 ** page-header flags indicate that the [virtual] root-page is the expected
5216 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
5217 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
5218 ** indicating a table b-tree, or if the caller did specify a KeyInfo
5219 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
5220 ** b-tree).
5221 */
5222 static int moveToRoot(BtCursor *pCur){
5223   MemPage *pRoot;
5224   int rc = SQLITE_OK;
5225 
5226   assert( cursorOwnsBtShared(pCur) );
5227   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
5228   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
5229   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
5230   assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 );
5231   assert( pCur->pgnoRoot>0 || pCur->iPage<0 );
5232 
5233   if( pCur->iPage>=0 ){
5234     if( pCur->iPage ){
5235       releasePageNotNull(pCur->pPage);
5236       while( --pCur->iPage ){
5237         releasePageNotNull(pCur->apPage[pCur->iPage]);
5238       }
5239       pCur->pPage = pCur->apPage[0];
5240       goto skip_init;
5241     }
5242   }else if( pCur->pgnoRoot==0 ){
5243     pCur->eState = CURSOR_INVALID;
5244     return SQLITE_EMPTY;
5245   }else{
5246     assert( pCur->iPage==(-1) );
5247     if( pCur->eState>=CURSOR_REQUIRESEEK ){
5248       if( pCur->eState==CURSOR_FAULT ){
5249         assert( pCur->skipNext!=SQLITE_OK );
5250         return pCur->skipNext;
5251       }
5252       sqlite3BtreeClearCursor(pCur);
5253     }
5254     rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->pPage,
5255                         0, pCur->curPagerFlags);
5256     if( rc!=SQLITE_OK ){
5257       pCur->eState = CURSOR_INVALID;
5258       return rc;
5259     }
5260     pCur->iPage = 0;
5261     pCur->curIntKey = pCur->pPage->intKey;
5262   }
5263   pRoot = pCur->pPage;
5264   assert( pRoot->pgno==pCur->pgnoRoot );
5265 
5266   /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
5267   ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
5268   ** NULL, the caller expects a table b-tree. If this is not the case,
5269   ** return an SQLITE_CORRUPT error.
5270   **
5271   ** Earlier versions of SQLite assumed that this test could not fail
5272   ** if the root page was already loaded when this function was called (i.e.
5273   ** if pCur->iPage>=0). But this is not so if the database is corrupted
5274   ** in such a way that page pRoot is linked into a second b-tree table
5275   ** (or the freelist).  */
5276   assert( pRoot->intKey==1 || pRoot->intKey==0 );
5277   if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
5278     return SQLITE_CORRUPT_PAGE(pCur->pPage);
5279   }
5280 
5281 skip_init:
5282   pCur->ix = 0;
5283   pCur->info.nSize = 0;
5284   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
5285 
5286   pRoot = pCur->pPage;
5287   if( pRoot->nCell>0 ){
5288     pCur->eState = CURSOR_VALID;
5289   }else if( !pRoot->leaf ){
5290     Pgno subpage;
5291     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
5292     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
5293     pCur->eState = CURSOR_VALID;
5294     rc = moveToChild(pCur, subpage);
5295   }else{
5296     pCur->eState = CURSOR_INVALID;
5297     rc = SQLITE_EMPTY;
5298   }
5299   return rc;
5300 }
5301 
5302 /*
5303 ** Move the cursor down to the left-most leaf entry beneath the
5304 ** entry to which it is currently pointing.
5305 **
5306 ** The left-most leaf is the one with the smallest key - the first
5307 ** in ascending order.
5308 */
5309 static int moveToLeftmost(BtCursor *pCur){
5310   Pgno pgno;
5311   int rc = SQLITE_OK;
5312   MemPage *pPage;
5313 
5314   assert( cursorOwnsBtShared(pCur) );
5315   assert( pCur->eState==CURSOR_VALID );
5316   while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
5317     assert( pCur->ix<pPage->nCell );
5318     pgno = get4byte(findCell(pPage, pCur->ix));
5319     rc = moveToChild(pCur, pgno);
5320   }
5321   return rc;
5322 }
5323 
5324 /*
5325 ** Move the cursor down to the right-most leaf entry beneath the
5326 ** page to which it is currently pointing.  Notice the difference
5327 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
5328 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
5329 ** finds the right-most entry beneath the *page*.
5330 **
5331 ** The right-most entry is the one with the largest key - the last
5332 ** key in ascending order.
5333 */
5334 static int moveToRightmost(BtCursor *pCur){
5335   Pgno pgno;
5336   int rc = SQLITE_OK;
5337   MemPage *pPage = 0;
5338 
5339   assert( cursorOwnsBtShared(pCur) );
5340   assert( pCur->eState==CURSOR_VALID );
5341   while( !(pPage = pCur->pPage)->leaf ){
5342     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5343     pCur->ix = pPage->nCell;
5344     rc = moveToChild(pCur, pgno);
5345     if( rc ) return rc;
5346   }
5347   pCur->ix = pPage->nCell-1;
5348   assert( pCur->info.nSize==0 );
5349   assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
5350   return SQLITE_OK;
5351 }
5352 
5353 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
5354 ** on success.  Set *pRes to 0 if the cursor actually points to something
5355 ** or set *pRes to 1 if the table is empty.
5356 */
5357 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
5358   int rc;
5359 
5360   assert( cursorOwnsBtShared(pCur) );
5361   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5362   rc = moveToRoot(pCur);
5363   if( rc==SQLITE_OK ){
5364     assert( pCur->pPage->nCell>0 );
5365     *pRes = 0;
5366     rc = moveToLeftmost(pCur);
5367   }else if( rc==SQLITE_EMPTY ){
5368     assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5369     *pRes = 1;
5370     rc = SQLITE_OK;
5371   }
5372   return rc;
5373 }
5374 
5375 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
5376 ** on success.  Set *pRes to 0 if the cursor actually points to something
5377 ** or set *pRes to 1 if the table is empty.
5378 */
5379 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
5380   int rc;
5381 
5382   assert( cursorOwnsBtShared(pCur) );
5383   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5384 
5385   /* If the cursor already points to the last entry, this is a no-op. */
5386   if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
5387 #ifdef SQLITE_DEBUG
5388     /* This block serves to assert() that the cursor really does point
5389     ** to the last entry in the b-tree. */
5390     int ii;
5391     for(ii=0; ii<pCur->iPage; ii++){
5392       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
5393     }
5394     assert( pCur->ix==pCur->pPage->nCell-1 || CORRUPT_DB );
5395     testcase( pCur->ix!=pCur->pPage->nCell-1 );
5396     /* ^-- dbsqlfuzz b92b72e4de80b5140c30ab71372ca719b8feb618 */
5397     assert( pCur->pPage->leaf );
5398 #endif
5399     *pRes = 0;
5400     return SQLITE_OK;
5401   }
5402 
5403   rc = moveToRoot(pCur);
5404   if( rc==SQLITE_OK ){
5405     assert( pCur->eState==CURSOR_VALID );
5406     *pRes = 0;
5407     rc = moveToRightmost(pCur);
5408     if( rc==SQLITE_OK ){
5409       pCur->curFlags |= BTCF_AtLast;
5410     }else{
5411       pCur->curFlags &= ~BTCF_AtLast;
5412     }
5413   }else if( rc==SQLITE_EMPTY ){
5414     assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5415     *pRes = 1;
5416     rc = SQLITE_OK;
5417   }
5418   return rc;
5419 }
5420 
5421 /* Move the cursor so that it points to an entry in a table (a.k.a INTKEY)
5422 ** table near the key intKey.   Return a success code.
5423 **
5424 ** If an exact match is not found, then the cursor is always
5425 ** left pointing at a leaf page which would hold the entry if it
5426 ** were present.  The cursor might point to an entry that comes
5427 ** before or after the key.
5428 **
5429 ** An integer is written into *pRes which is the result of
5430 ** comparing the key with the entry to which the cursor is
5431 ** pointing.  The meaning of the integer written into
5432 ** *pRes is as follows:
5433 **
5434 **     *pRes<0      The cursor is left pointing at an entry that
5435 **                  is smaller than intKey or if the table is empty
5436 **                  and the cursor is therefore left point to nothing.
5437 **
5438 **     *pRes==0     The cursor is left pointing at an entry that
5439 **                  exactly matches intKey.
5440 **
5441 **     *pRes>0      The cursor is left pointing at an entry that
5442 **                  is larger than intKey.
5443 */
5444 int sqlite3BtreeTableMoveto(
5445   BtCursor *pCur,          /* The cursor to be moved */
5446   i64 intKey,              /* The table key */
5447   int biasRight,           /* If true, bias the search to the high end */
5448   int *pRes                /* Write search results here */
5449 ){
5450   int rc;
5451 
5452   assert( cursorOwnsBtShared(pCur) );
5453   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5454   assert( pRes );
5455   assert( pCur->pKeyInfo==0 );
5456   assert( pCur->eState!=CURSOR_VALID || pCur->curIntKey!=0 );
5457 
5458   /* If the cursor is already positioned at the point we are trying
5459   ** to move to, then just return without doing any work */
5460   if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 ){
5461     if( pCur->info.nKey==intKey ){
5462       *pRes = 0;
5463       return SQLITE_OK;
5464     }
5465     if( pCur->info.nKey<intKey ){
5466       if( (pCur->curFlags & BTCF_AtLast)!=0 ){
5467         *pRes = -1;
5468         return SQLITE_OK;
5469       }
5470       /* If the requested key is one more than the previous key, then
5471       ** try to get there using sqlite3BtreeNext() rather than a full
5472       ** binary search.  This is an optimization only.  The correct answer
5473       ** is still obtained without this case, only a little more slowely */
5474       if( pCur->info.nKey+1==intKey ){
5475         *pRes = 0;
5476         rc = sqlite3BtreeNext(pCur, 0);
5477         if( rc==SQLITE_OK ){
5478           getCellInfo(pCur);
5479           if( pCur->info.nKey==intKey ){
5480             return SQLITE_OK;
5481           }
5482         }else if( rc!=SQLITE_DONE ){
5483           return rc;
5484         }
5485       }
5486     }
5487   }
5488 
5489 #ifdef SQLITE_DEBUG
5490   pCur->pBtree->nSeek++;   /* Performance measurement during testing */
5491 #endif
5492 
5493   rc = moveToRoot(pCur);
5494   if( rc ){
5495     if( rc==SQLITE_EMPTY ){
5496       assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5497       *pRes = -1;
5498       return SQLITE_OK;
5499     }
5500     return rc;
5501   }
5502   assert( pCur->pPage );
5503   assert( pCur->pPage->isInit );
5504   assert( pCur->eState==CURSOR_VALID );
5505   assert( pCur->pPage->nCell > 0 );
5506   assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey );
5507   assert( pCur->curIntKey );
5508 
5509   for(;;){
5510     int lwr, upr, idx, c;
5511     Pgno chldPg;
5512     MemPage *pPage = pCur->pPage;
5513     u8 *pCell;                          /* Pointer to current cell in pPage */
5514 
5515     /* pPage->nCell must be greater than zero. If this is the root-page
5516     ** the cursor would have been INVALID above and this for(;;) loop
5517     ** not run. If this is not the root-page, then the moveToChild() routine
5518     ** would have already detected db corruption. Similarly, pPage must
5519     ** be the right kind (index or table) of b-tree page. Otherwise
5520     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
5521     assert( pPage->nCell>0 );
5522     assert( pPage->intKey );
5523     lwr = 0;
5524     upr = pPage->nCell-1;
5525     assert( biasRight==0 || biasRight==1 );
5526     idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
5527     pCur->ix = (u16)idx;
5528     for(;;){
5529       i64 nCellKey;
5530       pCell = findCellPastPtr(pPage, idx);
5531       if( pPage->intKeyLeaf ){
5532         while( 0x80 <= *(pCell++) ){
5533           if( pCell>=pPage->aDataEnd ){
5534             return SQLITE_CORRUPT_PAGE(pPage);
5535           }
5536         }
5537       }
5538       getVarint(pCell, (u64*)&nCellKey);
5539       if( nCellKey<intKey ){
5540         lwr = idx+1;
5541         if( lwr>upr ){ c = -1; break; }
5542       }else if( nCellKey>intKey ){
5543         upr = idx-1;
5544         if( lwr>upr ){ c = +1; break; }
5545       }else{
5546         assert( nCellKey==intKey );
5547         pCur->ix = (u16)idx;
5548         if( !pPage->leaf ){
5549           lwr = idx;
5550           goto moveto_table_next_layer;
5551         }else{
5552           pCur->curFlags |= BTCF_ValidNKey;
5553           pCur->info.nKey = nCellKey;
5554           pCur->info.nSize = 0;
5555           *pRes = 0;
5556           return SQLITE_OK;
5557         }
5558       }
5559       assert( lwr+upr>=0 );
5560       idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
5561     }
5562     assert( lwr==upr+1 || !pPage->leaf );
5563     assert( pPage->isInit );
5564     if( pPage->leaf ){
5565       assert( pCur->ix<pCur->pPage->nCell );
5566       pCur->ix = (u16)idx;
5567       *pRes = c;
5568       rc = SQLITE_OK;
5569       goto moveto_table_finish;
5570     }
5571 moveto_table_next_layer:
5572     if( lwr>=pPage->nCell ){
5573       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5574     }else{
5575       chldPg = get4byte(findCell(pPage, lwr));
5576     }
5577     pCur->ix = (u16)lwr;
5578     rc = moveToChild(pCur, chldPg);
5579     if( rc ) break;
5580   }
5581 moveto_table_finish:
5582   pCur->info.nSize = 0;
5583   assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5584   return rc;
5585 }
5586 
5587 /* Move the cursor so that it points to an entry in an index table
5588 ** near the key pIdxKey.   Return a success code.
5589 **
5590 ** If an exact match is not found, then the cursor is always
5591 ** left pointing at a leaf page which would hold the entry if it
5592 ** were present.  The cursor might point to an entry that comes
5593 ** before or after the key.
5594 **
5595 ** An integer is written into *pRes which is the result of
5596 ** comparing the key with the entry to which the cursor is
5597 ** pointing.  The meaning of the integer written into
5598 ** *pRes is as follows:
5599 **
5600 **     *pRes<0      The cursor is left pointing at an entry that
5601 **                  is smaller than pIdxKey or if the table is empty
5602 **                  and the cursor is therefore left point to nothing.
5603 **
5604 **     *pRes==0     The cursor is left pointing at an entry that
5605 **                  exactly matches pIdxKey.
5606 **
5607 **     *pRes>0      The cursor is left pointing at an entry that
5608 **                  is larger than pIdxKey.
5609 **
5610 ** The pIdxKey->eqSeen field is set to 1 if there
5611 ** exists an entry in the table that exactly matches pIdxKey.
5612 */
5613 int sqlite3BtreeIndexMoveto(
5614   BtCursor *pCur,          /* The cursor to be moved */
5615   UnpackedRecord *pIdxKey, /* Unpacked index key */
5616   int *pRes                /* Write search results here */
5617 ){
5618   int rc;
5619   RecordCompare xRecordCompare;
5620 
5621   assert( cursorOwnsBtShared(pCur) );
5622   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5623   assert( pRes );
5624   assert( pCur->pKeyInfo!=0 );
5625 
5626 #ifdef SQLITE_DEBUG
5627   pCur->pBtree->nSeek++;   /* Performance measurement during testing */
5628 #endif
5629 
5630   xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
5631   pIdxKey->errCode = 0;
5632   assert( pIdxKey->default_rc==1
5633        || pIdxKey->default_rc==0
5634        || pIdxKey->default_rc==-1
5635   );
5636 
5637   rc = moveToRoot(pCur);
5638   if( rc ){
5639     if( rc==SQLITE_EMPTY ){
5640       assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5641       *pRes = -1;
5642       return SQLITE_OK;
5643     }
5644     return rc;
5645   }
5646   assert( pCur->pPage );
5647   assert( pCur->pPage->isInit );
5648   assert( pCur->eState==CURSOR_VALID );
5649   assert( pCur->pPage->nCell > 0 );
5650   assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey );
5651   assert( pCur->curIntKey || pIdxKey );
5652   for(;;){
5653     int lwr, upr, idx, c;
5654     Pgno chldPg;
5655     MemPage *pPage = pCur->pPage;
5656     u8 *pCell;                          /* Pointer to current cell in pPage */
5657 
5658     /* pPage->nCell must be greater than zero. If this is the root-page
5659     ** the cursor would have been INVALID above and this for(;;) loop
5660     ** not run. If this is not the root-page, then the moveToChild() routine
5661     ** would have already detected db corruption. Similarly, pPage must
5662     ** be the right kind (index or table) of b-tree page. Otherwise
5663     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
5664     assert( pPage->nCell>0 );
5665     assert( pPage->intKey==(pIdxKey==0) );
5666     lwr = 0;
5667     upr = pPage->nCell-1;
5668     idx = upr>>1; /* idx = (lwr+upr)/2; */
5669     pCur->ix = (u16)idx;
5670     for(;;){
5671       int nCell;  /* Size of the pCell cell in bytes */
5672       pCell = findCellPastPtr(pPage, idx);
5673 
5674       /* The maximum supported page-size is 65536 bytes. This means that
5675       ** the maximum number of record bytes stored on an index B-Tree
5676       ** page is less than 16384 bytes and may be stored as a 2-byte
5677       ** varint. This information is used to attempt to avoid parsing
5678       ** the entire cell by checking for the cases where the record is
5679       ** stored entirely within the b-tree page by inspecting the first
5680       ** 2 bytes of the cell.
5681       */
5682       nCell = pCell[0];
5683       if( nCell<=pPage->max1bytePayload ){
5684         /* This branch runs if the record-size field of the cell is a
5685         ** single byte varint and the record fits entirely on the main
5686         ** b-tree page.  */
5687         testcase( pCell+nCell+1==pPage->aDataEnd );
5688         c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
5689       }else if( !(pCell[1] & 0x80)
5690         && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
5691       ){
5692         /* The record-size field is a 2 byte varint and the record
5693         ** fits entirely on the main b-tree page.  */
5694         testcase( pCell+nCell+2==pPage->aDataEnd );
5695         c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
5696       }else{
5697         /* The record flows over onto one or more overflow pages. In
5698         ** this case the whole cell needs to be parsed, a buffer allocated
5699         ** and accessPayload() used to retrieve the record into the
5700         ** buffer before VdbeRecordCompare() can be called.
5701         **
5702         ** If the record is corrupt, the xRecordCompare routine may read
5703         ** up to two varints past the end of the buffer. An extra 18
5704         ** bytes of padding is allocated at the end of the buffer in
5705         ** case this happens.  */
5706         void *pCellKey;
5707         u8 * const pCellBody = pCell - pPage->childPtrSize;
5708         const int nOverrun = 18;  /* Size of the overrun padding */
5709         pPage->xParseCell(pPage, pCellBody, &pCur->info);
5710         nCell = (int)pCur->info.nKey;
5711         testcase( nCell<0 );   /* True if key size is 2^32 or more */
5712         testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
5713         testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
5714         testcase( nCell==2 );  /* Minimum legal index key size */
5715         if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){
5716           rc = SQLITE_CORRUPT_PAGE(pPage);
5717           goto moveto_index_finish;
5718         }
5719         pCellKey = sqlite3Malloc( nCell+nOverrun );
5720         if( pCellKey==0 ){
5721           rc = SQLITE_NOMEM_BKPT;
5722           goto moveto_index_finish;
5723         }
5724         pCur->ix = (u16)idx;
5725         rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
5726         memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */
5727         pCur->curFlags &= ~BTCF_ValidOvfl;
5728         if( rc ){
5729           sqlite3_free(pCellKey);
5730           goto moveto_index_finish;
5731         }
5732         c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
5733         sqlite3_free(pCellKey);
5734       }
5735       assert(
5736           (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
5737        && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
5738       );
5739       if( c<0 ){
5740         lwr = idx+1;
5741       }else if( c>0 ){
5742         upr = idx-1;
5743       }else{
5744         assert( c==0 );
5745         *pRes = 0;
5746         rc = SQLITE_OK;
5747         pCur->ix = (u16)idx;
5748         if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT;
5749         goto moveto_index_finish;
5750       }
5751       if( lwr>upr ) break;
5752       assert( lwr+upr>=0 );
5753       idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
5754     }
5755     assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
5756     assert( pPage->isInit );
5757     if( pPage->leaf ){
5758       assert( pCur->ix<pCur->pPage->nCell );
5759       pCur->ix = (u16)idx;
5760       *pRes = c;
5761       rc = SQLITE_OK;
5762       goto moveto_index_finish;
5763     }
5764     if( lwr>=pPage->nCell ){
5765       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5766     }else{
5767       chldPg = get4byte(findCell(pPage, lwr));
5768     }
5769     pCur->ix = (u16)lwr;
5770     rc = moveToChild(pCur, chldPg);
5771     if( rc ) break;
5772   }
5773 moveto_index_finish:
5774   pCur->info.nSize = 0;
5775   assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5776   return rc;
5777 }
5778 
5779 
5780 /*
5781 ** Return TRUE if the cursor is not pointing at an entry of the table.
5782 **
5783 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
5784 ** past the last entry in the table or sqlite3BtreePrev() moves past
5785 ** the first entry.  TRUE is also returned if the table is empty.
5786 */
5787 int sqlite3BtreeEof(BtCursor *pCur){
5788   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
5789   ** have been deleted? This API will need to change to return an error code
5790   ** as well as the boolean result value.
5791   */
5792   return (CURSOR_VALID!=pCur->eState);
5793 }
5794 
5795 /*
5796 ** Return an estimate for the number of rows in the table that pCur is
5797 ** pointing to.  Return a negative number if no estimate is currently
5798 ** available.
5799 */
5800 i64 sqlite3BtreeRowCountEst(BtCursor *pCur){
5801   i64 n;
5802   u8 i;
5803 
5804   assert( cursorOwnsBtShared(pCur) );
5805   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5806 
5807   /* Currently this interface is only called by the OP_IfSmaller
5808   ** opcode, and it that case the cursor will always be valid and
5809   ** will always point to a leaf node. */
5810   if( NEVER(pCur->eState!=CURSOR_VALID) ) return -1;
5811   if( NEVER(pCur->pPage->leaf==0) ) return -1;
5812 
5813   n = pCur->pPage->nCell;
5814   for(i=0; i<pCur->iPage; i++){
5815     n *= pCur->apPage[i]->nCell;
5816   }
5817   return n;
5818 }
5819 
5820 /*
5821 ** Advance the cursor to the next entry in the database.
5822 ** Return value:
5823 **
5824 **    SQLITE_OK        success
5825 **    SQLITE_DONE      cursor is already pointing at the last element
5826 **    otherwise        some kind of error occurred
5827 **
5828 ** The main entry point is sqlite3BtreeNext().  That routine is optimized
5829 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
5830 ** to the next cell on the current page.  The (slower) btreeNext() helper
5831 ** routine is called when it is necessary to move to a different page or
5832 ** to restore the cursor.
5833 **
5834 ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the
5835 ** cursor corresponds to an SQL index and this routine could have been
5836 ** skipped if the SQL index had been a unique index.  The F argument
5837 ** is a hint to the implement.  SQLite btree implementation does not use
5838 ** this hint, but COMDB2 does.
5839 */
5840 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){
5841   int rc;
5842   int idx;
5843   MemPage *pPage;
5844 
5845   assert( cursorOwnsBtShared(pCur) );
5846   if( pCur->eState!=CURSOR_VALID ){
5847     assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5848     rc = restoreCursorPosition(pCur);
5849     if( rc!=SQLITE_OK ){
5850       return rc;
5851     }
5852     if( CURSOR_INVALID==pCur->eState ){
5853       return SQLITE_DONE;
5854     }
5855     if( pCur->eState==CURSOR_SKIPNEXT ){
5856       pCur->eState = CURSOR_VALID;
5857       if( pCur->skipNext>0 ) return SQLITE_OK;
5858     }
5859   }
5860 
5861   pPage = pCur->pPage;
5862   idx = ++pCur->ix;
5863   if( !pPage->isInit || sqlite3FaultSim(412) ){
5864     /* The only known way for this to happen is for there to be a
5865     ** recursive SQL function that does a DELETE operation as part of a
5866     ** SELECT which deletes content out from under an active cursor
5867     ** in a corrupt database file where the table being DELETE-ed from
5868     ** has pages in common with the table being queried.  See TH3
5869     ** module cov1/btree78.test testcase 220 (2018-06-08) for an
5870     ** example. */
5871     return SQLITE_CORRUPT_BKPT;
5872   }
5873 
5874   if( idx>=pPage->nCell ){
5875     if( !pPage->leaf ){
5876       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
5877       if( rc ) return rc;
5878       return moveToLeftmost(pCur);
5879     }
5880     do{
5881       if( pCur->iPage==0 ){
5882         pCur->eState = CURSOR_INVALID;
5883         return SQLITE_DONE;
5884       }
5885       moveToParent(pCur);
5886       pPage = pCur->pPage;
5887     }while( pCur->ix>=pPage->nCell );
5888     if( pPage->intKey ){
5889       return sqlite3BtreeNext(pCur, 0);
5890     }else{
5891       return SQLITE_OK;
5892     }
5893   }
5894   if( pPage->leaf ){
5895     return SQLITE_OK;
5896   }else{
5897     return moveToLeftmost(pCur);
5898   }
5899 }
5900 int sqlite3BtreeNext(BtCursor *pCur, int flags){
5901   MemPage *pPage;
5902   UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
5903   assert( cursorOwnsBtShared(pCur) );
5904   assert( flags==0 || flags==1 );
5905   pCur->info.nSize = 0;
5906   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5907   if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur);
5908   pPage = pCur->pPage;
5909   if( (++pCur->ix)>=pPage->nCell ){
5910     pCur->ix--;
5911     return btreeNext(pCur);
5912   }
5913   if( pPage->leaf ){
5914     return SQLITE_OK;
5915   }else{
5916     return moveToLeftmost(pCur);
5917   }
5918 }
5919 
5920 /*
5921 ** Step the cursor to the back to the previous entry in the database.
5922 ** Return values:
5923 **
5924 **     SQLITE_OK     success
5925 **     SQLITE_DONE   the cursor is already on the first element of the table
5926 **     otherwise     some kind of error occurred
5927 **
5928 ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
5929 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
5930 ** to the previous cell on the current page.  The (slower) btreePrevious()
5931 ** helper routine is called when it is necessary to move to a different page
5932 ** or to restore the cursor.
5933 **
5934 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then
5935 ** the cursor corresponds to an SQL index and this routine could have been
5936 ** skipped if the SQL index had been a unique index.  The F argument is a
5937 ** hint to the implement.  The native SQLite btree implementation does not
5938 ** use this hint, but COMDB2 does.
5939 */
5940 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){
5941   int rc;
5942   MemPage *pPage;
5943 
5944   assert( cursorOwnsBtShared(pCur) );
5945   assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
5946   assert( pCur->info.nSize==0 );
5947   if( pCur->eState!=CURSOR_VALID ){
5948     rc = restoreCursorPosition(pCur);
5949     if( rc!=SQLITE_OK ){
5950       return rc;
5951     }
5952     if( CURSOR_INVALID==pCur->eState ){
5953       return SQLITE_DONE;
5954     }
5955     if( CURSOR_SKIPNEXT==pCur->eState ){
5956       pCur->eState = CURSOR_VALID;
5957       if( pCur->skipNext<0 ) return SQLITE_OK;
5958     }
5959   }
5960 
5961   pPage = pCur->pPage;
5962   assert( pPage->isInit );
5963   if( !pPage->leaf ){
5964     int idx = pCur->ix;
5965     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
5966     if( rc ) return rc;
5967     rc = moveToRightmost(pCur);
5968   }else{
5969     while( pCur->ix==0 ){
5970       if( pCur->iPage==0 ){
5971         pCur->eState = CURSOR_INVALID;
5972         return SQLITE_DONE;
5973       }
5974       moveToParent(pCur);
5975     }
5976     assert( pCur->info.nSize==0 );
5977     assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
5978 
5979     pCur->ix--;
5980     pPage = pCur->pPage;
5981     if( pPage->intKey && !pPage->leaf ){
5982       rc = sqlite3BtreePrevious(pCur, 0);
5983     }else{
5984       rc = SQLITE_OK;
5985     }
5986   }
5987   return rc;
5988 }
5989 int sqlite3BtreePrevious(BtCursor *pCur, int flags){
5990   assert( cursorOwnsBtShared(pCur) );
5991   assert( flags==0 || flags==1 );
5992   UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
5993   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
5994   pCur->info.nSize = 0;
5995   if( pCur->eState!=CURSOR_VALID
5996    || pCur->ix==0
5997    || pCur->pPage->leaf==0
5998   ){
5999     return btreePrevious(pCur);
6000   }
6001   pCur->ix--;
6002   return SQLITE_OK;
6003 }
6004 
6005 /*
6006 ** Allocate a new page from the database file.
6007 **
6008 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
6009 ** has already been called on the new page.)  The new page has also
6010 ** been referenced and the calling routine is responsible for calling
6011 ** sqlite3PagerUnref() on the new page when it is done.
6012 **
6013 ** SQLITE_OK is returned on success.  Any other return value indicates
6014 ** an error.  *ppPage is set to NULL in the event of an error.
6015 **
6016 ** If the "nearby" parameter is not 0, then an effort is made to
6017 ** locate a page close to the page number "nearby".  This can be used in an
6018 ** attempt to keep related pages close to each other in the database file,
6019 ** which in turn can make database access faster.
6020 **
6021 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
6022 ** anywhere on the free-list, then it is guaranteed to be returned.  If
6023 ** eMode is BTALLOC_LT then the page returned will be less than or equal
6024 ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
6025 ** are no restrictions on which page is returned.
6026 */
6027 static int allocateBtreePage(
6028   BtShared *pBt,         /* The btree */
6029   MemPage **ppPage,      /* Store pointer to the allocated page here */
6030   Pgno *pPgno,           /* Store the page number here */
6031   Pgno nearby,           /* Search for a page near this one */
6032   u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
6033 ){
6034   MemPage *pPage1;
6035   int rc;
6036   u32 n;     /* Number of pages on the freelist */
6037   u32 k;     /* Number of leaves on the trunk of the freelist */
6038   MemPage *pTrunk = 0;
6039   MemPage *pPrevTrunk = 0;
6040   Pgno mxPage;     /* Total size of the database file */
6041 
6042   assert( sqlite3_mutex_held(pBt->mutex) );
6043   assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
6044   pPage1 = pBt->pPage1;
6045   mxPage = btreePagecount(pBt);
6046   /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
6047   ** stores stores the total number of pages on the freelist. */
6048   n = get4byte(&pPage1->aData[36]);
6049   testcase( n==mxPage-1 );
6050   if( n>=mxPage ){
6051     return SQLITE_CORRUPT_BKPT;
6052   }
6053   if( n>0 ){
6054     /* There are pages on the freelist.  Reuse one of those pages. */
6055     Pgno iTrunk;
6056     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
6057     u32 nSearch = 0;   /* Count of the number of search attempts */
6058 
6059     /* If eMode==BTALLOC_EXACT and a query of the pointer-map
6060     ** shows that the page 'nearby' is somewhere on the free-list, then
6061     ** the entire-list will be searched for that page.
6062     */
6063 #ifndef SQLITE_OMIT_AUTOVACUUM
6064     if( eMode==BTALLOC_EXACT ){
6065       if( nearby<=mxPage ){
6066         u8 eType;
6067         assert( nearby>0 );
6068         assert( pBt->autoVacuum );
6069         rc = ptrmapGet(pBt, nearby, &eType, 0);
6070         if( rc ) return rc;
6071         if( eType==PTRMAP_FREEPAGE ){
6072           searchList = 1;
6073         }
6074       }
6075     }else if( eMode==BTALLOC_LE ){
6076       searchList = 1;
6077     }
6078 #endif
6079 
6080     /* Decrement the free-list count by 1. Set iTrunk to the index of the
6081     ** first free-list trunk page. iPrevTrunk is initially 1.
6082     */
6083     rc = sqlite3PagerWrite(pPage1->pDbPage);
6084     if( rc ) return rc;
6085     put4byte(&pPage1->aData[36], n-1);
6086 
6087     /* The code within this loop is run only once if the 'searchList' variable
6088     ** is not true. Otherwise, it runs once for each trunk-page on the
6089     ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
6090     ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
6091     */
6092     do {
6093       pPrevTrunk = pTrunk;
6094       if( pPrevTrunk ){
6095         /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
6096         ** is the page number of the next freelist trunk page in the list or
6097         ** zero if this is the last freelist trunk page. */
6098         iTrunk = get4byte(&pPrevTrunk->aData[0]);
6099       }else{
6100         /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
6101         ** stores the page number of the first page of the freelist, or zero if
6102         ** the freelist is empty. */
6103         iTrunk = get4byte(&pPage1->aData[32]);
6104       }
6105       testcase( iTrunk==mxPage );
6106       if( iTrunk>mxPage || nSearch++ > n ){
6107         rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? pPrevTrunk->pgno : 1);
6108       }else{
6109         rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
6110       }
6111       if( rc ){
6112         pTrunk = 0;
6113         goto end_allocate_page;
6114       }
6115       assert( pTrunk!=0 );
6116       assert( pTrunk->aData!=0 );
6117       /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
6118       ** is the number of leaf page pointers to follow. */
6119       k = get4byte(&pTrunk->aData[4]);
6120       if( k==0 && !searchList ){
6121         /* The trunk has no leaves and the list is not being searched.
6122         ** So extract the trunk page itself and use it as the newly
6123         ** allocated page */
6124         assert( pPrevTrunk==0 );
6125         rc = sqlite3PagerWrite(pTrunk->pDbPage);
6126         if( rc ){
6127           goto end_allocate_page;
6128         }
6129         *pPgno = iTrunk;
6130         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
6131         *ppPage = pTrunk;
6132         pTrunk = 0;
6133         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
6134       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
6135         /* Value of k is out of range.  Database corruption */
6136         rc = SQLITE_CORRUPT_PGNO(iTrunk);
6137         goto end_allocate_page;
6138 #ifndef SQLITE_OMIT_AUTOVACUUM
6139       }else if( searchList
6140             && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
6141       ){
6142         /* The list is being searched and this trunk page is the page
6143         ** to allocate, regardless of whether it has leaves.
6144         */
6145         *pPgno = iTrunk;
6146         *ppPage = pTrunk;
6147         searchList = 0;
6148         rc = sqlite3PagerWrite(pTrunk->pDbPage);
6149         if( rc ){
6150           goto end_allocate_page;
6151         }
6152         if( k==0 ){
6153           if( !pPrevTrunk ){
6154             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
6155           }else{
6156             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
6157             if( rc!=SQLITE_OK ){
6158               goto end_allocate_page;
6159             }
6160             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
6161           }
6162         }else{
6163           /* The trunk page is required by the caller but it contains
6164           ** pointers to free-list leaves. The first leaf becomes a trunk
6165           ** page in this case.
6166           */
6167           MemPage *pNewTrunk;
6168           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
6169           if( iNewTrunk>mxPage ){
6170             rc = SQLITE_CORRUPT_PGNO(iTrunk);
6171             goto end_allocate_page;
6172           }
6173           testcase( iNewTrunk==mxPage );
6174           rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
6175           if( rc!=SQLITE_OK ){
6176             goto end_allocate_page;
6177           }
6178           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
6179           if( rc!=SQLITE_OK ){
6180             releasePage(pNewTrunk);
6181             goto end_allocate_page;
6182           }
6183           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
6184           put4byte(&pNewTrunk->aData[4], k-1);
6185           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
6186           releasePage(pNewTrunk);
6187           if( !pPrevTrunk ){
6188             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
6189             put4byte(&pPage1->aData[32], iNewTrunk);
6190           }else{
6191             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
6192             if( rc ){
6193               goto end_allocate_page;
6194             }
6195             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
6196           }
6197         }
6198         pTrunk = 0;
6199         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
6200 #endif
6201       }else if( k>0 ){
6202         /* Extract a leaf from the trunk */
6203         u32 closest;
6204         Pgno iPage;
6205         unsigned char *aData = pTrunk->aData;
6206         if( nearby>0 ){
6207           u32 i;
6208           closest = 0;
6209           if( eMode==BTALLOC_LE ){
6210             for(i=0; i<k; i++){
6211               iPage = get4byte(&aData[8+i*4]);
6212               if( iPage<=nearby ){
6213                 closest = i;
6214                 break;
6215               }
6216             }
6217           }else{
6218             int dist;
6219             dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
6220             for(i=1; i<k; i++){
6221               int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
6222               if( d2<dist ){
6223                 closest = i;
6224                 dist = d2;
6225               }
6226             }
6227           }
6228         }else{
6229           closest = 0;
6230         }
6231 
6232         iPage = get4byte(&aData[8+closest*4]);
6233         testcase( iPage==mxPage );
6234         if( iPage>mxPage || iPage<2 ){
6235           rc = SQLITE_CORRUPT_PGNO(iTrunk);
6236           goto end_allocate_page;
6237         }
6238         testcase( iPage==mxPage );
6239         if( !searchList
6240          || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
6241         ){
6242           int noContent;
6243           *pPgno = iPage;
6244           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
6245                  ": %d more free pages\n",
6246                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
6247           rc = sqlite3PagerWrite(pTrunk->pDbPage);
6248           if( rc ) goto end_allocate_page;
6249           if( closest<k-1 ){
6250             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
6251           }
6252           put4byte(&aData[4], k-1);
6253           noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
6254           rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
6255           if( rc==SQLITE_OK ){
6256             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
6257             if( rc!=SQLITE_OK ){
6258               releasePage(*ppPage);
6259               *ppPage = 0;
6260             }
6261           }
6262           searchList = 0;
6263         }
6264       }
6265       releasePage(pPrevTrunk);
6266       pPrevTrunk = 0;
6267     }while( searchList );
6268   }else{
6269     /* There are no pages on the freelist, so append a new page to the
6270     ** database image.
6271     **
6272     ** Normally, new pages allocated by this block can be requested from the
6273     ** pager layer with the 'no-content' flag set. This prevents the pager
6274     ** from trying to read the pages content from disk. However, if the
6275     ** current transaction has already run one or more incremental-vacuum
6276     ** steps, then the page we are about to allocate may contain content
6277     ** that is required in the event of a rollback. In this case, do
6278     ** not set the no-content flag. This causes the pager to load and journal
6279     ** the current page content before overwriting it.
6280     **
6281     ** Note that the pager will not actually attempt to load or journal
6282     ** content for any page that really does lie past the end of the database
6283     ** file on disk. So the effects of disabling the no-content optimization
6284     ** here are confined to those pages that lie between the end of the
6285     ** database image and the end of the database file.
6286     */
6287     int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
6288 
6289     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6290     if( rc ) return rc;
6291     pBt->nPage++;
6292     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
6293 
6294 #ifndef SQLITE_OMIT_AUTOVACUUM
6295     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
6296       /* If *pPgno refers to a pointer-map page, allocate two new pages
6297       ** at the end of the file instead of one. The first allocated page
6298       ** becomes a new pointer-map page, the second is used by the caller.
6299       */
6300       MemPage *pPg = 0;
6301       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
6302       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
6303       rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
6304       if( rc==SQLITE_OK ){
6305         rc = sqlite3PagerWrite(pPg->pDbPage);
6306         releasePage(pPg);
6307       }
6308       if( rc ) return rc;
6309       pBt->nPage++;
6310       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
6311     }
6312 #endif
6313     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
6314     *pPgno = pBt->nPage;
6315 
6316     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
6317     rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
6318     if( rc ) return rc;
6319     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
6320     if( rc!=SQLITE_OK ){
6321       releasePage(*ppPage);
6322       *ppPage = 0;
6323     }
6324     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
6325   }
6326 
6327   assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) );
6328 
6329 end_allocate_page:
6330   releasePage(pTrunk);
6331   releasePage(pPrevTrunk);
6332   assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
6333   assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
6334   return rc;
6335 }
6336 
6337 /*
6338 ** This function is used to add page iPage to the database file free-list.
6339 ** It is assumed that the page is not already a part of the free-list.
6340 **
6341 ** The value passed as the second argument to this function is optional.
6342 ** If the caller happens to have a pointer to the MemPage object
6343 ** corresponding to page iPage handy, it may pass it as the second value.
6344 ** Otherwise, it may pass NULL.
6345 **
6346 ** If a pointer to a MemPage object is passed as the second argument,
6347 ** its reference count is not altered by this function.
6348 */
6349 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
6350   MemPage *pTrunk = 0;                /* Free-list trunk page */
6351   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
6352   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
6353   MemPage *pPage;                     /* Page being freed. May be NULL. */
6354   int rc;                             /* Return Code */
6355   u32 nFree;                          /* Initial number of pages on free-list */
6356 
6357   assert( sqlite3_mutex_held(pBt->mutex) );
6358   assert( CORRUPT_DB || iPage>1 );
6359   assert( !pMemPage || pMemPage->pgno==iPage );
6360 
6361   if( iPage<2 || iPage>pBt->nPage ){
6362     return SQLITE_CORRUPT_BKPT;
6363   }
6364   if( pMemPage ){
6365     pPage = pMemPage;
6366     sqlite3PagerRef(pPage->pDbPage);
6367   }else{
6368     pPage = btreePageLookup(pBt, iPage);
6369   }
6370 
6371   /* Increment the free page count on pPage1 */
6372   rc = sqlite3PagerWrite(pPage1->pDbPage);
6373   if( rc ) goto freepage_out;
6374   nFree = get4byte(&pPage1->aData[36]);
6375   put4byte(&pPage1->aData[36], nFree+1);
6376 
6377   if( pBt->btsFlags & BTS_SECURE_DELETE ){
6378     /* If the secure_delete option is enabled, then
6379     ** always fully overwrite deleted information with zeros.
6380     */
6381     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
6382      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
6383     ){
6384       goto freepage_out;
6385     }
6386     memset(pPage->aData, 0, pPage->pBt->pageSize);
6387   }
6388 
6389   /* If the database supports auto-vacuum, write an entry in the pointer-map
6390   ** to indicate that the page is free.
6391   */
6392   if( ISAUTOVACUUM ){
6393     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
6394     if( rc ) goto freepage_out;
6395   }
6396 
6397   /* Now manipulate the actual database free-list structure. There are two
6398   ** possibilities. If the free-list is currently empty, or if the first
6399   ** trunk page in the free-list is full, then this page will become a
6400   ** new free-list trunk page. Otherwise, it will become a leaf of the
6401   ** first trunk page in the current free-list. This block tests if it
6402   ** is possible to add the page as a new free-list leaf.
6403   */
6404   if( nFree!=0 ){
6405     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
6406 
6407     iTrunk = get4byte(&pPage1->aData[32]);
6408     if( iTrunk>btreePagecount(pBt) ){
6409       rc = SQLITE_CORRUPT_BKPT;
6410       goto freepage_out;
6411     }
6412     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
6413     if( rc!=SQLITE_OK ){
6414       goto freepage_out;
6415     }
6416 
6417     nLeaf = get4byte(&pTrunk->aData[4]);
6418     assert( pBt->usableSize>32 );
6419     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
6420       rc = SQLITE_CORRUPT_BKPT;
6421       goto freepage_out;
6422     }
6423     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
6424       /* In this case there is room on the trunk page to insert the page
6425       ** being freed as a new leaf.
6426       **
6427       ** Note that the trunk page is not really full until it contains
6428       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
6429       ** coded.  But due to a coding error in versions of SQLite prior to
6430       ** 3.6.0, databases with freelist trunk pages holding more than
6431       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
6432       ** to maintain backwards compatibility with older versions of SQLite,
6433       ** we will continue to restrict the number of entries to usableSize/4 - 8
6434       ** for now.  At some point in the future (once everyone has upgraded
6435       ** to 3.6.0 or later) we should consider fixing the conditional above
6436       ** to read "usableSize/4-2" instead of "usableSize/4-8".
6437       **
6438       ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
6439       ** avoid using the last six entries in the freelist trunk page array in
6440       ** order that database files created by newer versions of SQLite can be
6441       ** read by older versions of SQLite.
6442       */
6443       rc = sqlite3PagerWrite(pTrunk->pDbPage);
6444       if( rc==SQLITE_OK ){
6445         put4byte(&pTrunk->aData[4], nLeaf+1);
6446         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
6447         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
6448           sqlite3PagerDontWrite(pPage->pDbPage);
6449         }
6450         rc = btreeSetHasContent(pBt, iPage);
6451       }
6452       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
6453       goto freepage_out;
6454     }
6455   }
6456 
6457   /* If control flows to this point, then it was not possible to add the
6458   ** the page being freed as a leaf page of the first trunk in the free-list.
6459   ** Possibly because the free-list is empty, or possibly because the
6460   ** first trunk in the free-list is full. Either way, the page being freed
6461   ** will become the new first trunk page in the free-list.
6462   */
6463   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
6464     goto freepage_out;
6465   }
6466   rc = sqlite3PagerWrite(pPage->pDbPage);
6467   if( rc!=SQLITE_OK ){
6468     goto freepage_out;
6469   }
6470   put4byte(pPage->aData, iTrunk);
6471   put4byte(&pPage->aData[4], 0);
6472   put4byte(&pPage1->aData[32], iPage);
6473   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
6474 
6475 freepage_out:
6476   if( pPage ){
6477     pPage->isInit = 0;
6478   }
6479   releasePage(pPage);
6480   releasePage(pTrunk);
6481   return rc;
6482 }
6483 static void freePage(MemPage *pPage, int *pRC){
6484   if( (*pRC)==SQLITE_OK ){
6485     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
6486   }
6487 }
6488 
6489 /*
6490 ** Free the overflow pages associated with the given Cell.
6491 */
6492 static SQLITE_NOINLINE int clearCellOverflow(
6493   MemPage *pPage,          /* The page that contains the Cell */
6494   unsigned char *pCell,    /* First byte of the Cell */
6495   CellInfo *pInfo          /* Size information about the cell */
6496 ){
6497   BtShared *pBt;
6498   Pgno ovflPgno;
6499   int rc;
6500   int nOvfl;
6501   u32 ovflPageSize;
6502 
6503   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6504   assert( pInfo->nLocal!=pInfo->nPayload );
6505   testcase( pCell + pInfo->nSize == pPage->aDataEnd );
6506   testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd );
6507   if( pCell + pInfo->nSize > pPage->aDataEnd ){
6508     /* Cell extends past end of page */
6509     return SQLITE_CORRUPT_PAGE(pPage);
6510   }
6511   ovflPgno = get4byte(pCell + pInfo->nSize - 4);
6512   pBt = pPage->pBt;
6513   assert( pBt->usableSize > 4 );
6514   ovflPageSize = pBt->usableSize - 4;
6515   nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
6516   assert( nOvfl>0 ||
6517     (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
6518   );
6519   while( nOvfl-- ){
6520     Pgno iNext = 0;
6521     MemPage *pOvfl = 0;
6522     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
6523       /* 0 is not a legal page number and page 1 cannot be an
6524       ** overflow page. Therefore if ovflPgno<2 or past the end of the
6525       ** file the database must be corrupt. */
6526       return SQLITE_CORRUPT_BKPT;
6527     }
6528     if( nOvfl ){
6529       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
6530       if( rc ) return rc;
6531     }
6532 
6533     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
6534      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
6535     ){
6536       /* There is no reason any cursor should have an outstanding reference
6537       ** to an overflow page belonging to a cell that is being deleted/updated.
6538       ** So if there exists more than one reference to this page, then it
6539       ** must not really be an overflow page and the database must be corrupt.
6540       ** It is helpful to detect this before calling freePage2(), as
6541       ** freePage2() may zero the page contents if secure-delete mode is
6542       ** enabled. If this 'overflow' page happens to be a page that the
6543       ** caller is iterating through or using in some other way, this
6544       ** can be problematic.
6545       */
6546       rc = SQLITE_CORRUPT_BKPT;
6547     }else{
6548       rc = freePage2(pBt, pOvfl, ovflPgno);
6549     }
6550 
6551     if( pOvfl ){
6552       sqlite3PagerUnref(pOvfl->pDbPage);
6553     }
6554     if( rc ) return rc;
6555     ovflPgno = iNext;
6556   }
6557   return SQLITE_OK;
6558 }
6559 
/* Call xParseCell to compute the size of a cell.  If the cell contains
** overflow, then invoke clearCellOverflow() to clear out that overflow.
** Store the result code (SQLITE_OK or some error code) in rc.
**
** Implemented as macro to force inlining for performance.
**
** The expansion is wrapped in do{...}while(0) so that an invocation
** followed by a semicolon is exactly one statement, which keeps it safe
** as the unbraced body of an if/else.  Every argument is parenthesized.
** pPage, pCell and sInfo are expanded more than once, so the expressions
** passed for them must be free of side effects.  sInfo must be a CellInfo
** lvalue and rc must be an assignable lvalue.
*/
#define BTREE_CLEAR_CELL(rc, pPage, pCell, sInfo)             \
  do{                                                         \
    (pPage)->xParseCell((pPage), (pCell), &(sInfo));          \
    if( (sInfo).nLocal!=(sInfo).nPayload ){                   \
      (rc) = clearCellOverflow((pPage), (pCell), &(sInfo));   \
    }else{                                                    \
      (rc) = SQLITE_OK;                                       \
    }                                                         \
  }while(0)
6573 
6574 
6575 /*
6576 ** Create the byte sequence used to represent a cell on page pPage
6577 ** and write that byte sequence into pCell[].  Overflow pages are
6578 ** allocated and filled in as necessary.  The calling procedure
6579 ** is responsible for making sure sufficient space has been allocated
6580 ** for pCell[].
6581 **
6582 ** Note that pCell does not necessary need to point to the pPage->aData
6583 ** area.  pCell might point to some temporary storage.  The cell will
6584 ** be constructed in this temporary area then copied into pPage->aData
6585 ** later.
6586 */
6587 static int fillInCell(
6588   MemPage *pPage,                /* The page that contains the cell */
6589   unsigned char *pCell,          /* Complete text of the cell */
6590   const BtreePayload *pX,        /* Payload with which to construct the cell */
6591   int *pnSize                    /* Write cell size here */
6592 ){
6593   int nPayload;
6594   const u8 *pSrc;
6595   int nSrc, n, rc, mn;
6596   int spaceLeft;
6597   MemPage *pToRelease;
6598   unsigned char *pPrior;
6599   unsigned char *pPayload;
6600   BtShared *pBt;
6601   Pgno pgnoOvfl;
6602   int nHeader;
6603 
6604   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6605 
6606   /* pPage is not necessarily writeable since pCell might be auxiliary
6607   ** buffer space that is separate from the pPage buffer area */
6608   assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize]
6609             || sqlite3PagerIswriteable(pPage->pDbPage) );
6610 
6611   /* Fill in the header. */
6612   nHeader = pPage->childPtrSize;
6613   if( pPage->intKey ){
6614     nPayload = pX->nData + pX->nZero;
6615     pSrc = pX->pData;
6616     nSrc = pX->nData;
6617     assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
6618     nHeader += putVarint32(&pCell[nHeader], nPayload);
6619     nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
6620   }else{
6621     assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
6622     nSrc = nPayload = (int)pX->nKey;
6623     pSrc = pX->pKey;
6624     nHeader += putVarint32(&pCell[nHeader], nPayload);
6625   }
6626 
6627   /* Fill in the payload */
6628   pPayload = &pCell[nHeader];
6629   if( nPayload<=pPage->maxLocal ){
6630     /* This is the common case where everything fits on the btree page
6631     ** and no overflow pages are required. */
6632     n = nHeader + nPayload;
6633     testcase( n==3 );
6634     testcase( n==4 );
6635     if( n<4 ) n = 4;
6636     *pnSize = n;
6637     assert( nSrc<=nPayload );
6638     testcase( nSrc<nPayload );
6639     memcpy(pPayload, pSrc, nSrc);
6640     memset(pPayload+nSrc, 0, nPayload-nSrc);
6641     return SQLITE_OK;
6642   }
6643 
6644   /* If we reach this point, it means that some of the content will need
6645   ** to spill onto overflow pages.
6646   */
6647   mn = pPage->minLocal;
6648   n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
6649   testcase( n==pPage->maxLocal );
6650   testcase( n==pPage->maxLocal+1 );
6651   if( n > pPage->maxLocal ) n = mn;
6652   spaceLeft = n;
6653   *pnSize = n + nHeader + 4;
6654   pPrior = &pCell[nHeader+n];
6655   pToRelease = 0;
6656   pgnoOvfl = 0;
6657   pBt = pPage->pBt;
6658 
6659   /* At this point variables should be set as follows:
6660   **
6661   **   nPayload           Total payload size in bytes
6662   **   pPayload           Begin writing payload here
6663   **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
6664   **                      that means content must spill into overflow pages.
6665   **   *pnSize            Size of the local cell (not counting overflow pages)
6666   **   pPrior             Where to write the pgno of the first overflow page
6667   **
6668   ** Use a call to btreeParseCellPtr() to verify that the values above
6669   ** were computed correctly.
6670   */
6671 #ifdef SQLITE_DEBUG
6672   {
6673     CellInfo info;
6674     pPage->xParseCell(pPage, pCell, &info);
6675     assert( nHeader==(int)(info.pPayload - pCell) );
6676     assert( info.nKey==pX->nKey );
6677     assert( *pnSize == info.nSize );
6678     assert( spaceLeft == info.nLocal );
6679   }
6680 #endif
6681 
6682   /* Write the payload into the local Cell and any extra into overflow pages */
6683   while( 1 ){
6684     n = nPayload;
6685     if( n>spaceLeft ) n = spaceLeft;
6686 
6687     /* If pToRelease is not zero than pPayload points into the data area
6688     ** of pToRelease.  Make sure pToRelease is still writeable. */
6689     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6690 
6691     /* If pPayload is part of the data area of pPage, then make sure pPage
6692     ** is still writeable */
6693     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
6694             || sqlite3PagerIswriteable(pPage->pDbPage) );
6695 
6696     if( nSrc>=n ){
6697       memcpy(pPayload, pSrc, n);
6698     }else if( nSrc>0 ){
6699       n = nSrc;
6700       memcpy(pPayload, pSrc, n);
6701     }else{
6702       memset(pPayload, 0, n);
6703     }
6704     nPayload -= n;
6705     if( nPayload<=0 ) break;
6706     pPayload += n;
6707     pSrc += n;
6708     nSrc -= n;
6709     spaceLeft -= n;
6710     if( spaceLeft==0 ){
6711       MemPage *pOvfl = 0;
6712 #ifndef SQLITE_OMIT_AUTOVACUUM
6713       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
6714       if( pBt->autoVacuum ){
6715         do{
6716           pgnoOvfl++;
6717         } while(
6718           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
6719         );
6720       }
6721 #endif
6722       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
6723 #ifndef SQLITE_OMIT_AUTOVACUUM
6724       /* If the database supports auto-vacuum, and the second or subsequent
6725       ** overflow page is being allocated, add an entry to the pointer-map
6726       ** for that page now.
6727       **
6728       ** If this is the first overflow page, then write a partial entry
6729       ** to the pointer-map. If we write nothing to this pointer-map slot,
6730       ** then the optimistic overflow chain processing in clearCell()
6731       ** may misinterpret the uninitialized values and delete the
6732       ** wrong pages from the database.
6733       */
6734       if( pBt->autoVacuum && rc==SQLITE_OK ){
6735         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
6736         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
6737         if( rc ){
6738           releasePage(pOvfl);
6739         }
6740       }
6741 #endif
6742       if( rc ){
6743         releasePage(pToRelease);
6744         return rc;
6745       }
6746 
6747       /* If pToRelease is not zero than pPrior points into the data area
6748       ** of pToRelease.  Make sure pToRelease is still writeable. */
6749       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6750 
6751       /* If pPrior is part of the data area of pPage, then make sure pPage
6752       ** is still writeable */
6753       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
6754             || sqlite3PagerIswriteable(pPage->pDbPage) );
6755 
6756       put4byte(pPrior, pgnoOvfl);
6757       releasePage(pToRelease);
6758       pToRelease = pOvfl;
6759       pPrior = pOvfl->aData;
6760       put4byte(pPrior, 0);
6761       pPayload = &pOvfl->aData[4];
6762       spaceLeft = pBt->usableSize - 4;
6763     }
6764   }
6765   releasePage(pToRelease);
6766   return SQLITE_OK;
6767 }
6768 
6769 /*
6770 ** Remove the i-th cell from pPage.  This routine effects pPage only.
6771 ** The cell content is not freed or deallocated.  It is assumed that
6772 ** the cell content has been copied someplace else.  This routine just
6773 ** removes the reference to the cell from pPage.
6774 **
6775 ** "sz" must be the number of bytes in the cell.
6776 */
6777 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
6778   u32 pc;         /* Offset to cell content of cell being deleted */
6779   u8 *data;       /* pPage->aData */
6780   u8 *ptr;        /* Used to move bytes around within data[] */
6781   int rc;         /* The return code */
6782   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
6783 
6784   if( *pRC ) return;
6785   assert( idx>=0 && idx<pPage->nCell );
6786   assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
6787   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6788   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6789   assert( pPage->nFree>=0 );
6790   data = pPage->aData;
6791   ptr = &pPage->aCellIdx[2*idx];
6792   pc = get2byte(ptr);
6793   hdr = pPage->hdrOffset;
6794   testcase( pc==get2byte(&data[hdr+5]) );
6795   testcase( pc+sz==pPage->pBt->usableSize );
6796   if( pc+sz > pPage->pBt->usableSize ){
6797     *pRC = SQLITE_CORRUPT_BKPT;
6798     return;
6799   }
6800   rc = freeSpace(pPage, pc, sz);
6801   if( rc ){
6802     *pRC = rc;
6803     return;
6804   }
6805   pPage->nCell--;
6806   if( pPage->nCell==0 ){
6807     memset(&data[hdr+1], 0, 4);
6808     data[hdr+7] = 0;
6809     put2byte(&data[hdr+5], pPage->pBt->usableSize);
6810     pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
6811                        - pPage->childPtrSize - 8;
6812   }else{
6813     memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
6814     put2byte(&data[hdr+3], pPage->nCell);
6815     pPage->nFree += 2;
6816   }
6817 }
6818 
6819 /*
6820 ** Insert a new cell on pPage at cell index "i".  pCell points to the
6821 ** content of the cell.
6822 **
6823 ** If the cell content will fit on the page, then put it there.  If it
6824 ** will not fit, then make a copy of the cell content into pTemp if
6825 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
6826 ** in pPage->apOvfl[] and make it point to the cell content (either
6827 ** in pTemp or the original pCell) and also record its index.
6828 ** Allocating a new entry in pPage->aCell[] implies that
6829 ** pPage->nOverflow is incremented.
6830 **
6831 ** *pRC must be SQLITE_OK when this routine is called.
6832 */
6833 static void insertCell(
6834   MemPage *pPage,   /* Page into which we are copying */
6835   int i,            /* New cell becomes the i-th cell of the page */
6836   u8 *pCell,        /* Content of the new cell */
6837   int sz,           /* Bytes of content in pCell */
6838   u8 *pTemp,        /* Temp storage space for pCell, if needed */
6839   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
6840   int *pRC          /* Read and write return code from here */
6841 ){
6842   int idx = 0;      /* Where to write new cell content in data[] */
6843   int j;            /* Loop counter */
6844   u8 *data;         /* The content of the whole page */
6845   u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
6846 
6847   assert( *pRC==SQLITE_OK );
6848   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
6849   assert( MX_CELL(pPage->pBt)<=10921 );
6850   assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
6851   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
6852   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
6853   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6854   assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
6855   assert( pPage->nFree>=0 );
6856   if( pPage->nOverflow || sz+2>pPage->nFree ){
6857     if( pTemp ){
6858       memcpy(pTemp, pCell, sz);
6859       pCell = pTemp;
6860     }
6861     if( iChild ){
6862       put4byte(pCell, iChild);
6863     }
6864     j = pPage->nOverflow++;
6865     /* Comparison against ArraySize-1 since we hold back one extra slot
6866     ** as a contingency.  In other words, never need more than 3 overflow
6867     ** slots but 4 are allocated, just to be safe. */
6868     assert( j < ArraySize(pPage->apOvfl)-1 );
6869     pPage->apOvfl[j] = pCell;
6870     pPage->aiOvfl[j] = (u16)i;
6871 
6872     /* When multiple overflows occur, they are always sequential and in
6873     ** sorted order.  This invariants arise because multiple overflows can
6874     ** only occur when inserting divider cells into the parent page during
6875     ** balancing, and the dividers are adjacent and sorted.
6876     */
6877     assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
6878     assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
6879   }else{
6880     int rc = sqlite3PagerWrite(pPage->pDbPage);
6881     if( rc!=SQLITE_OK ){
6882       *pRC = rc;
6883       return;
6884     }
6885     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6886     data = pPage->aData;
6887     assert( &data[pPage->cellOffset]==pPage->aCellIdx );
6888     rc = allocateSpace(pPage, sz, &idx);
6889     if( rc ){ *pRC = rc; return; }
6890     /* The allocateSpace() routine guarantees the following properties
6891     ** if it returns successfully */
6892     assert( idx >= 0 );
6893     assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
6894     assert( idx+sz <= (int)pPage->pBt->usableSize );
6895     pPage->nFree -= (u16)(2 + sz);
6896     if( iChild ){
6897       /* In a corrupt database where an entry in the cell index section of
6898       ** a btree page has a value of 3 or less, the pCell value might point
6899       ** as many as 4 bytes in front of the start of the aData buffer for
6900       ** the source page.  Make sure this does not cause problems by not
6901       ** reading the first 4 bytes */
6902       memcpy(&data[idx+4], pCell+4, sz-4);
6903       put4byte(&data[idx], iChild);
6904     }else{
6905       memcpy(&data[idx], pCell, sz);
6906     }
6907     pIns = pPage->aCellIdx + i*2;
6908     memmove(pIns+2, pIns, 2*(pPage->nCell - i));
6909     put2byte(pIns, idx);
6910     pPage->nCell++;
6911     /* increment the cell count */
6912     if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
6913     assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
6914 #ifndef SQLITE_OMIT_AUTOVACUUM
6915     if( pPage->pBt->autoVacuum ){
6916       /* The cell may contain a pointer to an overflow page. If so, write
6917       ** the entry for the overflow page into the pointer map.
6918       */
6919       ptrmapPutOvflPtr(pPage, pPage, pCell, pRC);
6920     }
6921 #endif
6922   }
6923 }
6924 
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The value of NN chosen below (1) appears to give the best results
** overall.
**
** (Later:) The description above makes it seem as if these values are
** tunable - as if you could change them and recompile and it would all work.
** But that is unlikely.  NB has been 3 since the inception of SQLite and
** we have never tested any other value.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB 3             /* (NN*2+1): Total pages involved in the balance */
6944 
/*
** A CellArray object contains a cache of pointers and sizes for a
** consecutive sequence of cells that might be held on multiple pages.
**
** The cells in this array are the divider cell or cells from the pParent
** page plus up to three child pages.  There are a total of nCell cells.
**
** pRef is a pointer to one of the pages that contributes cells.  This is
** used to access information such as MemPage.intKey and MemPage.pBt->pageSize
** which should be common to all pages that contribute cells to this array.
**
** apCell[] and szCell[] hold, respectively, pointers to the start of each
** cell and the size of each cell.  Some of the apCell[] pointers might refer
** to overflow cells.  In other words, some apCell[] pointers might not point
** to the content area of the pages.
**
** A szCell[] of zero means the size of that cell has not yet been computed.
**
** The cells come from as many as four different pages:
**
**             -----------
**             | Parent  |
**             -----------
**            /     |     \
**           /      |      \
**  ---------   ---------   ---------
**  |Child-1|   |Child-2|   |Child-3|
**  ---------   ---------   ---------
**
** The order of cells in the array for an index btree is:
**
**       1.  All cells from Child-1 in order
**       2.  The first divider cell from Parent
**       3.  All cells from Child-2 in order
**       4.  The second divider cell from Parent
**       5.  All cells from Child-3 in order
**
** For a table-btree (with rowids) the items 2 and 4 are empty because
** content exists only in leaves and there are no divider cells.
**
** For an index btree, the apEnd[] array holds pointers to the end of page
** for Child-1, the Parent, Child-2, the Parent (again), and Child-3,
** respectively. The ixNx[] array holds the number of cells contained in
** each of these 5 stages, and all stages to the left.  Hence:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 plus 1 for first divider.
**    ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider.
**    ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells
**    ixNx[4] = Total number of cells.
**
** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2]
** are used and they point to the leaf pages only, and the ixNx values are:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 and Child-2.
**    ixNx[2] = Total number of cells.
**
** Sometimes when deleting, a child page can have zero cells.  In those
** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[]
** entries, shift down.  The end result is that each ixNx[] entry should
** be larger than the previous one.
*/
typedef struct CellArray CellArray;
struct CellArray {
  int nCell;              /* Number of cells in apCell[] */
  MemPage *pRef;          /* Reference page */
  u8 **apCell;            /* All cells being balanced */
  u16 *szCell;            /* Local size of all cells in apCell[] */
  u8 *apEnd[NB*2];        /* MemPage.aDataEnd values */
  int ixNx[NB*2];         /* Cell index at which we move to the next apEnd[] */
};
7017 
7018 /*
7019 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
7020 ** computed.
7021 */
7022 static void populateCellCache(CellArray *p, int idx, int N){
7023   assert( idx>=0 && idx+N<=p->nCell );
7024   while( N>0 ){
7025     assert( p->apCell[idx]!=0 );
7026     if( p->szCell[idx]==0 ){
7027       p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
7028     }else{
7029       assert( CORRUPT_DB ||
7030               p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
7031     }
7032     idx++;
7033     N--;
7034   }
7035 }
7036 
7037 /*
7038 ** Return the size of the Nth element of the cell array
7039 */
7040 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
7041   assert( N>=0 && N<p->nCell );
7042   assert( p->szCell[N]==0 );
7043   p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
7044   return p->szCell[N];
7045 }
7046 static u16 cachedCellSize(CellArray *p, int N){
7047   assert( N>=0 && N<p->nCell );
7048   if( p->szCell[N] ) return p->szCell[N];
7049   return computeCellSize(p, N);
7050 }
7051 
7052 /*
7053 ** Array apCell[] contains pointers to nCell b-tree page cells. The
7054 ** szCell[] array contains the size in bytes of each cell. This function
7055 ** replaces the current contents of page pPg with the contents of the cell
7056 ** array.
7057 **
7058 ** Some of the cells in apCell[] may currently be stored in pPg. This
7059 ** function works around problems caused by this by making a copy of any
7060 ** such cells before overwriting the page data.
7061 **
7062 ** The MemPage.nFree field is invalidated by this function. It is the
7063 ** responsibility of the caller to set it correctly.
7064 */
7065 static int rebuildPage(
7066   CellArray *pCArray,             /* Content to be added to page pPg */
7067   int iFirst,                     /* First cell in pCArray to use */
7068   int nCell,                      /* Final number of cells on page */
7069   MemPage *pPg                    /* The page to be reconstructed */
7070 ){
7071   const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
7072   u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
7073   const int usableSize = pPg->pBt->usableSize;
7074   u8 * const pEnd = &aData[usableSize];
7075   int i = iFirst;                 /* Which cell to copy from pCArray*/
7076   u32 j;                          /* Start of cell content area */
7077   int iEnd = i+nCell;             /* Loop terminator */
7078   u8 *pCellptr = pPg->aCellIdx;
7079   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
7080   u8 *pData;
7081   int k;                          /* Current slot in pCArray->apEnd[] */
7082   u8 *pSrcEnd;                    /* Current pCArray->apEnd[k] value */
7083 
7084   assert( i<iEnd );
7085   j = get2byte(&aData[hdr+5]);
7086   if( NEVER(j>(u32)usableSize) ){ j = 0; }
7087   memcpy(&pTmp[j], &aData[j], usableSize - j);
7088 
7089   for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){}
7090   pSrcEnd = pCArray->apEnd[k];
7091 
7092   pData = pEnd;
7093   while( 1/*exit by break*/ ){
7094     u8 *pCell = pCArray->apCell[i];
7095     u16 sz = pCArray->szCell[i];
7096     assert( sz>0 );
7097     if( SQLITE_WITHIN(pCell,aData+j,pEnd) ){
7098       if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT;
7099       pCell = &pTmp[pCell - aData];
7100     }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd
7101            && (uptr)(pCell)<(uptr)pSrcEnd
7102     ){
7103       return SQLITE_CORRUPT_BKPT;
7104     }
7105 
7106     pData -= sz;
7107     put2byte(pCellptr, (pData - aData));
7108     pCellptr += 2;
7109     if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
7110     memmove(pData, pCell, sz);
7111     assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
7112     i++;
7113     if( i>=iEnd ) break;
7114     if( pCArray->ixNx[k]<=i ){
7115       k++;
7116       pSrcEnd = pCArray->apEnd[k];
7117     }
7118   }
7119 
7120   /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
7121   pPg->nCell = nCell;
7122   pPg->nOverflow = 0;
7123 
7124   put2byte(&aData[hdr+1], 0);
7125   put2byte(&aData[hdr+3], pPg->nCell);
7126   put2byte(&aData[hdr+5], pData - aData);
7127   aData[hdr+7] = 0x00;
7128   return SQLITE_OK;
7129 }
7130 
7131 /*
7132 ** The pCArray objects contains pointers to b-tree cells and the cell sizes.
7133 ** This function attempts to add the cells stored in the array to page pPg.
7134 ** If it cannot (because the page needs to be defragmented before the cells
7135 ** will fit), non-zero is returned. Otherwise, if the cells are added
7136 ** successfully, zero is returned.
7137 **
7138 ** Argument pCellptr points to the first entry in the cell-pointer array
7139 ** (part of page pPg) to populate. After cell apCell[0] is written to the
7140 ** page body, a 16-bit offset is written to pCellptr. And so on, for each
7141 ** cell in the array. It is the responsibility of the caller to ensure
7142 ** that it is safe to overwrite this part of the cell-pointer array.
7143 **
7144 ** When this function is called, *ppData points to the start of the
7145 ** content area on page pPg. If the size of the content area is extended,
7146 ** *ppData is updated to point to the new start of the content area
7147 ** before returning.
7148 **
7149 ** Finally, argument pBegin points to the byte immediately following the
7150 ** end of the space required by this page for the cell-pointer area (for
7151 ** all cells - not just those inserted by the current call). If the content
7152 ** area must be extended to before this point in order to accomodate all
7153 ** cells in apCell[], then the cells do not fit and non-zero is returned.
7154 */
7155 static int pageInsertArray(
7156   MemPage *pPg,                   /* Page to add cells to */
7157   u8 *pBegin,                     /* End of cell-pointer array */
7158   u8 **ppData,                    /* IN/OUT: Page content-area pointer */
7159   u8 *pCellptr,                   /* Pointer to cell-pointer area */
7160   int iFirst,                     /* Index of first cell to add */
7161   int nCell,                      /* Number of cells to add to pPg */
7162   CellArray *pCArray              /* Array of cells */
7163 ){
7164   int i = iFirst;                 /* Loop counter - cell index to insert */
7165   u8 *aData = pPg->aData;         /* Complete page */
7166   u8 *pData = *ppData;            /* Content area.  A subset of aData[] */
7167   int iEnd = iFirst + nCell;      /* End of loop. One past last cell to ins */
7168   int k;                          /* Current slot in pCArray->apEnd[] */
7169   u8 *pEnd;                       /* Maximum extent of cell data */
7170   assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
7171   if( iEnd<=iFirst ) return 0;
7172   for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){}
7173   pEnd = pCArray->apEnd[k];
7174   while( 1 /*Exit by break*/ ){
7175     int sz, rc;
7176     u8 *pSlot;
7177     assert( pCArray->szCell[i]!=0 );
7178     sz = pCArray->szCell[i];
7179     if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
7180       if( (pData - pBegin)<sz ) return 1;
7181       pData -= sz;
7182       pSlot = pData;
7183     }
7184     /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
7185     ** database.  But they might for a corrupt database.  Hence use memmove()
7186     ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
7187     assert( (pSlot+sz)<=pCArray->apCell[i]
7188          || pSlot>=(pCArray->apCell[i]+sz)
7189          || CORRUPT_DB );
7190     if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd
7191      && (uptr)(pCArray->apCell[i])<(uptr)pEnd
7192     ){
7193       assert( CORRUPT_DB );
7194       (void)SQLITE_CORRUPT_BKPT;
7195       return 1;
7196     }
7197     memmove(pSlot, pCArray->apCell[i], sz);
7198     put2byte(pCellptr, (pSlot - aData));
7199     pCellptr += 2;
7200     i++;
7201     if( i>=iEnd ) break;
7202     if( pCArray->ixNx[k]<=i ){
7203       k++;
7204       pEnd = pCArray->apEnd[k];
7205     }
7206   }
7207   *ppData = pData;
7208   return 0;
7209 }
7210 
7211 /*
7212 ** The pCArray object contains pointers to b-tree cells and their sizes.
7213 **
7214 ** This function adds the space associated with each cell in the array
7215 ** that is currently stored within the body of pPg to the pPg free-list.
7216 ** The cell-pointers and other fields of the page are not updated.
7217 **
7218 ** This function returns the total number of cells added to the free-list.
7219 */
7220 static int pageFreeArray(
7221   MemPage *pPg,                   /* Page to edit */
7222   int iFirst,                     /* First cell to delete */
7223   int nCell,                      /* Cells to delete */
7224   CellArray *pCArray              /* Array of cells */
7225 ){
7226   u8 * const aData = pPg->aData;
7227   u8 * const pEnd = &aData[pPg->pBt->usableSize];
7228   u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
7229   int nRet = 0;
7230   int i;
7231   int iEnd = iFirst + nCell;
7232   u8 *pFree = 0;
7233   int szFree = 0;
7234 
7235   for(i=iFirst; i<iEnd; i++){
7236     u8 *pCell = pCArray->apCell[i];
7237     if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
7238       int sz;
7239       /* No need to use cachedCellSize() here.  The sizes of all cells that
7240       ** are to be freed have already been computing while deciding which
7241       ** cells need freeing */
7242       sz = pCArray->szCell[i];  assert( sz>0 );
7243       if( pFree!=(pCell + sz) ){
7244         if( pFree ){
7245           assert( pFree>aData && (pFree - aData)<65536 );
7246           freeSpace(pPg, (u16)(pFree - aData), szFree);
7247         }
7248         pFree = pCell;
7249         szFree = sz;
7250         if( pFree+sz>pEnd ){
7251           return 0;
7252         }
7253       }else{
7254         pFree = pCell;
7255         szFree += sz;
7256       }
7257       nRet++;
7258     }
7259   }
7260   if( pFree ){
7261     assert( pFree>aData && (pFree - aData)<65536 );
7262     freeSpace(pPg, (u16)(pFree - aData), szFree);
7263   }
7264   return nRet;
7265 }
7266 
7267 /*
7268 ** pCArray contains pointers to and sizes of all cells in the page being
7269 ** balanced.  The current page, pPg, has pPg->nCell cells starting with
7270 ** pCArray->apCell[iOld].  After balancing, this page should hold nNew cells
7271 ** starting at apCell[iNew].
7272 **
7273 ** This routine makes the necessary adjustments to pPg so that it contains
7274 ** the correct cells after being balanced.
7275 **
7276 ** The pPg->nFree field is invalid when this function returns. It is the
7277 ** responsibility of the caller to set it correctly.
7278 */
7279 static int editPage(
7280   MemPage *pPg,                   /* Edit this page */
7281   int iOld,                       /* Index of first cell currently on page */
7282   int iNew,                       /* Index of new first cell on page */
7283   int nNew,                       /* Final number of cells on page */
7284   CellArray *pCArray              /* Array of cells and sizes */
7285 ){
7286   u8 * const aData = pPg->aData;
7287   const int hdr = pPg->hdrOffset;
7288   u8 *pBegin = &pPg->aCellIdx[nNew * 2];
7289   int nCell = pPg->nCell;       /* Cells stored on pPg */
7290   u8 *pData;
7291   u8 *pCellptr;
7292   int i;
7293   int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
7294   int iNewEnd = iNew + nNew;
7295 
7296 #ifdef SQLITE_DEBUG
7297   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
7298   memcpy(pTmp, aData, pPg->pBt->usableSize);
7299 #endif
7300 
7301   /* Remove cells from the start and end of the page */
7302   assert( nCell>=0 );
7303   if( iOld<iNew ){
7304     int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
7305     if( NEVER(nShift>nCell) ) return SQLITE_CORRUPT_BKPT;
7306     memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
7307     nCell -= nShift;
7308   }
7309   if( iNewEnd < iOldEnd ){
7310     int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
7311     assert( nCell>=nTail );
7312     nCell -= nTail;
7313   }
7314 
7315   pData = &aData[get2byteNotZero(&aData[hdr+5])];
7316   if( pData<pBegin ) goto editpage_fail;
7317   if( NEVER(pData>pPg->aDataEnd) ) goto editpage_fail;
7318 
7319   /* Add cells to the start of the page */
7320   if( iNew<iOld ){
7321     int nAdd = MIN(nNew,iOld-iNew);
7322     assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
7323     assert( nAdd>=0 );
7324     pCellptr = pPg->aCellIdx;
7325     memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
7326     if( pageInsertArray(
7327           pPg, pBegin, &pData, pCellptr,
7328           iNew, nAdd, pCArray
7329     ) ) goto editpage_fail;
7330     nCell += nAdd;
7331   }
7332 
7333   /* Add any overflow cells */
7334   for(i=0; i<pPg->nOverflow; i++){
7335     int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
7336     if( iCell>=0 && iCell<nNew ){
7337       pCellptr = &pPg->aCellIdx[iCell * 2];
7338       if( nCell>iCell ){
7339         memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
7340       }
7341       nCell++;
7342       cachedCellSize(pCArray, iCell+iNew);
7343       if( pageInsertArray(
7344             pPg, pBegin, &pData, pCellptr,
7345             iCell+iNew, 1, pCArray
7346       ) ) goto editpage_fail;
7347     }
7348   }
7349 
7350   /* Append cells to the end of the page */
7351   assert( nCell>=0 );
7352   pCellptr = &pPg->aCellIdx[nCell*2];
7353   if( pageInsertArray(
7354         pPg, pBegin, &pData, pCellptr,
7355         iNew+nCell, nNew-nCell, pCArray
7356   ) ) goto editpage_fail;
7357 
7358   pPg->nCell = nNew;
7359   pPg->nOverflow = 0;
7360 
7361   put2byte(&aData[hdr+3], pPg->nCell);
7362   put2byte(&aData[hdr+5], pData - aData);
7363 
7364 #ifdef SQLITE_DEBUG
7365   for(i=0; i<nNew && !CORRUPT_DB; i++){
7366     u8 *pCell = pCArray->apCell[i+iNew];
7367     int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
7368     if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
7369       pCell = &pTmp[pCell - aData];
7370     }
7371     assert( 0==memcmp(pCell, &aData[iOff],
7372             pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
7373   }
7374 #endif
7375 
7376   return SQLITE_OK;
7377  editpage_fail:
7378   /* Unable to edit this page. Rebuild it from scratch instead. */
7379   populateCellCache(pCArray, iNew, nNew);
7380   return rebuildPage(pCArray, iNew, nNew, pPg);
7381 }
7382 
7383 
7384 #ifndef SQLITE_OMIT_QUICKBALANCE
7385 /*
7386 ** This version of balance() handles the common special case where
7387 ** a new entry is being inserted on the extreme right-end of the
7388 ** tree, in other words, when the new entry will become the largest
7389 ** entry in the tree.
7390 **
7391 ** Instead of trying to balance the 3 right-most leaf pages, just add
7392 ** a new page to the right-hand side and put the one new entry in
7393 ** that page.  This leaves the right side of the tree somewhat
7394 ** unbalanced.  But odds are that we will be inserting new entries
7395 ** at the end soon afterwards so the nearly empty page will quickly
7396 ** fill up.  On average.
7397 **
7398 ** pPage is the leaf page which is the right-most page in the tree.
7399 ** pParent is its parent.  pPage must have a single overflow entry
7400 ** which is also the right-most entry on the page.
7401 **
7402 ** The pSpace buffer is used to store a temporary copy of the divider
7403 ** cell that will be inserted into pParent. Such a cell consists of a 4
7404 ** byte page number followed by a variable length integer. In other
7405 ** words, at most 13 bytes. Hence the pSpace buffer must be at
7406 ** least 13 bytes in size.
7407 */
7408 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
7409   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
7410   MemPage *pNew;                       /* Newly allocated page */
7411   int rc;                              /* Return Code */
7412   Pgno pgnoNew;                        /* Page number of pNew */
7413 
7414   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
7415   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7416   assert( pPage->nOverflow==1 );
7417 
7418   if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;  /* dbfuzz001.test */
7419   assert( pPage->nFree>=0 );
7420   assert( pParent->nFree>=0 );
7421 
7422   /* Allocate a new page. This page will become the right-sibling of
7423   ** pPage. Make the parent page writable, so that the new divider cell
7424   ** may be inserted. If both these operations are successful, proceed.
7425   */
7426   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
7427 
7428   if( rc==SQLITE_OK ){
7429 
7430     u8 *pOut = &pSpace[4];
7431     u8 *pCell = pPage->apOvfl[0];
7432     u16 szCell = pPage->xCellSize(pPage, pCell);
7433     u8 *pStop;
7434     CellArray b;
7435 
7436     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
7437     assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
7438     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
7439     b.nCell = 1;
7440     b.pRef = pPage;
7441     b.apCell = &pCell;
7442     b.szCell = &szCell;
7443     b.apEnd[0] = pPage->aDataEnd;
7444     b.ixNx[0] = 2;
7445     rc = rebuildPage(&b, 0, 1, pNew);
7446     if( NEVER(rc) ){
7447       releasePage(pNew);
7448       return rc;
7449     }
7450     pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
7451 
7452     /* If this is an auto-vacuum database, update the pointer map
7453     ** with entries for the new page, and any pointer from the
7454     ** cell on the page to an overflow page. If either of these
7455     ** operations fails, the return code is set, but the contents
7456     ** of the parent page are still manipulated by thh code below.
7457     ** That is Ok, at this point the parent page is guaranteed to
7458     ** be marked as dirty. Returning an error code will cause a
7459     ** rollback, undoing any changes made to the parent page.
7460     */
7461     if( ISAUTOVACUUM ){
7462       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
7463       if( szCell>pNew->minLocal ){
7464         ptrmapPutOvflPtr(pNew, pNew, pCell, &rc);
7465       }
7466     }
7467 
7468     /* Create a divider cell to insert into pParent. The divider cell
7469     ** consists of a 4-byte page number (the page number of pPage) and
7470     ** a variable length key value (which must be the same value as the
7471     ** largest key on pPage).
7472     **
7473     ** To find the largest key value on pPage, first find the right-most
7474     ** cell on pPage. The first two fields of this cell are the
7475     ** record-length (a variable length integer at most 32-bits in size)
7476     ** and the key value (a variable length integer, may have any value).
7477     ** The first of the while(...) loops below skips over the record-length
7478     ** field. The second while(...) loop copies the key value from the
7479     ** cell on pPage into the pSpace buffer.
7480     */
7481     pCell = findCell(pPage, pPage->nCell-1);
7482     pStop = &pCell[9];
7483     while( (*(pCell++)&0x80) && pCell<pStop );
7484     pStop = &pCell[9];
7485     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
7486 
7487     /* Insert the new divider cell into pParent. */
7488     if( rc==SQLITE_OK ){
7489       insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
7490                    0, pPage->pgno, &rc);
7491     }
7492 
7493     /* Set the right-child pointer of pParent to point to the new page. */
7494     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
7495 
7496     /* Release the reference to the new page. */
7497     releasePage(pNew);
7498   }
7499 
7500   return rc;
7501 }
7502 #endif /* SQLITE_OMIT_QUICKBALANCE */
7503 
#if 0
/*
** This routine plays no part in the normal operation of SQLite.  It is
** compiled out, and is only enabled temporarily while debugging the code
** that sets pointer-map entries.
**
** For each of the nPage pages in apPage[], assert that the pointer-map
** entry of every child page and of every first overflow page referenced
** from the page names that page as its parent.  Always returns 1.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int iPg;
  for(iPg=0; iPg<nPage; iPg++){
    MemPage *pPage = apPage[iPg];
    BtShared *pBt = pPage->pBt;
    Pgno pgnoParent;              /* Parent page read from the pointer map */
    u8 eType;                     /* Entry type read from the pointer map */
    int iCell;

    assert( pPage->isInit );

    for(iCell=0; iCell<pPage->nCell; iCell++){
      CellInfo info;
      u8 *pCell = findCell(pPage, iCell);

      pPage->xParseCell(pPage, pCell, &info);
      if( info.nLocal<info.nPayload ){
        /* The overflow page number is stored in the last 4 bytes of the cell */
        Pgno pgnoOvfl = get4byte(&pCell[info.nSize-4]);
        ptrmapGet(pBt, pgnoOvfl, &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_OVERFLOW1 );
      }
      if( !pPage->leaf ){
        Pgno pgnoChild = get4byte(pCell);
        ptrmapGet(pBt, pgnoChild, &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
      }
    }

    /* The right-most child pointer is stored in the page header */
    if( !pPage->leaf ){
      Pgno pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
      ptrmapGet(pBt, pgnoChild, &eType, &pgnoParent);
      assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
    }
  }
  return 1;
}
#endif
7545 
7546 /*
7547 ** This function is used to copy the contents of the b-tree node stored
7548 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
7549 ** the pointer-map entries for each child page are updated so that the
7550 ** parent page stored in the pointer map is page pTo. If pFrom contained
7551 ** any cells with overflow page pointers, then the corresponding pointer
7552 ** map entries are also updated so that the parent page is page pTo.
7553 **
7554 ** If pFrom is currently carrying any overflow cells (entries in the
7555 ** MemPage.apOvfl[] array), they are not copied to pTo.
7556 **
7557 ** Before returning, page pTo is reinitialized using btreeInitPage().
7558 **
7559 ** The performance of this function is not critical. It is only used by
7560 ** the balance_shallower() and balance_deeper() procedures, neither of
7561 ** which are called often under normal circumstances.
7562 */
7563 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
7564   if( (*pRC)==SQLITE_OK ){
7565     BtShared * const pBt = pFrom->pBt;
7566     u8 * const aFrom = pFrom->aData;
7567     u8 * const aTo = pTo->aData;
7568     int const iFromHdr = pFrom->hdrOffset;
7569     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
7570     int rc;
7571     int iData;
7572 
7573 
7574     assert( pFrom->isInit );
7575     assert( pFrom->nFree>=iToHdr );
7576     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
7577 
7578     /* Copy the b-tree node content from page pFrom to page pTo. */
7579     iData = get2byte(&aFrom[iFromHdr+5]);
7580     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
7581     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
7582 
7583     /* Reinitialize page pTo so that the contents of the MemPage structure
7584     ** match the new data. The initialization of pTo can actually fail under
7585     ** fairly obscure circumstances, even though it is a copy of initialized
7586     ** page pFrom.
7587     */
7588     pTo->isInit = 0;
7589     rc = btreeInitPage(pTo);
7590     if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo);
7591     if( rc!=SQLITE_OK ){
7592       *pRC = rc;
7593       return;
7594     }
7595 
7596     /* If this is an auto-vacuum database, update the pointer-map entries
7597     ** for any b-tree or overflow pages that pTo now contains the pointers to.
7598     */
7599     if( ISAUTOVACUUM ){
7600       *pRC = setChildPtrmaps(pTo);
7601     }
7602   }
7603 }
7604 
7605 /*
7606 ** This routine redistributes cells on the iParentIdx'th child of pParent
7607 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
7608 ** same amount of free space. Usually a single sibling on either side of the
** page is used in the balancing, though both siblings might come from one
7610 ** side if the page is the first or last child of its parent. If the page
7611 ** has fewer than 2 siblings (something which can only happen if the page
7612 ** is a root page or a child of a root page) then all available siblings
7613 ** participate in the balancing.
7614 **
7615 ** The number of siblings of the page might be increased or decreased by
7616 ** one or two in an effort to keep pages nearly full but not over full.
7617 **
7618 ** Note that when this routine is called, some of the cells on the page
7619 ** might not actually be stored in MemPage.aData[]. This can happen
7620 ** if the page is overfull. This routine ensures that all cells allocated
7621 ** to the page and its siblings fit into MemPage.aData[] before returning.
7622 **
7623 ** In the course of balancing the page and its siblings, cells may be
7624 ** inserted into or removed from the parent page (pParent). Doing so
7625 ** may cause the parent page to become overfull or underfull. If this
7626 ** happens, it is the responsibility of the caller to invoke the correct
7627 ** balancing routine to fix this problem (see the balance() routine).
7628 **
7629 ** If this routine fails for any reason, it might leave the database
7630 ** in a corrupted state. So if this routine fails, the database should
7631 ** be rolled back.
7632 **
7633 ** The third argument to this function, aOvflSpace, is a pointer to a
7634 ** buffer big enough to hold one page. If while inserting cells into the parent
7635 ** page (pParent) the parent page becomes overfull, this buffer is
7636 ** used to store the parent's overflow cells. Because this function inserts
7637 ** a maximum of four divider cells into the parent page, and the maximum
7638 ** size of a cell stored within an internal node is always less than 1/4
7639 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
7640 ** enough for all overflow cells.
7641 **
7642 ** If aOvflSpace is set to a null pointer, this function returns
7643 ** SQLITE_NOMEM.
7644 */
7645 static int balance_nonroot(
7646   MemPage *pParent,               /* Parent page of siblings being balanced */
7647   int iParentIdx,                 /* Index of "the page" in pParent */
7648   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
7649   int isRoot,                     /* True if pParent is a root-page */
7650   int bBulk                       /* True if this call is part of a bulk load */
7651 ){
7652   BtShared *pBt;               /* The whole database */
7653   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
7654   int nNew = 0;                /* Number of pages in apNew[] */
7655   int nOld;                    /* Number of pages in apOld[] */
7656   int i, j, k;                 /* Loop counters */
7657   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
7658   int rc = SQLITE_OK;          /* The return code */
7659   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
7660   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
7661   int usableSpace;             /* Bytes in pPage beyond the header */
7662   int pageFlags;               /* Value of pPage->aData[0] */
7663   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
7664   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
7665   int szScratch;               /* Size of scratch memory requested */
7666   MemPage *apOld[NB];          /* pPage and up to two siblings */
7667   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
7668   u8 *pRight;                  /* Location in parent of right-sibling pointer */
7669   u8 *apDiv[NB-1];             /* Divider cells in pParent */
7670   int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
7671   int cntOld[NB+2];            /* Old index in b.apCell[] */
7672   int szNew[NB+2];             /* Combined size of cells placed on i-th page */
7673   u8 *aSpace1;                 /* Space for copies of dividers cells */
7674   Pgno pgno;                   /* Temp var to store a page number in */
7675   u8 abDone[NB+2];             /* True after i'th new page is populated */
7676   Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
7677   Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
7678   u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
7679   CellArray b;                 /* Parsed information on cells being balanced */
7680 
7681   memset(abDone, 0, sizeof(abDone));
7682   memset(&b, 0, sizeof(b));
7683   pBt = pParent->pBt;
7684   assert( sqlite3_mutex_held(pBt->mutex) );
7685   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7686 
7687   /* At this point pParent may have at most one overflow cell. And if
7688   ** this overflow cell is present, it must be the cell with
7689   ** index iParentIdx. This scenario comes about when this function
7690   ** is called (indirectly) from sqlite3BtreeDelete().
7691   */
7692   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
7693   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
7694 
7695   if( !aOvflSpace ){
7696     return SQLITE_NOMEM_BKPT;
7697   }
7698   assert( pParent->nFree>=0 );
7699 
7700   /* Find the sibling pages to balance. Also locate the cells in pParent
7701   ** that divide the siblings. An attempt is made to find NN siblings on
7702   ** either side of pPage. More siblings are taken from one side, however,
7703   ** if there are fewer than NN siblings on the other side. If pParent
7704   ** has NB or fewer children then all children of pParent are taken.
7705   **
7706   ** This loop also drops the divider cells from the parent page. This
7707   ** way, the remainder of the function does not have to deal with any
7708   ** overflow cells in the parent page, since if any existed they will
7709   ** have already been removed.
7710   */
7711   i = pParent->nOverflow + pParent->nCell;
7712   if( i<2 ){
7713     nxDiv = 0;
7714   }else{
7715     assert( bBulk==0 || bBulk==1 );
7716     if( iParentIdx==0 ){
7717       nxDiv = 0;
7718     }else if( iParentIdx==i ){
7719       nxDiv = i-2+bBulk;
7720     }else{
7721       nxDiv = iParentIdx-1;
7722     }
7723     i = 2-bBulk;
7724   }
7725   nOld = i+1;
7726   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
7727     pRight = &pParent->aData[pParent->hdrOffset+8];
7728   }else{
7729     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
7730   }
7731   pgno = get4byte(pRight);
7732   while( 1 ){
7733     if( rc==SQLITE_OK ){
7734       rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
7735     }
7736     if( rc ){
7737       memset(apOld, 0, (i+1)*sizeof(MemPage*));
7738       goto balance_cleanup;
7739     }
7740     if( apOld[i]->nFree<0 ){
7741       rc = btreeComputeFreeSpace(apOld[i]);
7742       if( rc ){
7743         memset(apOld, 0, (i)*sizeof(MemPage*));
7744         goto balance_cleanup;
7745       }
7746     }
7747     nMaxCells += apOld[i]->nCell + ArraySize(pParent->apOvfl);
7748     if( (i--)==0 ) break;
7749 
7750     if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
7751       apDiv[i] = pParent->apOvfl[0];
7752       pgno = get4byte(apDiv[i]);
7753       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7754       pParent->nOverflow = 0;
7755     }else{
7756       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
7757       pgno = get4byte(apDiv[i]);
7758       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7759 
7760       /* Drop the cell from the parent page. apDiv[i] still points to
7761       ** the cell within the parent, even though it has been dropped.
7762       ** This is safe because dropping a cell only overwrites the first
7763       ** four bytes of it, and this function does not need the first
7764       ** four bytes of the divider cell. So the pointer is safe to use
7765       ** later on.
7766       **
7767       ** But not if we are in secure-delete mode. In secure-delete mode,
7768       ** the dropCell() routine will overwrite the entire cell with zeroes.
7769       ** In this case, temporarily copy the cell into the aOvflSpace[]
7770       ** buffer. It will be copied out again as soon as the aSpace[] buffer
7771       ** is allocated.  */
7772       if( pBt->btsFlags & BTS_FAST_SECURE ){
7773         int iOff;
7774 
7775         /* If the following if() condition is not true, the db is corrupted.
7776         ** The call to dropCell() below will detect this.  */
7777         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
7778         if( (iOff+szNew[i])<=(int)pBt->usableSize ){
7779           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
7780           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
7781         }
7782       }
7783       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
7784     }
7785   }
7786 
7787   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
7788   ** alignment */
7789   nMaxCells = (nMaxCells + 3)&~3;
7790 
7791   /*
7792   ** Allocate space for memory structures
7793   */
7794   szScratch =
7795        nMaxCells*sizeof(u8*)                       /* b.apCell */
7796      + nMaxCells*sizeof(u16)                       /* b.szCell */
7797      + pBt->pageSize;                              /* aSpace1 */
7798 
7799   assert( szScratch<=7*(int)pBt->pageSize );
7800   b.apCell = sqlite3StackAllocRaw(0, szScratch );
7801   if( b.apCell==0 ){
7802     rc = SQLITE_NOMEM_BKPT;
7803     goto balance_cleanup;
7804   }
7805   b.szCell = (u16*)&b.apCell[nMaxCells];
7806   aSpace1 = (u8*)&b.szCell[nMaxCells];
7807   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
7808 
7809   /*
7810   ** Load pointers to all cells on sibling pages and the divider cells
7811   ** into the local b.apCell[] array.  Make copies of the divider cells
7812   ** into space obtained from aSpace1[]. The divider cells have already
7813   ** been removed from pParent.
7814   **
7815   ** If the siblings are on leaf pages, then the child pointers of the
7816   ** divider cells are stripped from the cells before they are copied
7817   ** into aSpace1[].  In this way, all cells in b.apCell[] are without
7818   ** child pointers.  If siblings are not leaves, then all cell in
7819   ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
7820   ** are alike.
7821   **
7822   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
7823   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
7824   */
7825   b.pRef = apOld[0];
7826   leafCorrection = b.pRef->leaf*4;
7827   leafData = b.pRef->intKeyLeaf;
7828   for(i=0; i<nOld; i++){
7829     MemPage *pOld = apOld[i];
7830     int limit = pOld->nCell;
7831     u8 *aData = pOld->aData;
7832     u16 maskPage = pOld->maskPage;
7833     u8 *piCell = aData + pOld->cellOffset;
7834     u8 *piEnd;
7835     VVA_ONLY( int nCellAtStart = b.nCell; )
7836 
7837     /* Verify that all sibling pages are of the same "type" (table-leaf,
7838     ** table-interior, index-leaf, or index-interior).
7839     */
7840     if( pOld->aData[0]!=apOld[0]->aData[0] ){
7841       rc = SQLITE_CORRUPT_BKPT;
7842       goto balance_cleanup;
7843     }
7844 
7845     /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
7846     ** contains overflow cells, include them in the b.apCell[] array
7847     ** in the correct spot.
7848     **
7849     ** Note that when there are multiple overflow cells, it is always the
7850     ** case that they are sequential and adjacent.  This invariant arises
7851     ** because multiple overflows can only occurs when inserting divider
7852     ** cells into a parent on a prior balance, and divider cells are always
7853     ** adjacent and are inserted in order.  There is an assert() tagged
7854     ** with "NOTE 1" in the overflow cell insertion loop to prove this
7855     ** invariant.
7856     **
7857     ** This must be done in advance.  Once the balance starts, the cell
7858     ** offset section of the btree page will be overwritten and we will no
7859     ** long be able to find the cells if a pointer to each cell is not saved
7860     ** first.
7861     */
7862     memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
7863     if( pOld->nOverflow>0 ){
7864       if( NEVER(limit<pOld->aiOvfl[0]) ){
7865         rc = SQLITE_CORRUPT_BKPT;
7866         goto balance_cleanup;
7867       }
7868       limit = pOld->aiOvfl[0];
7869       for(j=0; j<limit; j++){
7870         b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7871         piCell += 2;
7872         b.nCell++;
7873       }
7874       for(k=0; k<pOld->nOverflow; k++){
7875         assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
7876         b.apCell[b.nCell] = pOld->apOvfl[k];
7877         b.nCell++;
7878       }
7879     }
7880     piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
7881     while( piCell<piEnd ){
7882       assert( b.nCell<nMaxCells );
7883       b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7884       piCell += 2;
7885       b.nCell++;
7886     }
7887     assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) );
7888 
7889     cntOld[i] = b.nCell;
7890     if( i<nOld-1 && !leafData){
7891       u16 sz = (u16)szNew[i];
7892       u8 *pTemp;
7893       assert( b.nCell<nMaxCells );
7894       b.szCell[b.nCell] = sz;
7895       pTemp = &aSpace1[iSpace1];
7896       iSpace1 += sz;
7897       assert( sz<=pBt->maxLocal+23 );
7898       assert( iSpace1 <= (int)pBt->pageSize );
7899       memcpy(pTemp, apDiv[i], sz);
7900       b.apCell[b.nCell] = pTemp+leafCorrection;
7901       assert( leafCorrection==0 || leafCorrection==4 );
7902       b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
7903       if( !pOld->leaf ){
7904         assert( leafCorrection==0 );
7905         assert( pOld->hdrOffset==0 || CORRUPT_DB );
7906         /* The right pointer of the child page pOld becomes the left
7907         ** pointer of the divider cell */
7908         memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
7909       }else{
7910         assert( leafCorrection==4 );
7911         while( b.szCell[b.nCell]<4 ){
7912           /* Do not allow any cells smaller than 4 bytes. If a smaller cell
7913           ** does exist, pad it with 0x00 bytes. */
7914           assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
7915           assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
7916           aSpace1[iSpace1++] = 0x00;
7917           b.szCell[b.nCell]++;
7918         }
7919       }
7920       b.nCell++;
7921     }
7922   }
7923 
7924   /*
7925   ** Figure out the number of pages needed to hold all b.nCell cells.
7926   ** Store this number in "k".  Also compute szNew[] which is the total
7927   ** size of all cells on the i-th page and cntNew[] which is the index
7928   ** in b.apCell[] of the cell that divides page i from page i+1.
7929   ** cntNew[k] should equal b.nCell.
7930   **
7931   ** Values computed by this block:
7932   **
7933   **           k: The total number of sibling pages
7934   **    szNew[i]: Spaced used on the i-th sibling page.
7935   **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
7936   **              the right of the i-th sibling page.
7937   ** usableSpace: Number of bytes of space available on each sibling.
7938   **
7939   */
7940   usableSpace = pBt->usableSize - 12 + leafCorrection;
7941   for(i=k=0; i<nOld; i++, k++){
7942     MemPage *p = apOld[i];
7943     b.apEnd[k] = p->aDataEnd;
7944     b.ixNx[k] = cntOld[i];
7945     if( k && b.ixNx[k]==b.ixNx[k-1] ){
7946       k--;  /* Omit b.ixNx[] entry for child pages with no cells */
7947     }
7948     if( !leafData ){
7949       k++;
7950       b.apEnd[k] = pParent->aDataEnd;
7951       b.ixNx[k] = cntOld[i]+1;
7952     }
7953     assert( p->nFree>=0 );
7954     szNew[i] = usableSpace - p->nFree;
7955     for(j=0; j<p->nOverflow; j++){
7956       szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
7957     }
7958     cntNew[i] = cntOld[i];
7959   }
7960   k = nOld;
7961   for(i=0; i<k; i++){
7962     int sz;
7963     while( szNew[i]>usableSpace ){
7964       if( i+1>=k ){
7965         k = i+2;
7966         if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
7967         szNew[k-1] = 0;
7968         cntNew[k-1] = b.nCell;
7969       }
7970       sz = 2 + cachedCellSize(&b, cntNew[i]-1);
7971       szNew[i] -= sz;
7972       if( !leafData ){
7973         if( cntNew[i]<b.nCell ){
7974           sz = 2 + cachedCellSize(&b, cntNew[i]);
7975         }else{
7976           sz = 0;
7977         }
7978       }
7979       szNew[i+1] += sz;
7980       cntNew[i]--;
7981     }
7982     while( cntNew[i]<b.nCell ){
7983       sz = 2 + cachedCellSize(&b, cntNew[i]);
7984       if( szNew[i]+sz>usableSpace ) break;
7985       szNew[i] += sz;
7986       cntNew[i]++;
7987       if( !leafData ){
7988         if( cntNew[i]<b.nCell ){
7989           sz = 2 + cachedCellSize(&b, cntNew[i]);
7990         }else{
7991           sz = 0;
7992         }
7993       }
7994       szNew[i+1] -= sz;
7995     }
7996     if( cntNew[i]>=b.nCell ){
7997       k = i+1;
7998     }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
7999       rc = SQLITE_CORRUPT_BKPT;
8000       goto balance_cleanup;
8001     }
8002   }
8003 
8004   /*
8005   ** The packing computed by the previous block is biased toward the siblings
8006   ** on the left side (siblings with smaller keys). The left siblings are
8007   ** always nearly full, while the right-most sibling might be nearly empty.
8008   ** The next block of code attempts to adjust the packing of siblings to
8009   ** get a better balance.
8010   **
8011   ** This adjustment is more than an optimization.  The packing above might
8012   ** be so out of balance as to be illegal.  For example, the right-most
8013   ** sibling might be completely empty.  This adjustment is not optional.
8014   */
8015   for(i=k-1; i>0; i--){
8016     int szRight = szNew[i];  /* Size of sibling on the right */
8017     int szLeft = szNew[i-1]; /* Size of sibling on the left */
8018     int r;              /* Index of right-most cell in left sibling */
8019     int d;              /* Index of first cell to the left of right sibling */
8020 
8021     r = cntNew[i-1] - 1;
8022     d = r + 1 - leafData;
8023     (void)cachedCellSize(&b, d);
8024     do{
8025       assert( d<nMaxCells );
8026       assert( r<nMaxCells );
8027       (void)cachedCellSize(&b, r);
8028       if( szRight!=0
8029        && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
8030         break;
8031       }
8032       szRight += b.szCell[d] + 2;
8033       szLeft -= b.szCell[r] + 2;
8034       cntNew[i-1] = r;
8035       r--;
8036       d--;
8037     }while( r>=0 );
8038     szNew[i] = szRight;
8039     szNew[i-1] = szLeft;
8040     if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
8041       rc = SQLITE_CORRUPT_BKPT;
8042       goto balance_cleanup;
8043     }
8044   }
8045 
8046   /* Sanity check:  For a non-corrupt database file one of the following
8047   ** must be true:
8048   **    (1) We found one or more cells (cntNew[0]>0), or
8049   **    (2) pPage is a virtual root page.  A virtual root page is when
8050   **        the real root page is page 1 and we are the only child of
8051   **        that page.
8052   */
8053   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
8054   TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
8055     apOld[0]->pgno, apOld[0]->nCell,
8056     nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
8057     nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
8058   ));
8059 
8060   /*
8061   ** Allocate k new pages.  Reuse old pages where possible.
8062   */
8063   pageFlags = apOld[0]->aData[0];
8064   for(i=0; i<k; i++){
8065     MemPage *pNew;
8066     if( i<nOld ){
8067       pNew = apNew[i] = apOld[i];
8068       apOld[i] = 0;
8069       rc = sqlite3PagerWrite(pNew->pDbPage);
8070       nNew++;
8071       if( sqlite3PagerPageRefcount(pNew->pDbPage)!=1+(i==(iParentIdx-nxDiv))
8072        && rc==SQLITE_OK
8073       ){
8074         rc = SQLITE_CORRUPT_BKPT;
8075       }
8076       if( rc ) goto balance_cleanup;
8077     }else{
8078       assert( i>0 );
8079       rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
8080       if( rc ) goto balance_cleanup;
8081       zeroPage(pNew, pageFlags);
8082       apNew[i] = pNew;
8083       nNew++;
8084       cntOld[i] = b.nCell;
8085 
8086       /* Set the pointer-map entry for the new sibling page. */
8087       if( ISAUTOVACUUM ){
8088         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
8089         if( rc!=SQLITE_OK ){
8090           goto balance_cleanup;
8091         }
8092       }
8093     }
8094   }
8095 
8096   /*
8097   ** Reassign page numbers so that the new pages are in ascending order.
8098   ** This helps to keep entries in the disk file in order so that a scan
8099   ** of the table is closer to a linear scan through the file. That in turn
8100   ** helps the operating system to deliver pages from the disk more rapidly.
8101   **
8102   ** An O(n^2) insertion sort algorithm is used, but since n is never more
8103   ** than (NB+2) (a small constant), that should not be a problem.
8104   **
8105   ** When NB==3, this one optimization makes the database about 25% faster
8106   ** for large insertions and deletions.
8107   */
8108   for(i=0; i<nNew; i++){
8109     aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
8110     aPgFlags[i] = apNew[i]->pDbPage->flags;
8111     for(j=0; j<i; j++){
8112       if( NEVER(aPgno[j]==aPgno[i]) ){
8113         /* This branch is taken if the set of sibling pages somehow contains
8114         ** duplicate entries. This can happen if the database is corrupt.
8115         ** It would be simpler to detect this as part of the loop below, but
8116         ** we do the detection here in order to avoid populating the pager
8117         ** cache with two separate objects associated with the same
8118         ** page number.  */
8119         assert( CORRUPT_DB );
8120         rc = SQLITE_CORRUPT_BKPT;
8121         goto balance_cleanup;
8122       }
8123     }
8124   }
8125   for(i=0; i<nNew; i++){
8126     int iBest = 0;                /* aPgno[] index of page number to use */
8127     for(j=1; j<nNew; j++){
8128       if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
8129     }
8130     pgno = aPgOrder[iBest];
8131     aPgOrder[iBest] = 0xffffffff;
8132     if( iBest!=i ){
8133       if( iBest>i ){
8134         sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
8135       }
8136       sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
8137       apNew[i]->pgno = pgno;
8138     }
8139   }
8140 
8141   TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
8142          "%d(%d nc=%d) %d(%d nc=%d)\n",
8143     apNew[0]->pgno, szNew[0], cntNew[0],
8144     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
8145     nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
8146     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
8147     nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
8148     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
8149     nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
8150     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
8151     nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
8152   ));
8153 
8154   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
8155   assert( nNew>=1 && nNew<=ArraySize(apNew) );
8156   assert( apNew[nNew-1]!=0 );
8157   put4byte(pRight, apNew[nNew-1]->pgno);
8158 
8159   /* If the sibling pages are not leaves, ensure that the right-child pointer
8160   ** of the right-most new sibling page is set to the value that was
8161   ** originally in the same field of the right-most old sibling page. */
8162   if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
8163     MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
8164     memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
8165   }
8166 
8167   /* Make any required updates to pointer map entries associated with
8168   ** cells stored on sibling pages following the balance operation. Pointer
8169   ** map entries associated with divider cells are set by the insertCell()
8170   ** routine. The associated pointer map entries are:
8171   **
8172   **   a) if the cell contains a reference to an overflow chain, the
8173   **      entry associated with the first page in the overflow chain, and
8174   **
8175   **   b) if the sibling pages are not leaves, the child page associated
8176   **      with the cell.
8177   **
8178   ** If the sibling pages are not leaves, then the pointer map entry
8179   ** associated with the right-child of each sibling may also need to be
8180   ** updated. This happens below, after the sibling pages have been
8181   ** populated, not here.
8182   */
8183   if( ISAUTOVACUUM ){
8184     MemPage *pOld;
8185     MemPage *pNew = pOld = apNew[0];
8186     int cntOldNext = pNew->nCell + pNew->nOverflow;
8187     int iNew = 0;
8188     int iOld = 0;
8189 
8190     for(i=0; i<b.nCell; i++){
8191       u8 *pCell = b.apCell[i];
8192       while( i==cntOldNext ){
8193         iOld++;
8194         assert( iOld<nNew || iOld<nOld );
8195         assert( iOld>=0 && iOld<NB );
8196         pOld = iOld<nNew ? apNew[iOld] : apOld[iOld];
8197         cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
8198       }
8199       if( i==cntNew[iNew] ){
8200         pNew = apNew[++iNew];
8201         if( !leafData ) continue;
8202       }
8203 
8204       /* Cell pCell is destined for new sibling page pNew. Originally, it
8205       ** was either part of sibling page iOld (possibly an overflow cell),
8206       ** or else the divider cell to the left of sibling page iOld. So,
8207       ** if sibling page iOld had the same page number as pNew, and if
8208       ** pCell really was a part of sibling page iOld (not a divider or
8209       ** overflow cell), we can skip updating the pointer map entries.  */
8210       if( iOld>=nNew
8211        || pNew->pgno!=aPgno[iOld]
8212        || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd)
8213       ){
8214         if( !leafCorrection ){
8215           ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
8216         }
8217         if( cachedCellSize(&b,i)>pNew->minLocal ){
8218           ptrmapPutOvflPtr(pNew, pOld, pCell, &rc);
8219         }
8220         if( rc ) goto balance_cleanup;
8221       }
8222     }
8223   }
8224 
8225   /* Insert new divider cells into pParent. */
8226   for(i=0; i<nNew-1; i++){
8227     u8 *pCell;
8228     u8 *pTemp;
8229     int sz;
8230     u8 *pSrcEnd;
8231     MemPage *pNew = apNew[i];
8232     j = cntNew[i];
8233 
8234     assert( j<nMaxCells );
8235     assert( b.apCell[j]!=0 );
8236     pCell = b.apCell[j];
8237     sz = b.szCell[j] + leafCorrection;
8238     pTemp = &aOvflSpace[iOvflSpace];
8239     if( !pNew->leaf ){
8240       memcpy(&pNew->aData[8], pCell, 4);
8241     }else if( leafData ){
8242       /* If the tree is a leaf-data tree, and the siblings are leaves,
8243       ** then there is no divider cell in b.apCell[]. Instead, the divider
8244       ** cell consists of the integer key for the right-most cell of
8245       ** the sibling-page assembled above only.
8246       */
8247       CellInfo info;
8248       j--;
8249       pNew->xParseCell(pNew, b.apCell[j], &info);
8250       pCell = pTemp;
8251       sz = 4 + putVarint(&pCell[4], info.nKey);
8252       pTemp = 0;
8253     }else{
8254       pCell -= 4;
8255       /* Obscure case for non-leaf-data trees: If the cell at pCell was
8256       ** previously stored on a leaf node, and its reported size was 4
8257       ** bytes, then it may actually be smaller than this
8258       ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
8259       ** any cell). But it is important to pass the correct size to
8260       ** insertCell(), so reparse the cell now.
8261       **
8262       ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
8263       ** and WITHOUT ROWID tables with exactly one column which is the
8264       ** primary key.
8265       */
8266       if( b.szCell[j]==4 ){
8267         assert(leafCorrection==4);
8268         sz = pParent->xCellSize(pParent, pCell);
8269       }
8270     }
8271     iOvflSpace += sz;
8272     assert( sz<=pBt->maxLocal+23 );
8273     assert( iOvflSpace <= (int)pBt->pageSize );
8274     for(k=0; b.ixNx[k]<=i && ALWAYS(k<NB*2); k++){}
8275     pSrcEnd = b.apEnd[k];
8276     if( SQLITE_WITHIN(pSrcEnd, pCell, pCell+sz) ){
8277       rc = SQLITE_CORRUPT_BKPT;
8278       goto balance_cleanup;
8279     }
8280     insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
8281     if( rc!=SQLITE_OK ) goto balance_cleanup;
8282     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
8283   }
8284 
8285   /* Now update the actual sibling pages. The order in which they are updated
8286   ** is important, as this code needs to avoid disrupting any page from which
8287   ** cells may still need to be read. In practice, this means:
8288   **
8289   **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
8290   **      then it is not safe to update page apNew[iPg] until after
8291   **      the left-hand sibling apNew[iPg-1] has been updated.
8292   **
8293   **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
8294   **      then it is not safe to update page apNew[iPg] until after
8295   **      the right-hand sibling apNew[iPg+1] has been updated.
8296   **
8297   ** If neither of the above apply, the page is safe to update.
8298   **
8299   ** The iPg value in the following loop starts at nNew-1 goes down
8300   ** to 0, then back up to nNew-1 again, thus making two passes over
8301   ** the pages.  On the initial downward pass, only condition (1) above
8302   ** needs to be tested because (2) will always be true from the previous
8303   ** step.  On the upward pass, both conditions are always true, so the
8304   ** upwards pass simply processes pages that were missed on the downward
8305   ** pass.
8306   */
8307   for(i=1-nNew; i<nNew; i++){
8308     int iPg = i<0 ? -i : i;
8309     assert( iPg>=0 && iPg<nNew );
8310     if( abDone[iPg] ) continue;         /* Skip pages already processed */
8311     if( i>=0                            /* On the upwards pass, or... */
8312      || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
8313     ){
8314       int iNew;
8315       int iOld;
8316       int nNewCell;
8317 
8318       /* Verify condition (1):  If cells are moving left, update iPg
8319       ** only after iPg-1 has already been updated. */
8320       assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
8321 
8322       /* Verify condition (2):  If cells are moving right, update iPg
8323       ** only after iPg+1 has already been updated. */
8324       assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
8325 
8326       if( iPg==0 ){
8327         iNew = iOld = 0;
8328         nNewCell = cntNew[0];
8329       }else{
8330         iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
8331         iNew = cntNew[iPg-1] + !leafData;
8332         nNewCell = cntNew[iPg] - iNew;
8333       }
8334 
8335       rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
8336       if( rc ) goto balance_cleanup;
8337       abDone[iPg]++;
8338       apNew[iPg]->nFree = usableSpace-szNew[iPg];
8339       assert( apNew[iPg]->nOverflow==0 );
8340       assert( apNew[iPg]->nCell==nNewCell );
8341     }
8342   }
8343 
8344   /* All pages have been processed exactly once */
8345   assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
8346 
8347   assert( nOld>0 );
8348   assert( nNew>0 );
8349 
8350   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
8351     /* The root page of the b-tree now contains no cells. The only sibling
8352     ** page is the right-child of the parent. Copy the contents of the
8353     ** child page into the parent, decreasing the overall height of the
8354     ** b-tree structure by one. This is described as the "balance-shallower"
8355     ** sub-algorithm in some documentation.
8356     **
8357     ** If this is an auto-vacuum database, the call to copyNodeContent()
8358     ** sets all pointer-map entries corresponding to database image pages
8359     ** for which the pointer is stored within the content being copied.
8360     **
8361     ** It is critical that the child page be defragmented before being
8362     ** copied into the parent, because if the parent is page 1 then it will
8363     ** be smaller than the child due to the database header, and so all the
8364     ** free space needs to be up front.
8365     */
8366     assert( nNew==1 || CORRUPT_DB );
8367     rc = defragmentPage(apNew[0], -1);
8368     testcase( rc!=SQLITE_OK );
8369     assert( apNew[0]->nFree ==
8370         (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset
8371           - apNew[0]->nCell*2)
8372       || rc!=SQLITE_OK
8373     );
8374     copyNodeContent(apNew[0], pParent, &rc);
8375     freePage(apNew[0], &rc);
8376   }else if( ISAUTOVACUUM && !leafCorrection ){
8377     /* Fix the pointer map entries associated with the right-child of each
8378     ** sibling page. All other pointer map entries have already been taken
8379     ** care of.  */
8380     for(i=0; i<nNew; i++){
8381       u32 key = get4byte(&apNew[i]->aData[8]);
8382       ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
8383     }
8384   }
8385 
8386   assert( pParent->isInit );
8387   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
8388           nOld, nNew, b.nCell));
8389 
8390   /* Free any old pages that were not reused as new pages.
8391   */
8392   for(i=nNew; i<nOld; i++){
8393     freePage(apOld[i], &rc);
8394   }
8395 
8396 #if 0
8397   if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
8398     /* The ptrmapCheckPages() contains assert() statements that verify that
8399     ** all pointer map pages are set correctly. This is helpful while
8400     ** debugging. This is usually disabled because a corrupt database may
8401     ** cause an assert() statement to fail.  */
8402     ptrmapCheckPages(apNew, nNew);
8403     ptrmapCheckPages(&pParent, 1);
8404   }
8405 #endif
8406 
8407   /*
8408   ** Cleanup before returning.
8409   */
8410 balance_cleanup:
8411   sqlite3StackFree(0, b.apCell);
8412   for(i=0; i<nOld; i++){
8413     releasePage(apOld[i]);
8414   }
8415   for(i=0; i<nNew; i++){
8416     releasePage(apNew[i]);
8417   }
8418 
8419   return rc;
8420 }
8421 
8422 
8423 /*
8424 ** This function is called when the root page of a b-tree structure is
8425 ** overfull (has one or more overflow pages).
8426 **
8427 ** A new child page is allocated and the contents of the current root
8428 ** page, including overflow cells, are copied into the child. The root
8429 ** page is then overwritten to make it an empty page with the right-child
8430 ** pointer pointing to the new page.
8431 **
8432 ** Before returning, all pointer-map entries corresponding to pages
8433 ** that the new child-page now contains pointers to are updated. The
8434 ** entry corresponding to the new right-child pointer of the root
8435 ** page is also updated.
8436 **
8437 ** If successful, *ppChild is set to contain a reference to the child
8438 ** page and SQLITE_OK is returned. In this case the caller is required
8439 ** to call releasePage() on *ppChild exactly once. If an error occurs,
8440 ** an error code is returned and *ppChild is set to 0.
8441 */
8442 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
8443   int rc;                        /* Return value from subprocedures */
8444   MemPage *pChild = 0;           /* Pointer to a new child page */
8445   Pgno pgnoChild = 0;            /* Page number of the new child page */
8446   BtShared *pBt = pRoot->pBt;    /* The BTree */
8447 
8448   assert( pRoot->nOverflow>0 );
8449   assert( sqlite3_mutex_held(pBt->mutex) );
8450 
8451   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
8452   ** page that will become the new right-child of pPage. Copy the contents
8453   ** of the node stored on pRoot into the new child page.
8454   */
8455   rc = sqlite3PagerWrite(pRoot->pDbPage);
8456   if( rc==SQLITE_OK ){
8457     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
8458     copyNodeContent(pRoot, pChild, &rc);
8459     if( ISAUTOVACUUM ){
8460       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
8461     }
8462   }
8463   if( rc ){
8464     *ppChild = 0;
8465     releasePage(pChild);
8466     return rc;
8467   }
8468   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
8469   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
8470   assert( pChild->nCell==pRoot->nCell || CORRUPT_DB );
8471 
8472   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
8473 
8474   /* Copy the overflow cells from pRoot to pChild */
8475   memcpy(pChild->aiOvfl, pRoot->aiOvfl,
8476          pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
8477   memcpy(pChild->apOvfl, pRoot->apOvfl,
8478          pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
8479   pChild->nOverflow = pRoot->nOverflow;
8480 
8481   /* Zero the contents of pRoot. Then install pChild as the right-child. */
8482   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
8483   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
8484 
8485   *ppChild = pChild;
8486   return SQLITE_OK;
8487 }
8488 
8489 /*
8490 ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid
8491 ** on the same B-tree as pCur.
8492 **
8493 ** This can occur if a database is corrupt with two or more SQL tables
8494 ** pointing to the same b-tree.  If an insert occurs on one SQL table
8495 ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL
8496 ** table linked to the same b-tree.  If the secondary insert causes a
8497 ** rebalance, that can change content out from under the cursor on the
8498 ** first SQL table, violating invariants on the first insert.
8499 */
8500 static int anotherValidCursor(BtCursor *pCur){
8501   BtCursor *pOther;
8502   for(pOther=pCur->pBt->pCursor; pOther; pOther=pOther->pNext){
8503     if( pOther!=pCur
8504      && pOther->eState==CURSOR_VALID
8505      && pOther->pPage==pCur->pPage
8506     ){
8507       return SQLITE_CORRUPT_BKPT;
8508     }
8509   }
8510   return SQLITE_OK;
8511 }
8512 
8513 /*
8514 ** The page that pCur currently points to has just been modified in
8515 ** some way. This function figures out if this modification means the
8516 ** tree needs to be balanced, and if so calls the appropriate balancing
8517 ** routine. Balancing routines are:
8518 **
8519 **   balance_quick()
8520 **   balance_deeper()
8521 **   balance_nonroot()
8522 */
8523 static int balance(BtCursor *pCur){
8524   int rc = SQLITE_OK;
8525   const int nMin = pCur->pBt->usableSize * 2 / 3;
8526   u8 aBalanceQuickSpace[13];
8527   u8 *pFree = 0;
8528 
8529   VVA_ONLY( int balance_quick_called = 0 );
8530   VVA_ONLY( int balance_deeper_called = 0 );
8531 
8532   do {
8533     int iPage;
8534     MemPage *pPage = pCur->pPage;
8535 
8536     if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break;
8537     if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
8538       break;
8539     }else if( (iPage = pCur->iPage)==0 ){
8540       if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){
8541         /* The root page of the b-tree is overfull. In this case call the
8542         ** balance_deeper() function to create a new child for the root-page
8543         ** and copy the current contents of the root-page to it. The
8544         ** next iteration of the do-loop will balance the child page.
8545         */
8546         assert( balance_deeper_called==0 );
8547         VVA_ONLY( balance_deeper_called++ );
8548         rc = balance_deeper(pPage, &pCur->apPage[1]);
8549         if( rc==SQLITE_OK ){
8550           pCur->iPage = 1;
8551           pCur->ix = 0;
8552           pCur->aiIdx[0] = 0;
8553           pCur->apPage[0] = pPage;
8554           pCur->pPage = pCur->apPage[1];
8555           assert( pCur->pPage->nOverflow );
8556         }
8557       }else{
8558         break;
8559       }
8560     }else{
8561       MemPage * const pParent = pCur->apPage[iPage-1];
8562       int const iIdx = pCur->aiIdx[iPage-1];
8563 
8564       rc = sqlite3PagerWrite(pParent->pDbPage);
8565       if( rc==SQLITE_OK && pParent->nFree<0 ){
8566         rc = btreeComputeFreeSpace(pParent);
8567       }
8568       if( rc==SQLITE_OK ){
8569 #ifndef SQLITE_OMIT_QUICKBALANCE
8570         if( pPage->intKeyLeaf
8571          && pPage->nOverflow==1
8572          && pPage->aiOvfl[0]==pPage->nCell
8573          && pParent->pgno!=1
8574          && pParent->nCell==iIdx
8575         ){
8576           /* Call balance_quick() to create a new sibling of pPage on which
8577           ** to store the overflow cell. balance_quick() inserts a new cell
8578           ** into pParent, which may cause pParent overflow. If this
8579           ** happens, the next iteration of the do-loop will balance pParent
8580           ** use either balance_nonroot() or balance_deeper(). Until this
8581           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
8582           ** buffer.
8583           **
8584           ** The purpose of the following assert() is to check that only a
8585           ** single call to balance_quick() is made for each call to this
8586           ** function. If this were not verified, a subtle bug involving reuse
8587           ** of the aBalanceQuickSpace[] might sneak in.
8588           */
8589           assert( balance_quick_called==0 );
8590           VVA_ONLY( balance_quick_called++ );
8591           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
8592         }else
8593 #endif
8594         {
8595           /* In this case, call balance_nonroot() to redistribute cells
8596           ** between pPage and up to 2 of its sibling pages. This involves
8597           ** modifying the contents of pParent, which may cause pParent to
8598           ** become overfull or underfull. The next iteration of the do-loop
8599           ** will balance the parent page to correct this.
8600           **
8601           ** If the parent page becomes overfull, the overflow cell or cells
8602           ** are stored in the pSpace buffer allocated immediately below.
8603           ** A subsequent iteration of the do-loop will deal with this by
8604           ** calling balance_nonroot() (balance_deeper() may be called first,
8605           ** but it doesn't deal with overflow cells - just moves them to a
8606           ** different page). Once this subsequent call to balance_nonroot()
8607           ** has completed, it is safe to release the pSpace buffer used by
8608           ** the previous call, as the overflow cell data will have been
8609           ** copied either into the body of a database page or into the new
8610           ** pSpace buffer passed to the latter call to balance_nonroot().
8611           */
8612           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
8613           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
8614                                pCur->hints&BTREE_BULKLOAD);
8615           if( pFree ){
8616             /* If pFree is not NULL, it points to the pSpace buffer used
8617             ** by a previous call to balance_nonroot(). Its contents are
8618             ** now stored either on real database pages or within the
8619             ** new pSpace buffer, so it may be safely freed here. */
8620             sqlite3PageFree(pFree);
8621           }
8622 
8623           /* The pSpace buffer will be freed after the next call to
8624           ** balance_nonroot(), or just before this function returns, whichever
8625           ** comes first. */
8626           pFree = pSpace;
8627         }
8628       }
8629 
8630       pPage->nOverflow = 0;
8631 
8632       /* The next iteration of the do-loop balances the parent page. */
8633       releasePage(pPage);
8634       pCur->iPage--;
8635       assert( pCur->iPage>=0 );
8636       pCur->pPage = pCur->apPage[pCur->iPage];
8637     }
8638   }while( rc==SQLITE_OK );
8639 
8640   if( pFree ){
8641     sqlite3PageFree(pFree);
8642   }
8643   return rc;
8644 }
8645 
8646 /* Overwrite content from pX into pDest.  Only do the write if the
8647 ** content is different from what is already there.
8648 */
8649 static int btreeOverwriteContent(
8650   MemPage *pPage,           /* MemPage on which writing will occur */
8651   u8 *pDest,                /* Pointer to the place to start writing */
8652   const BtreePayload *pX,   /* Source of data to write */
8653   int iOffset,              /* Offset of first byte to write */
8654   int iAmt                  /* Number of bytes to be written */
8655 ){
8656   int nData = pX->nData - iOffset;
8657   if( nData<=0 ){
8658     /* Overwritting with zeros */
8659     int i;
8660     for(i=0; i<iAmt && pDest[i]==0; i++){}
8661     if( i<iAmt ){
8662       int rc = sqlite3PagerWrite(pPage->pDbPage);
8663       if( rc ) return rc;
8664       memset(pDest + i, 0, iAmt - i);
8665     }
8666   }else{
8667     if( nData<iAmt ){
8668       /* Mixed read data and zeros at the end.  Make a recursive call
8669       ** to write the zeros then fall through to write the real data */
8670       int rc = btreeOverwriteContent(pPage, pDest+nData, pX, iOffset+nData,
8671                                  iAmt-nData);
8672       if( rc ) return rc;
8673       iAmt = nData;
8674     }
8675     if( memcmp(pDest, ((u8*)pX->pData) + iOffset, iAmt)!=0 ){
8676       int rc = sqlite3PagerWrite(pPage->pDbPage);
8677       if( rc ) return rc;
8678       /* In a corrupt database, it is possible for the source and destination
8679       ** buffers to overlap.  This is harmless since the database is already
8680       ** corrupt but it does cause valgrind and ASAN warnings.  So use
8681       ** memmove(). */
8682       memmove(pDest, ((u8*)pX->pData) + iOffset, iAmt);
8683     }
8684   }
8685   return SQLITE_OK;
8686 }
8687 
8688 /*
8689 ** Overwrite the cell that cursor pCur is pointing to with fresh content
8690 ** contained in pX.
8691 */
8692 static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){
8693   int iOffset;                        /* Next byte of pX->pData to write */
8694   int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */
8695   int rc;                             /* Return code */
8696   MemPage *pPage = pCur->pPage;       /* Page being written */
8697   BtShared *pBt;                      /* Btree */
8698   Pgno ovflPgno;                      /* Next overflow page to write */
8699   u32 ovflPageSize;                   /* Size to write on overflow page */
8700 
8701   if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd
8702    || pCur->info.pPayload < pPage->aData + pPage->cellOffset
8703   ){
8704     return SQLITE_CORRUPT_BKPT;
8705   }
8706   /* Overwrite the local portion first */
8707   rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX,
8708                              0, pCur->info.nLocal);
8709   if( rc ) return rc;
8710   if( pCur->info.nLocal==nTotal ) return SQLITE_OK;
8711 
8712   /* Now overwrite the overflow pages */
8713   iOffset = pCur->info.nLocal;
8714   assert( nTotal>=0 );
8715   assert( iOffset>=0 );
8716   ovflPgno = get4byte(pCur->info.pPayload + iOffset);
8717   pBt = pPage->pBt;
8718   ovflPageSize = pBt->usableSize - 4;
8719   do{
8720     rc = btreeGetPage(pBt, ovflPgno, &pPage, 0);
8721     if( rc ) return rc;
8722     if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 || pPage->isInit ){
8723       rc = SQLITE_CORRUPT_BKPT;
8724     }else{
8725       if( iOffset+ovflPageSize<(u32)nTotal ){
8726         ovflPgno = get4byte(pPage->aData);
8727       }else{
8728         ovflPageSize = nTotal - iOffset;
8729       }
8730       rc = btreeOverwriteContent(pPage, pPage->aData+4, pX,
8731                                  iOffset, ovflPageSize);
8732     }
8733     sqlite3PagerUnref(pPage->pDbPage);
8734     if( rc ) return rc;
8735     iOffset += ovflPageSize;
8736   }while( iOffset<nTotal );
8737   return SQLITE_OK;
8738 }
8739 
8740 
8741 /*
8742 ** Insert a new record into the BTree.  The content of the new record
8743 ** is described by the pX object.  The pCur cursor is used only to
8744 ** define what table the record should be inserted into, and is left
8745 ** pointing at a random location.
8746 **
8747 ** For a table btree (used for rowid tables), only the pX.nKey value of
8748 ** the key is used. The pX.pKey value must be NULL.  The pX.nKey is the
8749 ** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
8750 ** hold the content of the row.
8751 **
8752 ** For an index btree (used for indexes and WITHOUT ROWID tables), the
8753 ** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The
8754 ** pX.pData,nData,nZero fields must be zero.
8755 **
8756 ** If the seekResult parameter is non-zero, then a successful call to
8757 ** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
8758 ** been performed.  In other words, if seekResult!=0 then the cursor
8759 ** is currently pointing to a cell that will be adjacent to the cell
8760 ** to be inserted.  If seekResult<0 then pCur points to a cell that is
8761 ** smaller than (pKey,nKey).  If seekResult>0 then pCur points to a cell
8762 ** that is larger than (pKey,nKey).
8763 **
8764 ** If seekResult==0, that means pCur is pointing at some unknown location.
8765 ** In that case, this routine must seek the cursor to the correct insertion
8766 ** point for (pKey,nKey) before doing the insertion.  For index btrees,
8767 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
8768 ** key values and pX->aMem can be used instead of pX->pKey to avoid having
8769 ** to decode the key.
     **
     ** The flags parameter is a mask of BTREE_SAVEPOSITION, BTREE_APPEND and
     ** BTREE_PREFORMAT:
     **
     **   BTREE_APPEND        Forwarded to the seek routines when this
     **                       function has to position the cursor itself.
     **   BTREE_PREFORMAT     The cell image already sitting in
     **                       BtShared.pTmpSpace is inserted as-is; its size
     **                       is taken from BtShared.nPreformatSize.
     **   BTREE_SAVEPOSITION  The cursor must already point at an entry with
     **                       the same key.  If the insert causes a rebalance,
     **                       the key is saved and the cursor is left in the
     **                       CURSOR_REQUIRESEEK state.
     **
     ** Return SQLITE_OK on success, or an SQLite error code (for example
     ** SQLITE_CORRUPT or SQLITE_NOMEM) on failure.
8770 */
8771 int sqlite3BtreeInsert(
8772   BtCursor *pCur,                /* Insert data into the table of this cursor */
8773   const BtreePayload *pX,        /* Content of the row to be inserted */
8774   int flags,                     /* Mask of BTREE_SAVEPOSITION/APPEND/PREFORMAT */
8775   int seekResult                 /* Result of prior MovetoUnpacked() call */
8776 ){
8777   int rc;
8778   int loc = seekResult;          /* -1: before desired location  +1: after */
8779   int szNew = 0;
8780   int idx;
8781   MemPage *pPage;
8782   Btree *p = pCur->pBtree;
8783   BtShared *pBt = p->pBt;
8784   unsigned char *oldCell;
8785   unsigned char *newCell = 0;
8786
8787   assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND|BTREE_PREFORMAT))==flags );
8788   assert( (flags & BTREE_PREFORMAT)==0 || seekResult || pCur->pKeyInfo==0 );
8789
     /* A cursor in the fault state carries its error code in skipNext.
     ** Report that error without touching the tree. */
8790   if( pCur->eState==CURSOR_FAULT ){
8791     assert( pCur->skipNext!=SQLITE_OK );
8792     return pCur->skipNext;
8793   }
8794
8795   assert( cursorOwnsBtShared(pCur) );
8796   assert( (pCur->curFlags & BTCF_WriteFlag)!=0
8797               && pBt->inTransaction==TRANS_WRITE
8798               && (pBt->btsFlags & BTS_READ_ONLY)==0 );
8799   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
8800
8801   /* Assert that the caller has been consistent. If this cursor was opened
8802   ** expecting an index b-tree, then the caller should be inserting blob
8803   ** keys with no associated data. If the cursor was opened expecting an
8804   ** intkey table, the caller should be inserting integer keys with a
8805   ** blob of associated data.  */
8806   assert( (flags & BTREE_PREFORMAT) || (pX->pKey==0)==(pCur->pKeyInfo==0) );
8807
8808   /* Save the positions of any other cursors open on this table.
8809   **
8810   ** In some cases, the call to btreeMoveto() below is a no-op. For
8811   ** example, when inserting data into a table with auto-generated integer
8812   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
8813   ** integer key to use. It then calls this function to actually insert the
8814   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
8815   ** that the cursor is already where it needs to be and returns without
8816   ** doing any work. To avoid thwarting these optimizations, it is important
8817   ** not to clear the cursor here.
8818   */
8819   if( pCur->curFlags & BTCF_Multiple ){
8820     rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
8821     if( rc ) return rc;
8822     if( loc && pCur->iPage<0 ){
8823       /* This can only happen if the schema is corrupt such that there is more
8824       ** than one table or index with the same root page as used by the cursor.
8825       ** Which can only happen if the SQLITE_NoSchemaError flag was set when
8826       ** the schema was loaded. This cannot be asserted though, as a user might
8827       ** set the flag, load the schema, and then unset the flag.  */
8828       return SQLITE_CORRUPT_BKPT;
8829     }
8830   }
8831
8832   if( pCur->pKeyInfo==0 ){
8833     assert( pX->pKey==0 );
8834     /* If this is an insert into a table b-tree, invalidate any incrblob
8835     ** cursors open on the row being replaced */
8836     if( p->hasIncrblobCur ){
8837       invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0);
8838     }
8839
8840     /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
8841     ** to a row with the same key as the new entry being inserted.
8842     */
8843 #ifdef SQLITE_DEBUG
8844     if( flags & BTREE_SAVEPOSITION ){
8845       assert( pCur->curFlags & BTCF_ValidNKey );
8846       assert( pX->nKey==pCur->info.nKey );
8847       assert( loc==0 );
8848     }
8849 #endif
8850
8851     /* On the other hand, BTREE_SAVEPOSITION==0 does not imply
8852     ** that the cursor is not pointing to a row to be overwritten.
8853     ** So do a complete check.
8854     */
8855     if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
8856       /* The cursor is pointing to the entry that is to be
8857       ** overwritten */
8858       assert( pX->nData>=0 && pX->nZero>=0 );
8859       if( pCur->info.nSize!=0
8860        && pCur->info.nPayload==(u32)pX->nData+pX->nZero
8861       ){
8862         /* New entry is the same size as the old.  Do an overwrite */
8863         return btreeOverwriteCell(pCur, pX);
8864       }
8865       assert( loc==0 );
8866     }else if( loc==0 ){
8867       /* The cursor is *not* pointing to the cell to be overwritten, nor
8868       ** to an adjacent cell.  Move the cursor so that it is pointing either
8869       ** to the cell to be overwritten or an adjacent cell.
8870       */
8871       rc = sqlite3BtreeTableMoveto(pCur, pX->nKey,
8872                (flags & BTREE_APPEND)!=0, &loc);
8873       if( rc ) return rc;
8874     }
8875   }else{
8876     /* This is an index or a WITHOUT ROWID table */
8877
8878     /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
8879     ** to a row with the same key as the new entry being inserted.
8880     */
8881     assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 );
8882
8883     /* If the cursor is not already pointing either to the cell to be
8884     ** overwritten, or if a new cell is being inserted, if the cursor is
8885     ** not pointing to an immediately adjacent cell, then move the cursor
8886     ** so that it does.
8887     */
8888     if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
8889       if( pX->nMem ){
           /* The caller supplied the key already unpacked in pX->aMem, so
           ** build an UnpackedRecord around it instead of decoding pX->pKey. */
8890         UnpackedRecord r;
8891         r.pKeyInfo = pCur->pKeyInfo;
8892         r.aMem = pX->aMem;
8893         r.nField = pX->nMem;
8894         r.default_rc = 0;
8895         r.eqSeen = 0;
8896         rc = sqlite3BtreeIndexMoveto(pCur, &r, &loc);
8897       }else{
8898         rc = btreeMoveto(pCur, pX->pKey, pX->nKey,
8899                     (flags & BTREE_APPEND)!=0, &loc);
8900       }
8901       if( rc ) return rc;
8902     }
8903
8904     /* If the cursor is currently pointing to an entry to be overwritten
8905     ** and the new content is the same size as the old, then use the
8906     ** overwrite optimization.
8907     */
8908     if( loc==0 ){
8909       getCellInfo(pCur);
8910       if( pCur->info.nKey==pX->nKey ){
           /* btreeOverwriteCell() is handed the new content through the
           ** pData/nData fields, so present the index key that way. */
8911         BtreePayload x2;
8912         x2.pData = pX->pKey;
8913         x2.nData = pX->nKey;
8914         x2.nZero = 0;
8915         return btreeOverwriteCell(pCur, &x2);
8916       }
8917     }
8918   }
8919   assert( pCur->eState==CURSOR_VALID
8920        || (pCur->eState==CURSOR_INVALID && loc)
8921        || CORRUPT_DB );
8922
8923   pPage = pCur->pPage;
8924   assert( pPage->intKey || pX->nKey>=0 || (flags & BTREE_PREFORMAT) );
8925   assert( pPage->leaf || !pPage->intKey );
     /* A negative nFree is resolved here by calling btreeComputeFreeSpace()
     ** (presumably nFree<0 means "not yet computed" -- confirm in btreeInt.h). */
8926   if( pPage->nFree<0 ){
8927     if( NEVER(pCur->eState>CURSOR_INVALID) ){
8928       rc = SQLITE_CORRUPT_BKPT;
8929     }else{
8930       rc = btreeComputeFreeSpace(pPage);
8931     }
8932     if( rc ) return rc;
8933   }
8934
8935   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
8936           pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
8937           loc==0 ? "overwrite" : "new entry"));
8938   assert( pPage->isInit );
     /* The new cell is assembled in (or, for BTREE_PREFORMAT, already
     ** resides in) the BtShared scratch buffer. */
8939   newCell = pBt->pTmpSpace;
8940   assert( newCell!=0 );
8941   if( flags & BTREE_PREFORMAT ){
8942     rc = SQLITE_OK;
8943     szNew = pBt->nPreformatSize;
8944     if( szNew<4 ) szNew = 4;
8945     if( ISAUTOVACUUM && szNew>pPage->maxLocal ){
         /* On an auto-vacuum database, if the preformatted cell has overflow
         ** content, record in the pointer map that its first overflow page
         ** (the page number held in the last 4 bytes of the cell) belongs
         ** to this page. */
8946       CellInfo info;
8947       pPage->xParseCell(pPage, newCell, &info);
8948       if( info.nPayload!=info.nLocal ){
8949         Pgno ovfl = get4byte(&newCell[szNew-4]);
8950         ptrmapPut(pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, &rc);
8951       }
8952     }
8953   }else{
8954     rc = fillInCell(pPage, newCell, pX, &szNew);
8955   }
8956   if( rc ) goto end_insert;
8957   assert( szNew==pPage->xCellSize(pPage, newCell) );
8958   assert( szNew <= MX_CELL_SIZE(pBt) );
8959   idx = pCur->ix;
8960   if( loc==0 ){
       /* The cursor points at an existing entry with the same key: replace
       ** that cell. */
8961     CellInfo info;
8962     assert( idx>=0 );
8963     if( idx>=pPage->nCell ){
8964       return SQLITE_CORRUPT_BKPT;
8965     }
8966     rc = sqlite3PagerWrite(pPage->pDbPage);
8967     if( rc ){
8968       goto end_insert;
8969     }
8970     oldCell = findCell(pPage, idx);
8971     if( !pPage->leaf ){
         /* On a non-leaf page, carry the first 4 bytes of the old cell over
         ** to the replacement (presumably the child-page pointer that
         ** starts every interior cell -- confirm against the file format). */
8972       memcpy(newCell, oldCell, 4);
8973     }
8974     BTREE_CLEAR_CELL(rc, pPage, oldCell, info);
8975     testcase( pCur->curFlags & BTCF_ValidOvfl );
8976     invalidateOverflowCache(pCur);
8977     if( info.nSize==szNew && info.nLocal==info.nPayload
8978      && (!ISAUTOVACUUM || szNew<pPage->minLocal)
8979     ){
8980       /* Overwrite the old cell with the new if they are the same size.
8981       ** We could also try to do this if the old cell is smaller, then add
8982       ** the leftover space to the free list.  But experiments show that
8983       ** doing that is no faster than skipping this optimization and just
8984       ** calling dropCell() and insertCell().
8985       **
8986       ** This optimization cannot be used on an autovacuum database if the
8987       ** new entry uses overflow pages, as the insertCell() call below is
8988       ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry.  */
8989       assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
         /* Refuse to copy if the old cell does not lie between the end of
         ** the first 10 bytes of the page header and the end of the page
         ** buffer. */
8990       if( oldCell < pPage->aData+pPage->hdrOffset+10 ){
8991         return SQLITE_CORRUPT_BKPT;
8992       }
8993       if( oldCell+szNew > pPage->aDataEnd ){
8994         return SQLITE_CORRUPT_BKPT;
8995       }
8996       memcpy(oldCell, newCell, szNew);
8997       return SQLITE_OK;
8998     }
8999     dropCell(pPage, idx, info.nSize, &rc);
9000     if( rc ) goto end_insert;
9001   }else if( loc<0 && pPage->nCell>0 ){
       /* The cursor is on the entry just before the insertion point, so the
       ** new cell goes into the slot after it. */
9002     assert( pPage->leaf );
9003     idx = ++pCur->ix;
9004     pCur->curFlags &= ~BTCF_ValidNKey;
9005   }else{
9006     assert( pPage->leaf );
9007   }
9008   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
9009   assert( pPage->nOverflow==0 || rc==SQLITE_OK );
9010   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
9011
9012   /* If no error has occurred and pPage has an overflow cell, call balance()
9013   ** to redistribute the cells within the tree. Since balance() may move
9014   ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
9015   ** variables.
9016   **
9017   ** Previous versions of SQLite called moveToRoot() to move the cursor
9018   ** back to the root page as balance() used to invalidate the contents
9019   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
9020   ** set the cursor state to "invalid". This makes common insert operations
9021   ** slightly faster.
9022   **
9023   ** There is a subtle but important optimization here too. When inserting
9024   ** multiple records into an intkey b-tree using a single cursor (as can
9025   ** happen while processing an "INSERT INTO ... SELECT" statement), it
9026   ** is advantageous to leave the cursor pointing to the last entry in
9027   ** the b-tree if possible. If the cursor is left pointing to the last
9028   ** entry in the table, and the next row inserted has an integer key
9029   ** larger than the largest existing key, it is possible to insert the
9030   ** row without seeking the cursor. This can be a big performance boost.
9031   */
9032   pCur->info.nSize = 0;
9033   if( pPage->nOverflow ){
9034     assert( rc==SQLITE_OK );
9035     pCur->curFlags &= ~(BTCF_ValidNKey);
9036     rc = balance(pCur);
9037
9038     /* Must make sure nOverflow is reset to zero even if the balance()
9039     ** fails. Internal data structure corruption will result otherwise.
9040     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
9041     ** from trying to save the current position of the cursor.  */
9042     pCur->pPage->nOverflow = 0;
9043     pCur->eState = CURSOR_INVALID;
9044     if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
         /* The caller wants the cursor to stay on the row just written.
         ** Release the cursor's pages and remember the key (a private copy
         ** of the blob for index b-trees, just nKey for table b-trees); the
         ** cursor is marked CURSOR_REQUIRESEEK. */
9045       btreeReleaseAllCursorPages(pCur);
9046       if( pCur->pKeyInfo ){
9047         assert( pCur->pKey==0 );
9048         pCur->pKey = sqlite3Malloc( pX->nKey );
9049         if( pCur->pKey==0 ){
9050           rc = SQLITE_NOMEM;
9051         }else{
9052           memcpy(pCur->pKey, pX->pKey, pX->nKey);
9053         }
9054       }
9055       pCur->eState = CURSOR_REQUIRESEEK;
9056       pCur->nKey = pX->nKey;
9057     }
9058   }
9059   assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 );
9060
9061 end_insert:
9062   return rc;
9063 }
9064 
9065 /*
9066 ** This function is used as part of copying the current row from cursor
9067 ** pSrc into cursor pDest. If the cursors are open on intkey tables, then
9068 ** parameter iKey is used as the rowid value when the record is copied
9069 ** into pDest. Otherwise, the record is copied verbatim.
9070 **
9071 ** This function does not actually write the new value to cursor pDest.
9072 ** Instead, it creates and populates any required overflow pages and
9073 ** writes the data for the new cell into the BtShared.pTmpSpace buffer
9074 ** for the destination database. The size of the cell, in bytes, is left
9075 ** in BtShared.nPreformatSize. The caller completes the insertion by
9076 ** calling sqlite3BtreeInsert() with the BTREE_PREFORMAT flag specified.
9077 **
9078 ** SQLITE_OK is returned if successful, or an SQLite error code otherwise.
9079 */
9080 int sqlite3BtreeTransferRow(BtCursor *pDest, BtCursor *pSrc, i64 iKey){
9081   int rc = SQLITE_OK;
9082   BtShared *pBt = pDest->pBt;
9083   u8 *aOut = pBt->pTmpSpace;    /* Pointer to next output buffer */
9084   const u8 *aIn;                /* Pointer to next input buffer */
9085   u32 nIn;                      /* Size of input buffer aIn[] */
9086   u32 nRem;                     /* Bytes of data still to copy */
9087
     /* Begin the new cell in pTmpSpace with its header: the payload size
     ** as a varint, followed by the rowid iKey when pDest has no KeyInfo
     ** (i.e. it is a table b-tree). */
9088   getCellInfo(pSrc);
9089   aOut += putVarint32(aOut, pSrc->info.nPayload);
9090   if( pDest->pKeyInfo==0 ) aOut += putVarint(aOut, iKey);
9091   nIn = pSrc->info.nLocal;
9092   aIn = pSrc->info.pPayload;
     /* The locally stored part of the source payload must end inside the
     ** source page buffer. */
9093   if( aIn+nIn>pSrc->pPage->aDataEnd ){
9094     return SQLITE_CORRUPT_BKPT;
9095   }
9096   nRem = pSrc->info.nPayload;
9097   if( nIn==nRem && nIn<pDest->pPage->maxLocal ){
       /* Fast path: the whole payload is local in the source cell and is
       ** below the destination page's maxLocal, so it is copied in one go
       ** and no overflow pages are involved. */
9098     memcpy(aOut, aIn, nIn);
9099     pBt->nPreformatSize = nIn + (aOut - pBt->pTmpSpace);
9100   }else{
9101     Pager *pSrcPager = pSrc->pBt->pPager;
9102     u8 *pPgnoOut = 0;
9103     Pgno ovflIn = 0;
9104     DbPage *pPageIn = 0;
9105     MemPage *pPageOut = 0;
9106     u32 nOut;                     /* Size of output buffer aOut[] */
9107
       /* nOut is the number of payload bytes that go into the destination
       ** cell itself, as returned by btreePayloadToLocal().  If that is less
       ** than the full payload, 4 more bytes directly after the local part
       ** are reserved for the first overflow page number (pPgnoOut). */
9108     nOut = btreePayloadToLocal(pDest->pPage, pSrc->info.nPayload);
9109     pBt->nPreformatSize = nOut + (aOut - pBt->pTmpSpace);
9110     if( nOut<pSrc->info.nPayload ){
9111       pPgnoOut = &aOut[nOut];
9112       pBt->nPreformatSize += 4;
9113     }
9114
       /* If the source cell has overflow content, the 4 bytes following its
       ** local payload hold the first source overflow page number.  Those
       ** bytes must also lie inside the source page. */
9115     if( nRem>nIn ){
9116       if( aIn+nIn+4>pSrc->pPage->aDataEnd ){
9117         return SQLITE_CORRUPT_BKPT;
9118       }
9119       ovflIn = get4byte(&pSrc->info.pPayload[nIn]);
9120     }
9121
       /* Each pass of the outer loop fills one output buffer: first the
       ** local area of the new cell, then the data area of each newly
       ** allocated destination overflow page. */
9122     do {
9123       nRem -= nOut;
         /* Inner loop: drain the current input buffer into the output
         ** buffer.  When the input runs dry before the output is full, step
         ** to the next source overflow page (read-only) and continue. */
9124       do{
9125         assert( nOut>0 );
9126         if( nIn>0 ){
9127           int nCopy = MIN(nOut, nIn);
9128           memcpy(aOut, aIn, nCopy);
9129           nOut -= nCopy;
9130           nIn -= nCopy;
9131           aOut += nCopy;
9132           aIn += nCopy;
9133         }
9134         if( nOut>0 ){
9135           sqlite3PagerUnref(pPageIn);
9136           pPageIn = 0;
9137           rc = sqlite3PagerGet(pSrcPager, ovflIn, &pPageIn, PAGER_GET_READONLY);
9138           if( rc==SQLITE_OK ){
               /* The first 4 bytes of the page give the next page in the
               ** chain; the remaining usableSize-4 bytes are payload. */
9139             aIn = (const u8*)sqlite3PagerGetData(pPageIn);
9140             ovflIn = get4byte(aIn);
9141             aIn += 4;
9142             nIn = pSrc->pBt->usableSize - 4;
9143           }
9144         }
9145       }while( rc==SQLITE_OK && nOut>0 );
9146
9147       if( rc==SQLITE_OK && nRem>0 && ALWAYS(pPgnoOut) ){
           /* More payload remains: allocate the next destination overflow
           ** page and store its number in the link slot of the previous
           ** buffer.  On an auto-vacuum database the pointer map is updated
           ** for every page after the first (pPageOut is NULL for the first
           ** one; sqlite3BtreeInsert() writes that PTRMAP_OVERFLOW1 entry in
           ** its BTREE_PREFORMAT branch). */
9148         Pgno pgnoNew;
9149         MemPage *pNew = 0;
9150         rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
9151         put4byte(pPgnoOut, pgnoNew);
9152         if( ISAUTOVACUUM && pPageOut ){
9153           ptrmapPut(pBt, pgnoNew, PTRMAP_OVERFLOW2, pPageOut->pgno, &rc);
9154         }
9155         releasePage(pPageOut);
9156         pPageOut = pNew;
9157         if( pPageOut ){
             /* Start the new page with a zero "next page" link; it is
             ** overwritten if yet another overflow page is needed. */
9158           pPgnoOut = pPageOut->aData;
9159           put4byte(pPgnoOut, 0);
9160           aOut = &pPgnoOut[4];
9161           nOut = MIN(pBt->usableSize - 4, nRem);
9162         }
9163       }
9164     }while( nRem>0 && rc==SQLITE_OK );
9165
       /* Drop the references still held on the last output and input pages
       ** (either pointer may be NULL at this point). */
9166     releasePage(pPageOut);
9167     sqlite3PagerUnref(pPageIn);
9168   }
9169
9170   return rc;
9171 }
9172 
9173 /*
9174 ** Delete the entry that the cursor is pointing to.
9175 **
9176 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
9177 ** the cursor is left pointing at an arbitrary location after the delete.
9178 ** But if that bit is set, then the cursor is left in a state such that
9179 ** the next call to BtreeNext() or BtreePrev() moves it to the same row
9180 ** as it would have been on if the call to BtreeDelete() had been omitted.
9181 **
9182 ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes
9183 ** associated with a single table entry and its indexes.  Only one of those
9184 ** deletes is considered the "primary" delete.  The primary delete occurs
9185 ** on a cursor that is not a BTREE_FORDELETE cursor.  All but one delete
9186 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag.
9187 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation,
9188 ** but which might be used by alternative storage engines.
     **
     ** Return SQLITE_OK on success or an SQLite error code otherwise.  Nothing
     ** is deleted unless the cursor is (or can be restored to) CURSOR_VALID
     ** and its cell index lies within the page:
     **
     **   *  A CURSOR_REQUIRESEEK cursor is first restored.  If the restore
     **      fails, or does not leave the cursor valid, the result of the
     **      restore is returned.
     **   *  A CURSOR_FAULT cursor returns the error saved in skipNext.
     **   *  Any other non-valid state, or a cell index at or beyond the
     **      page's cell count, yields SQLITE_CORRUPT.
9189 */
9190 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
9191   Btree *p = pCur->pBtree;
9192   BtShared *pBt = p->pBt;
9193   int rc;                              /* Return code */
9194   MemPage *pPage;                      /* Page to delete cell from */
9195   unsigned char *pCell;                /* Pointer to cell to delete */
9196   int iCellIdx;                        /* Index of cell to delete */
9197   int iCellDepth;                      /* Depth of node containing pCell */
9198   CellInfo info;                       /* Size of the cell being deleted */
9199   int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
9200   u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */
9201
9202   assert( cursorOwnsBtShared(pCur) );
9203   assert( pBt->inTransaction==TRANS_WRITE );
9204   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
9205   assert( pCur->curFlags & BTCF_WriteFlag );
9206   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
9207   assert( !hasReadConflicts(p, pCur->pgnoRoot) );
9208   assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );

     /* Everything below dereferences pCur->pPage and indexes it with
     ** pCur->ix, which is only meaningful for a CURSOR_VALID cursor.  So
     ** bring the cursor to that state or bail out.  Previously a cursor that
     ** was neither valid nor REQUIRESEEK was only caught by an assert() and,
     ** in non-debug builds, was used anyway. */
9209   if( pCur->eState==CURSOR_REQUIRESEEK ){
9210     rc = btreeRestoreCursorPosition(pCur);
9211     assert( rc!=SQLITE_OK || CORRUPT_DB || pCur->eState==CURSOR_VALID );
9212     if( rc || pCur->eState!=CURSOR_VALID ) return rc;
       }else if( pCur->eState==CURSOR_FAULT ){
         /* Same treatment as in sqlite3BtreeInsert(): a faulted cursor
         ** reports the error it has saved in skipNext. */
         assert( pCur->skipNext!=SQLITE_OK );
         return pCur->skipNext;
       }else if( pCur->eState!=CURSOR_VALID ){
         return SQLITE_CORRUPT_BKPT;
9213   }
9214   assert( pCur->eState==CURSOR_VALID );
9215
9216   iCellDepth = pCur->iPage;
9217   iCellIdx = pCur->ix;
9218   pPage = pCur->pPage;
       if( iCellIdx>=pPage->nCell ){
         /* The cursor does not index an existing cell on the page.  Looking
         ** the cell up with findCell() would read a cell offset that is not
         ** part of the page's cell index.  sqlite3BtreeInsert() makes the same
         ** check before replacing a cell. */
         return SQLITE_CORRUPT_BKPT;
       }
9219   pCell = findCell(pPage, iCellIdx);
9220   if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ) return SQLITE_CORRUPT;
9221
9222   /* If the bPreserve flag is set to true, then the cursor position must
9223   ** be preserved following this delete operation. If the current delete
9224   ** will cause a b-tree rebalance, then this is done by saving the cursor
9225   ** key and leaving the cursor in CURSOR_REQUIRESEEK state before
9226   ** returning.
9227   **
9228   ** Or, if the current delete will not cause a rebalance, then the cursor
9229   ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
9230   ** before or after the deleted entry. In this case set bSkipnext to true.  */
9231   if( bPreserve ){
9232     if( !pPage->leaf
9233      || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
9234      || pPage->nCell==1  /* See dbfuzz001.test for a test case */
9235     ){
9236       /* A b-tree rebalance will be required after deleting this entry.
9237       ** Save the cursor key.  */
9238       rc = saveCursorKey(pCur);
9239       if( rc ) return rc;
9240     }else{
9241       bSkipnext = 1;
9242     }
9243   }
9244
9245   /* If the page containing the entry to delete is not a leaf page, move
9246   ** the cursor to the largest entry in the tree that is smaller than
9247   ** the entry being deleted. This cell will replace the cell being deleted
9248   ** from the internal node. The 'previous' entry is used for this instead
9249   ** of the 'next' entry, as the previous entry is always a part of the
9250   ** sub-tree headed by the child page of the cell being deleted. This makes
9251   ** balancing the tree following the delete operation easier.  */
9252   if( !pPage->leaf ){
9253     rc = sqlite3BtreePrevious(pCur, 0);
9254     assert( rc!=SQLITE_DONE );
9255     if( rc ) return rc;
9256   }
9257
9258   /* Save the positions of any other cursors open on this table before
9259   ** making any modifications.  */
9260   if( pCur->curFlags & BTCF_Multiple ){
9261     rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
9262     if( rc ) return rc;
9263   }
9264
9265   /* If this is a delete operation to remove a row from a table b-tree,
9266   ** invalidate any incrblob cursors open on the row being deleted.  */
9267   if( pCur->pKeyInfo==0 && p->hasIncrblobCur ){
9268     invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0);
9269   }
9270
9271   /* Make the page containing the entry to be deleted writable. Then free any
9272   ** overflow pages associated with the entry and finally remove the cell
9273   ** itself from within the page.  */
9274   rc = sqlite3PagerWrite(pPage->pDbPage);
9275   if( rc ) return rc;
9276   BTREE_CLEAR_CELL(rc, pPage, pCell, info);
9277   dropCell(pPage, iCellIdx, info.nSize, &rc);
9278   if( rc ) return rc;
9279
9280   /* If the cell deleted was not located on a leaf page, then the cursor
9281   ** is currently pointing to the largest entry in the sub-tree headed
9282   ** by the child-page of the cell that was just deleted from an internal
9283   ** node. The cell from the leaf node needs to be moved to the internal
9284   ** node to replace the deleted cell.  */
9285   if( !pPage->leaf ){
9286     MemPage *pLeaf = pCur->pPage;
9287     int nCell;
9288     Pgno n;
9289     unsigned char *pTmp;
9290
9291     if( pLeaf->nFree<0 ){
9292       rc = btreeComputeFreeSpace(pLeaf);
9293       if( rc ) return rc;
9294     }
         /* n is the page number handed to insertCell() for the replacement
         ** cell: the cursor's page one level below the internal node. */
9295     if( iCellDepth<pCur->iPage-1 ){
9296       n = pCur->apPage[iCellDepth+1]->pgno;
9297     }else{
9298       n = pCur->pPage->pgno;
9299     }
9300     pCell = findCell(pLeaf, pLeaf->nCell-1);
         /* insertCell() below is given pCell-4, so the leaf cell must start
         ** at least 4 bytes into the page buffer. */
9301     if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
9302     nCell = pLeaf->xCellSize(pLeaf, pCell);
9303     assert( MX_CELL_SIZE(pBt) >= nCell );
9304     pTmp = pBt->pTmpSpace;
9305     assert( pTmp!=0 );
9306     rc = sqlite3PagerWrite(pLeaf->pDbPage);
9307     if( rc==SQLITE_OK ){
9308       insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
9309     }
9310     dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
9311     if( rc ) return rc;
9312   }
9313
9314   /* Balance the tree. If the entry deleted was located on a leaf page,
9315   ** then the cursor still points to that page. In this case the first
9316   ** call to balance() repairs the tree, and the if(...) condition is
9317   ** never true.
9318   **
9319   ** Otherwise, if the entry deleted was on an internal node page, then
9320   ** pCur is pointing to the leaf page from which a cell was removed to
9321   ** replace the cell deleted from the internal node. This is slightly
9322   ** tricky as the leaf node may be underfull, and the internal node may
9323   ** be either under or overfull. In this case run the balancing algorithm
9324   ** on the leaf node first. If the balance proceeds far enough up the
9325   ** tree that we can be sure that any problem in the internal node has
9326   ** been corrected, so be it. Otherwise, after balancing the leaf node,
9327   ** walk the cursor up the tree to the internal node and balance it as
9328   ** well.  */
9329   rc = balance(pCur);
9330   if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
9331     releasePageNotNull(pCur->pPage);
9332     pCur->iPage--;
9333     while( pCur->iPage>iCellDepth ){
9334       releasePage(pCur->apPage[pCur->iPage--]);
9335     }
9336     pCur->pPage = pCur->apPage[pCur->iPage];
9337     rc = balance(pCur);
9338   }
9339
9340   if( rc==SQLITE_OK ){
9341     if( bSkipnext ){
         /* No rebalance took place.  Leave the cursor on the neighbouring
         ** entry and record in skipNext on which side of the deleted entry
         ** it sits. */
9342       assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
9343       assert( pPage==pCur->pPage || CORRUPT_DB );
9344       assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
9345       pCur->eState = CURSOR_SKIPNEXT;
9346       if( iCellIdx>=pPage->nCell ){
9347         pCur->skipNext = -1;
9348         pCur->ix = pPage->nCell-1;
9349       }else{
9350         pCur->skipNext = 1;
9351       }
9352     }else{
9353       rc = moveToRoot(pCur);
9354       if( bPreserve ){
9355         btreeReleaseAllCursorPages(pCur);
9356         pCur->eState = CURSOR_REQUIRESEEK;
9357       }
9358       if( rc==SQLITE_EMPTY ) rc = SQLITE_OK;
9359     }
9360   }
9361   return rc;
9362 }
9363 
9364 /*
9365 ** Create a new BTree table.  Write into *piTable the page
9366 ** number for the root page of the new table.
9367 **
9368 ** The type of table is determined by the createTabFlags parameter.  Only
9369 ** the following values are currently in use.  Other values might not
9370 ** work:
9371 **
9372 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
9373 **     BTREE_ZERODATA                  Used for SQL indices
     **
     ** Only the BTREE_INTKEY bit is examined here: when it is set the new
     ** root page is initialized as an intkey leaf-data leaf page, otherwise
     ** as a zero-data leaf page.
     **
     ** On an auto-vacuum database the root page is placed at a specific page
     ** number (derived from the largest root page recorded in the meta
     ** data), and whatever currently occupies that page is relocated first.
     **
     ** Return SQLITE_OK on success.  On failure an SQLite error code is
     ** returned and *piTable is not written.
9374 */
9375 static int btreeCreateTable(Btree *p, Pgno *piTable, int createTabFlags){
9376   BtShared *pBt = p->pBt;
9377   MemPage *pRoot;
9378   Pgno pgnoRoot;
9379   int rc;
9380   int ptfFlags;          /* Page-type flags for the root page of new table */
9381
9382   assert( sqlite3BtreeHoldsMutex(p) );
9383   assert( pBt->inTransaction==TRANS_WRITE );
9384   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
9385
9386 #ifdef SQLITE_OMIT_AUTOVACUUM
9387   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
9388   if( rc ){
9389     return rc;
9390   }
9391 #else
9392   if( pBt->autoVacuum ){
9393     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
9394     MemPage *pPageMove; /* The page to move to. */
9395
9396     /* Creating a new table may require moving an existing database page
9397     ** to make room for the new table's root page. In case this page turns
9398     ** out to be an overflow page, delete all overflow page-map caches
9399     ** held by open cursors.
9400     */
9401     invalidateAllOverflowCache(pBt);
9402
9403     /* Read the value of meta[3] from the database to determine where the
9404     ** root page of the new table should go. meta[3] is the largest root-page
9405     ** created so far, so the new root-page is (meta[3]+1).
9406     */
9407     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
9408     if( pgnoRoot>btreePagecount(pBt) ){
9409       return SQLITE_CORRUPT_BKPT;
9410     }
9411     pgnoRoot++;
9412
9413     /* The new root-page may not be allocated on a pointer-map page, or the
9414     ** PENDING_BYTE page.
9415     */
9416     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
9417         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
9418       pgnoRoot++;
9419     }
9420     assert( pgnoRoot>=3 );
9421
9422     /* Allocate a page. The page that currently resides at pgnoRoot will
9423     ** be moved to the allocated page (unless the allocated page happens
9424     ** to reside at pgnoRoot).
9425     */
9426     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
9427     if( rc!=SQLITE_OK ){
9428       return rc;
9429     }
9430
9431     if( pgnoMove!=pgnoRoot ){
9432       /* pgnoRoot is the page that will be used for the root-page of
9433       ** the new table (assuming an error did not occur). But we were
9434       ** allocated pgnoMove. If required (i.e. if it was not allocated
9435       ** by extending the file), the current page at position pgnoMove
9436       ** is already journaled.
9437       */
9438       u8 eType = 0;
9439       Pgno iPtrPage = 0;
9440
9441       /* Save the positions of any open cursors. This is required in
9442       ** case they are holding a reference to an xFetch reference
9443       ** corresponding to page pgnoRoot.  */
9444       rc = saveAllCursors(pBt, 0, 0);
9445       releasePage(pPageMove);
9446       if( rc!=SQLITE_OK ){
9447         return rc;
9448       }
9449
9450       /* Move the page currently at pgnoRoot to pgnoMove. */
9451       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
9452       if( rc!=SQLITE_OK ){
9453         return rc;
9454       }
         /* Look up what kind of page currently sits at pgnoRoot.  A page that
         ** the pointer map says is already a root page or a free page should
         ** not be at this position, so treat that as corruption. */
9455       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
9456       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
9457         rc = SQLITE_CORRUPT_BKPT;
9458       }
9459       if( rc!=SQLITE_OK ){
9460         releasePage(pRoot);
9461         return rc;
9462       }
9463       assert( eType!=PTRMAP_ROOTPAGE );
9464       assert( eType!=PTRMAP_FREEPAGE );
9465       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
9466       releasePage(pRoot);
9467
9468       /* Obtain the page at pgnoRoot */
9469       if( rc!=SQLITE_OK ){
9470         return rc;
9471       }
9472       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
9473       if( rc!=SQLITE_OK ){
9474         return rc;
9475       }
9476       rc = sqlite3PagerWrite(pRoot->pDbPage);
9477       if( rc!=SQLITE_OK ){
9478         releasePage(pRoot);
9479         return rc;
9480       }
9481     }else{
         /* The allocator handed back exactly the page that was asked for, so
         ** nothing needs to be relocated. */
9482       pRoot = pPageMove;
9483     }
9484
9485     /* Update the pointer-map and meta-data with the new root-page number. */
9486     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
9487     if( rc ){
9488       releasePage(pRoot);
9489       return rc;
9490     }
9491
9492     /* When the new root page was allocated, page 1 was made writable in
9493     ** order either to increase the database filesize, or to decrement the
9494     ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
9495     */
9496     assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
       /* NOTE(review): the literal 4 is presumably BTREE_LARGEST_ROOT_PAGE,
       ** the meta slot read with sqlite3BtreeGetMeta() above -- confirm. */
9497     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
9498     if( NEVER(rc) ){
9499       releasePage(pRoot);
9500       return rc;
9501     }
9502
9503   }else{
9504     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
9505     if( rc ) return rc;
9506   }
9507 #endif
     /* pRoot is now a writable page at pgnoRoot.  Initialize it as an empty
     ** leaf of the requested kind, drop the reference, and report the page
     ** number to the caller. */
9508   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
9509   if( createTabFlags & BTREE_INTKEY ){
9510     ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
9511   }else{
9512     ptfFlags = PTF_ZERODATA | PTF_LEAF;
9513   }
9514   zeroPage(pRoot, ptfFlags);
9515   sqlite3PagerUnref(pRoot->pDbPage);
9516   assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
9517   *piTable = pgnoRoot;
9518   return SQLITE_OK;
9519 }
9520 int sqlite3BtreeCreateTable(Btree *p, Pgno *piTable, int flags){
9521   int rc;
9522   sqlite3BtreeEnter(p);
9523   rc = btreeCreateTable(p, piTable, flags);
9524   sqlite3BtreeLeave(p);
9525   return rc;
9526 }
9527 
9528 /*
9529 ** Erase the given database page and all its children.  Return
9530 ** the page to the freelist.
9531 */
9532 static int clearDatabasePage(
9533   BtShared *pBt,           /* The BTree that contains the table */
9534   Pgno pgno,               /* Page number to clear */
9535   int freePageFlag,        /* Deallocate page if true */
9536   i64 *pnChange            /* Add number of Cells freed to this counter */
9537 ){
9538   MemPage *pPage;
9539   int rc;
9540   unsigned char *pCell;
9541   int i;
9542   int hdr;
9543   CellInfo info;
9544 
9545   assert( sqlite3_mutex_held(pBt->mutex) );
9546   if( pgno>btreePagecount(pBt) ){
9547     return SQLITE_CORRUPT_BKPT;
9548   }
9549   rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
9550   if( rc ) return rc;
9551   if( pPage->bBusy ){
9552     rc = SQLITE_CORRUPT_BKPT;
9553     goto cleardatabasepage_out;
9554   }
9555   pPage->bBusy = 1;
9556   hdr = pPage->hdrOffset;
9557   for(i=0; i<pPage->nCell; i++){
9558     pCell = findCell(pPage, i);
9559     if( !pPage->leaf ){
9560       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
9561       if( rc ) goto cleardatabasepage_out;
9562     }
9563     BTREE_CLEAR_CELL(rc, pPage, pCell, info);
9564     if( rc ) goto cleardatabasepage_out;
9565   }
9566   if( !pPage->leaf ){
9567     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
9568     if( rc ) goto cleardatabasepage_out;
9569     if( pPage->intKey ) pnChange = 0;
9570   }
9571   if( pnChange ){
9572     testcase( !pPage->intKey );
9573     *pnChange += pPage->nCell;
9574   }
9575   if( freePageFlag ){
9576     freePage(pPage, &rc);
9577   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
9578     zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
9579   }
9580 
9581 cleardatabasepage_out:
9582   pPage->bBusy = 0;
9583   releasePage(pPage);
9584   return rc;
9585 }
9586 
9587 /*
9588 ** Delete all information from a single table in the database.  iTable is
9589 ** the page number of the root of the table.  After this routine returns,
9590 ** the root page is empty, but still exists.
9591 **
9592 ** This routine will fail with SQLITE_LOCKED if there are any open
9593 ** read cursors on the table.  Open write cursors are moved to the
9594 ** root of the table.
9595 **
9596 ** If pnChange is not NULL, then the integer value pointed to by pnChange
9597 ** is incremented by the number of entries in the table.
9598 */
9599 int sqlite3BtreeClearTable(Btree *p, int iTable, i64 *pnChange){
9600   int rc;
9601   BtShared *pBt = p->pBt;
9602   sqlite3BtreeEnter(p);
9603   assert( p->inTrans==TRANS_WRITE );
9604 
9605   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
9606 
9607   if( SQLITE_OK==rc ){
9608     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
9609     ** is the root of a table b-tree - if it is not, the following call is
9610     ** a no-op).  */
9611     if( p->hasIncrblobCur ){
9612       invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1);
9613     }
9614     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
9615   }
9616   sqlite3BtreeLeave(p);
9617   return rc;
9618 }
9619 
9620 /*
9621 ** Delete all information from the single table that pCur is open on.
9622 **
9623 ** This routine only work for pCur on an ephemeral table.
9624 */
9625 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
9626   return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
9627 }
9628 
9629 /*
9630 ** Erase all information in a table and add the root of the table to
9631 ** the freelist.  Except, the root of the principle table (the one on
9632 ** page 1) is never added to the freelist.
9633 **
9634 ** This routine will fail with SQLITE_LOCKED if there are any open
9635 ** cursors on the table.
9636 **
9637 ** If AUTOVACUUM is enabled and the page at iTable is not the last
9638 ** root page in the database file, then the last root page
9639 ** in the database file is moved into the slot formerly occupied by
9640 ** iTable and that last slot formerly occupied by the last root page
9641 ** is added to the freelist instead of iTable.  In this say, all
9642 ** root pages are kept at the beginning of the database file, which
9643 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
9644 ** page number that used to be the last root page in the file before
9645 ** the move.  If no page gets moved, *piMoved is set to 0.
9646 ** The last root page is recorded in meta[3] and the value of
9647 ** meta[3] is updated by this procedure.
9648 */
9649 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
9650   int rc;
9651   MemPage *pPage = 0;
9652   BtShared *pBt = p->pBt;
9653 
9654   assert( sqlite3BtreeHoldsMutex(p) );
9655   assert( p->inTrans==TRANS_WRITE );
9656   assert( iTable>=2 );
9657   if( iTable>btreePagecount(pBt) ){
9658     return SQLITE_CORRUPT_BKPT;
9659   }
9660 
9661   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
9662   if( rc ) return rc;
9663   rc = sqlite3BtreeClearTable(p, iTable, 0);
9664   if( rc ){
9665     releasePage(pPage);
9666     return rc;
9667   }
9668 
9669   *piMoved = 0;
9670 
9671 #ifdef SQLITE_OMIT_AUTOVACUUM
9672   freePage(pPage, &rc);
9673   releasePage(pPage);
9674 #else
9675   if( pBt->autoVacuum ){
9676     Pgno maxRootPgno;
9677     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
9678 
9679     if( iTable==maxRootPgno ){
9680       /* If the table being dropped is the table with the largest root-page
9681       ** number in the database, put the root page on the free list.
9682       */
9683       freePage(pPage, &rc);
9684       releasePage(pPage);
9685       if( rc!=SQLITE_OK ){
9686         return rc;
9687       }
9688     }else{
9689       /* The table being dropped does not have the largest root-page
9690       ** number in the database. So move the page that does into the
9691       ** gap left by the deleted root-page.
9692       */
9693       MemPage *pMove;
9694       releasePage(pPage);
9695       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
9696       if( rc!=SQLITE_OK ){
9697         return rc;
9698       }
9699       rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
9700       releasePage(pMove);
9701       if( rc!=SQLITE_OK ){
9702         return rc;
9703       }
9704       pMove = 0;
9705       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
9706       freePage(pMove, &rc);
9707       releasePage(pMove);
9708       if( rc!=SQLITE_OK ){
9709         return rc;
9710       }
9711       *piMoved = maxRootPgno;
9712     }
9713 
9714     /* Set the new 'max-root-page' value in the database header. This
9715     ** is the old value less one, less one more if that happens to
9716     ** be a root-page number, less one again if that is the
9717     ** PENDING_BYTE_PAGE.
9718     */
9719     maxRootPgno--;
9720     while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
9721            || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
9722       maxRootPgno--;
9723     }
9724     assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
9725 
9726     rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
9727   }else{
9728     freePage(pPage, &rc);
9729     releasePage(pPage);
9730   }
9731 #endif
9732   return rc;
9733 }
9734 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
9735   int rc;
9736   sqlite3BtreeEnter(p);
9737   rc = btreeDropTable(p, iTable, piMoved);
9738   sqlite3BtreeLeave(p);
9739   return rc;
9740 }
9741 
9742 
9743 /*
9744 ** This function may only be called if the b-tree connection already
9745 ** has a read or write transaction open on the database.
9746 **
9747 ** Read the meta-information out of a database file.  Meta[0]
9748 ** is the number of free pages currently in the database.  Meta[1]
9749 ** through meta[15] are available for use by higher layers.  Meta[0]
9750 ** is read-only, the others are read/write.
9751 **
9752 ** The schema layer numbers meta values differently.  At the schema
9753 ** layer (and the SetCookie and ReadCookie opcodes) the number of
9754 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
9755 **
9756 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
9757 ** of reading the value out of the header, it instead loads the "DataVersion"
9758 ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
9759 ** database file.  It is a number computed by the pager.  But its access
9760 ** pattern is the same as header meta values, and so it is convenient to
9761 ** read it from this routine.
9762 */
9763 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
9764   BtShared *pBt = p->pBt;
9765 
9766   sqlite3BtreeEnter(p);
9767   assert( p->inTrans>TRANS_NONE );
9768   assert( SQLITE_OK==querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK) );
9769   assert( pBt->pPage1 );
9770   assert( idx>=0 && idx<=15 );
9771 
9772   if( idx==BTREE_DATA_VERSION ){
9773     *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iBDataVersion;
9774   }else{
9775     *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
9776   }
9777 
9778   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
9779   ** database, mark the database as read-only.  */
9780 #ifdef SQLITE_OMIT_AUTOVACUUM
9781   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
9782     pBt->btsFlags |= BTS_READ_ONLY;
9783   }
9784 #endif
9785 
9786   sqlite3BtreeLeave(p);
9787 }
9788 
9789 /*
9790 ** Write meta-information back into the database.  Meta[0] is
9791 ** read-only and may not be written.
9792 */
9793 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
9794   BtShared *pBt = p->pBt;
9795   unsigned char *pP1;
9796   int rc;
9797   assert( idx>=1 && idx<=15 );
9798   sqlite3BtreeEnter(p);
9799   assert( p->inTrans==TRANS_WRITE );
9800   assert( pBt->pPage1!=0 );
9801   pP1 = pBt->pPage1->aData;
9802   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
9803   if( rc==SQLITE_OK ){
9804     put4byte(&pP1[36 + idx*4], iMeta);
9805 #ifndef SQLITE_OMIT_AUTOVACUUM
9806     if( idx==BTREE_INCR_VACUUM ){
9807       assert( pBt->autoVacuum || iMeta==0 );
9808       assert( iMeta==0 || iMeta==1 );
9809       pBt->incrVacuum = (u8)iMeta;
9810     }
9811 #endif
9812   }
9813   sqlite3BtreeLeave(p);
9814   return rc;
9815 }
9816 
9817 /*
9818 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
9819 ** number of entries in the b-tree and write the result to *pnEntry.
9820 **
9821 ** SQLITE_OK is returned if the operation is successfully executed.
9822 ** Otherwise, if an error is encountered (i.e. an IO error or database
9823 ** corruption) an SQLite error code is returned.
9824 */
9825 int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){
9826   i64 nEntry = 0;                      /* Value to return in *pnEntry */
9827   int rc;                              /* Return code */
9828 
9829   rc = moveToRoot(pCur);
9830   if( rc==SQLITE_EMPTY ){
9831     *pnEntry = 0;
9832     return SQLITE_OK;
9833   }
9834 
9835   /* Unless an error occurs, the following loop runs one iteration for each
9836   ** page in the B-Tree structure (not including overflow pages).
9837   */
9838   while( rc==SQLITE_OK && !AtomicLoad(&db->u1.isInterrupted) ){
9839     int iIdx;                          /* Index of child node in parent */
9840     MemPage *pPage;                    /* Current page of the b-tree */
9841 
9842     /* If this is a leaf page or the tree is not an int-key tree, then
9843     ** this page contains countable entries. Increment the entry counter
9844     ** accordingly.
9845     */
9846     pPage = pCur->pPage;
9847     if( pPage->leaf || !pPage->intKey ){
9848       nEntry += pPage->nCell;
9849     }
9850 
9851     /* pPage is a leaf node. This loop navigates the cursor so that it
9852     ** points to the first interior cell that it points to the parent of
9853     ** the next page in the tree that has not yet been visited. The
9854     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
9855     ** of the page, or to the number of cells in the page if the next page
9856     ** to visit is the right-child of its parent.
9857     **
9858     ** If all pages in the tree have been visited, return SQLITE_OK to the
9859     ** caller.
9860     */
9861     if( pPage->leaf ){
9862       do {
9863         if( pCur->iPage==0 ){
9864           /* All pages of the b-tree have been visited. Return successfully. */
9865           *pnEntry = nEntry;
9866           return moveToRoot(pCur);
9867         }
9868         moveToParent(pCur);
9869       }while ( pCur->ix>=pCur->pPage->nCell );
9870 
9871       pCur->ix++;
9872       pPage = pCur->pPage;
9873     }
9874 
9875     /* Descend to the child node of the cell that the cursor currently
9876     ** points at. This is the right-child if (iIdx==pPage->nCell).
9877     */
9878     iIdx = pCur->ix;
9879     if( iIdx==pPage->nCell ){
9880       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
9881     }else{
9882       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
9883     }
9884   }
9885 
9886   /* An error has occurred. Return an error code. */
9887   return rc;
9888 }
9889 
9890 /*
9891 ** Return the pager associated with a BTree.  This routine is used for
9892 ** testing and debugging only.
9893 */
9894 Pager *sqlite3BtreePager(Btree *p){
9895   return p->pBt->pPager;
9896 }
9897 
9898 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9899 /*
9900 ** Append a message to the error message string.
9901 */
9902 static void checkAppendMsg(
9903   IntegrityCk *pCheck,
9904   const char *zFormat,
9905   ...
9906 ){
9907   va_list ap;
9908   if( !pCheck->mxErr ) return;
9909   pCheck->mxErr--;
9910   pCheck->nErr++;
9911   va_start(ap, zFormat);
9912   if( pCheck->errMsg.nChar ){
9913     sqlite3_str_append(&pCheck->errMsg, "\n", 1);
9914   }
9915   if( pCheck->zPfx ){
9916     sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
9917   }
9918   sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap);
9919   va_end(ap);
9920   if( pCheck->errMsg.accError==SQLITE_NOMEM ){
9921     pCheck->bOomFault = 1;
9922   }
9923 }
9924 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9925 
9926 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9927 
9928 /*
9929 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
9930 ** corresponds to page iPg is already set.
9931 */
9932 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
9933   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
9934   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
9935 }
9936 
9937 /*
9938 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
9939 */
9940 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
9941   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
9942   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
9943 }
9944 
9945 
9946 /*
9947 ** Add 1 to the reference count for page iPage.  If this is the second
9948 ** reference to the page, add an error message to pCheck->zErrMsg.
9949 ** Return 1 if there are 2 or more references to the page and 0 if
9950 ** if this is the first reference to the page.
9951 **
9952 ** Also check that the page number is in bounds.
9953 */
9954 static int checkRef(IntegrityCk *pCheck, Pgno iPage){
9955   if( iPage>pCheck->nPage || iPage==0 ){
9956     checkAppendMsg(pCheck, "invalid page number %d", iPage);
9957     return 1;
9958   }
9959   if( getPageReferenced(pCheck, iPage) ){
9960     checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
9961     return 1;
9962   }
9963   if( AtomicLoad(&pCheck->db->u1.isInterrupted) ) return 1;
9964   setPageReferenced(pCheck, iPage);
9965   return 0;
9966 }
9967 
9968 #ifndef SQLITE_OMIT_AUTOVACUUM
9969 /*
9970 ** Check that the entry in the pointer-map for page iChild maps to
9971 ** page iParent, pointer type ptrType. If not, append an error message
9972 ** to pCheck.
9973 */
9974 static void checkPtrmap(
9975   IntegrityCk *pCheck,   /* Integrity check context */
9976   Pgno iChild,           /* Child page number */
9977   u8 eType,              /* Expected pointer map type */
9978   Pgno iParent           /* Expected pointer map parent page number */
9979 ){
9980   int rc;
9981   u8 ePtrmapType;
9982   Pgno iPtrmapParent;
9983 
9984   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
9985   if( rc!=SQLITE_OK ){
9986     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->bOomFault = 1;
9987     checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
9988     return;
9989   }
9990 
9991   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
9992     checkAppendMsg(pCheck,
9993       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
9994       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
9995   }
9996 }
9997 #endif
9998 
9999 /*
10000 ** Check the integrity of the freelist or of an overflow page list.
10001 ** Verify that the number of pages on the list is N.
10002 */
10003 static void checkList(
10004   IntegrityCk *pCheck,  /* Integrity checking context */
10005   int isFreeList,       /* True for a freelist.  False for overflow page list */
10006   Pgno iPage,           /* Page number for first page in the list */
10007   u32 N                 /* Expected number of pages in the list */
10008 ){
10009   int i;
10010   u32 expected = N;
10011   int nErrAtStart = pCheck->nErr;
10012   while( iPage!=0 && pCheck->mxErr ){
10013     DbPage *pOvflPage;
10014     unsigned char *pOvflData;
10015     if( checkRef(pCheck, iPage) ) break;
10016     N--;
10017     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
10018       checkAppendMsg(pCheck, "failed to get page %d", iPage);
10019       break;
10020     }
10021     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
10022     if( isFreeList ){
10023       u32 n = (u32)get4byte(&pOvflData[4]);
10024 #ifndef SQLITE_OMIT_AUTOVACUUM
10025       if( pCheck->pBt->autoVacuum ){
10026         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
10027       }
10028 #endif
10029       if( n>pCheck->pBt->usableSize/4-2 ){
10030         checkAppendMsg(pCheck,
10031            "freelist leaf count too big on page %d", iPage);
10032         N--;
10033       }else{
10034         for(i=0; i<(int)n; i++){
10035           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
10036 #ifndef SQLITE_OMIT_AUTOVACUUM
10037           if( pCheck->pBt->autoVacuum ){
10038             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
10039           }
10040 #endif
10041           checkRef(pCheck, iFreePage);
10042         }
10043         N -= n;
10044       }
10045     }
10046 #ifndef SQLITE_OMIT_AUTOVACUUM
10047     else{
10048       /* If this database supports auto-vacuum and iPage is not the last
10049       ** page in this overflow list, check that the pointer-map entry for
10050       ** the following page matches iPage.
10051       */
10052       if( pCheck->pBt->autoVacuum && N>0 ){
10053         i = get4byte(pOvflData);
10054         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
10055       }
10056     }
10057 #endif
10058     iPage = get4byte(pOvflData);
10059     sqlite3PagerUnref(pOvflPage);
10060   }
10061   if( N && nErrAtStart==pCheck->nErr ){
10062     checkAppendMsg(pCheck,
10063       "%s is %d but should be %d",
10064       isFreeList ? "size" : "overflow list length",
10065       expected-N, expected);
10066   }
10067 }
10068 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
10069 
10070 /*
10071 ** An implementation of a min-heap.
10072 **
10073 ** aHeap[0] is the number of elements on the heap.  aHeap[1] is the
10074 ** root element.  The daughter nodes of aHeap[N] are aHeap[N*2]
10075 ** and aHeap[N*2+1].
10076 **
10077 ** The heap property is this:  Every node is less than or equal to both
10078 ** of its daughter nodes.  A consequence of the heap property is that the
10079 ** root node aHeap[1] is always the minimum value currently in the heap.
10080 **
10081 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto
10082 ** the heap, preserving the heap property.  The btreeHeapPull() routine
10083 ** removes the root element from the heap (the minimum value in the heap)
10084 ** and then moves other nodes around as necessary to preserve the heap
10085 ** property.
10086 **
10087 ** This heap is used for cell overlap and coverage testing.  Each u32
10088 ** entry represents the span of a cell or freeblock on a btree page.
10089 ** The upper 16 bits are the index of the first byte of a range and the
10090 ** lower 16 bits are the index of the last byte of that range.
10091 */
10092 static void btreeHeapInsert(u32 *aHeap, u32 x){
10093   u32 j, i = ++aHeap[0];
10094   aHeap[i] = x;
10095   while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){
10096     x = aHeap[j];
10097     aHeap[j] = aHeap[i];
10098     aHeap[i] = x;
10099     i = j;
10100   }
10101 }
10102 static int btreeHeapPull(u32 *aHeap, u32 *pOut){
10103   u32 j, i, x;
10104   if( (x = aHeap[0])==0 ) return 0;
10105   *pOut = aHeap[1];
10106   aHeap[1] = aHeap[x];
10107   aHeap[x] = 0xffffffff;
10108   aHeap[0]--;
10109   i = 1;
10110   while( (j = i*2)<=aHeap[0] ){
10111     if( aHeap[j]>aHeap[j+1] ) j++;
10112     if( aHeap[i]<aHeap[j] ) break;
10113     x = aHeap[i];
10114     aHeap[i] = aHeap[j];
10115     aHeap[j] = x;
10116     i = j;
10117   }
10118   return 1;
10119 }
10120 
10121 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
10122 /*
10123 ** Do various sanity checks on a single page of a tree.  Return
10124 ** the height of the page above the leaves.  Leaf pages return 0.
10125 ** Parents of leaf pages return 1, and so forth.
10126 **
10127 ** These checks are done:
10128 **
10129 **      1.  Make sure that cells and freeblocks do not overlap
10130 **          but combine to completely cover the page.
10131 **      2.  Make sure integer cell keys are in order.
10132 **      3.  Check the integrity of overflow pages.
10133 **      4.  Recursively call checkTreePage on all children.
10134 **      5.  Verify that the depth of all children is the same.
**
** 0 is also returned, without examining anything, when iPage is 0 or
** when checkRef() rejects the page, and when the page cannot be fetched
** or initialized (the local depth is still -1 on those paths).
**
** *piMinKey is written only if the cell loop is reached; it receives the
** value left in maxKey once that loop finishes.  It is not written on any
** of the early exits.
**
** The zPfx/v1/v2 message-prefix fields of pCheck are changed while the
** page is examined and restored to their entry values before returning
** through end_of_check.
10135 */
10136 static int checkTreePage(
10137   IntegrityCk *pCheck,  /* Context for the sanity check */
10138   Pgno iPage,           /* Page number of the page to check */
10139   i64 *piMinKey,        /* Write minimum integer primary key here */
10140   i64 maxKey            /* Error if integer primary key greater than this */
10141 ){
10142   MemPage *pPage = 0;      /* The page being analyzed */
10143   int i;                   /* Loop counter */
10144   int rc;                  /* Result code from subroutine call */
10145   int depth = -1, d2;      /* Depth of a subtree */
10146   int pgno;                /* Page number */
10147   int nFrag;               /* Number of fragmented bytes on the page */
10148   int hdr;                 /* Offset to the page header */
10149   int cellStart;           /* Offset to the start of the cell pointer array */
10150   int nCell;               /* Number of cells */
10151   int doCoverageCheck = 1; /* True if cell coverage checking should be done */
10152   int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
10153                            ** False if IPK must be strictly less than maxKey */
10154   u8 *data;                /* Page content */
10155   u8 *pCell;               /* Cell content */
10156   u8 *pCellIdx;            /* Next element of the cell pointer array */
10157   BtShared *pBt;           /* The BtShared object that owns pPage */
10158   u32 pc;                  /* Address of a cell */
10159   u32 usableSize;          /* Usable size of the page */
10160   u32 contentOffset;       /* Offset to the start of the cell content area */
10161   u32 *heap = 0;           /* Min-heap used for checking cell coverage */
10162   u32 x, prev = 0;         /* Next and previous entry on the min-heap */
10163   const char *saved_zPfx = pCheck->zPfx;
10164   int saved_v1 = pCheck->v1;
10165   int saved_v2 = pCheck->v2;
10166   u8 savedIsInit = 0;
10167 
10168   /* Check that the page exists
10169   */
10170   pBt = pCheck->pBt;
10171   usableSize = pBt->usableSize;
10172   if( iPage==0 ) return 0;
10173   if( checkRef(pCheck, iPage) ) return 0;
10174   pCheck->zPfx = "Page %u: ";
10175   pCheck->v1 = iPage;
10176   if( (rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0 ){
10177     checkAppendMsg(pCheck,
10178        "unable to get the page. error code=%d", rc);
10179     goto end_of_check;
10180   }
10181 
10182   /* Clear MemPage.isInit to make sure the corruption detection code in
10183   ** btreeInitPage() is executed.  */
10184   savedIsInit = pPage->isInit;
10185   pPage->isInit = 0;
10186   if( (rc = btreeInitPage(pPage))!=0 ){
10187     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
10188     checkAppendMsg(pCheck,
10189                    "btreeInitPage() returns error code %d", rc);
10190     goto end_of_check;
10191   }
10192   if( (rc = btreeComputeFreeSpace(pPage))!=0 ){
10193     assert( rc==SQLITE_CORRUPT );
    /* NOTE(review): the format string below has no conversion for the
    ** trailing rc argument, so rc is passed but not printed. */
10194     checkAppendMsg(pCheck, "free space corruption", rc);
10195     goto end_of_check;
10196   }
10197   data = pPage->aData;
10198   hdr = pPage->hdrOffset;
10199 
10200   /* Set up for cell analysis */
10201   pCheck->zPfx = "On tree page %u cell %d: ";
10202   contentOffset = get2byteNotZero(&data[hdr+5]);
10203   assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
10204 
10205   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
10206   ** number of cells on the page. */
10207   nCell = get2byte(&data[hdr+3]);
10208   assert( pPage->nCell==nCell );
10209 
10210   /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
10211   ** immediately follows the b-tree page header. */
10212   cellStart = hdr + 12 - 4*pPage->leaf;
10213   assert( pPage->aCellIdx==&data[cellStart] );
  /* Cells are visited from the last pointer-array slot down to the first */
10214   pCellIdx = &data[cellStart + 2*(nCell-1)];
10215 
10216   if( !pPage->leaf ){
10217     /* Analyze the right-child page of internal pages */
10218     pgno = get4byte(&data[hdr+8]);
10219 #ifndef SQLITE_OMIT_AUTOVACUUM
10220     if( pBt->autoVacuum ){
10221       pCheck->zPfx = "On page %u at right child: ";
10222       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
10223     }
10224 #endif
    /* The child call overwrites maxKey through its piMinKey argument */
10225     depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
10226     keyCanBeEqual = 0;
10227   }else{
10228     /* For leaf pages, the coverage check will occur in the same loop
10229     ** as the other cell checks, so initialize the heap.  */
10230     heap = pCheck->heap;
10231     heap[0] = 0;
10232   }
10233 
10234   /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
10235   ** integer offsets to the cell contents. */
10236   for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
10237     CellInfo info;
10238 
10239     /* Check cell size */
10240     pCheck->v2 = i;
10241     assert( pCellIdx==&data[cellStart + i*2] );
10242     pc = get2byteAligned(pCellIdx);
10243     pCellIdx -= 2;
10244     if( pc<contentOffset || pc>usableSize-4 ){
10245       checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
10246                              pc, contentOffset, usableSize-4);
10247       doCoverageCheck = 0;
10248       continue;
10249     }
10250     pCell = &data[pc];
10251     pPage->xParseCell(pPage, pCell, &info);
10252     if( pc+info.nSize>usableSize ){
10253       checkAppendMsg(pCheck, "Extends off end of page");
10254       doCoverageCheck = 0;
10255       continue;
10256     }
10257 
10258     /* Check for integer primary key out of range */
10259     if( pPage->intKey ){
10260       if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
10261         checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
10262       }
10263       maxKey = info.nKey;
10264       keyCanBeEqual = 0;     /* Only the first key on the page may ==maxKey */
10265     }
10266 
10267     /* Check the content overflow list */
10268     if( info.nPayload>info.nLocal ){
10269       u32 nPage;       /* Number of pages on the overflow chain */
10270       Pgno pgnoOvfl;   /* First page of the overflow chain */
10271       assert( pc + info.nSize - 4 <= usableSize );
      /* Ceiling of (nPayload-nLocal)/(usableSize-4) */
10272       nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
10273       pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
10274 #ifndef SQLITE_OMIT_AUTOVACUUM
10275       if( pBt->autoVacuum ){
10276         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
10277       }
10278 #endif
10279       checkList(pCheck, 0, pgnoOvfl, nPage);
10280     }
10281 
10282     if( !pPage->leaf ){
10283       /* Check sanity of left child page for internal pages */
10284       pgno = get4byte(pCell);
10285 #ifndef SQLITE_OMIT_AUTOVACUUM
10286       if( pBt->autoVacuum ){
10287         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
10288       }
10289 #endif
10290       d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
10291       keyCanBeEqual = 0;
10292       if( d2!=depth ){
10293         checkAppendMsg(pCheck, "Child page depth differs");
10294         depth = d2;
10295       }
10296     }else{
10297       /* Populate the coverage-checking heap for leaf pages */
10298       btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
10299     }
10300   }
10301   *piMinKey = maxKey;
10302 
10303   /* Check for complete coverage of the page
10304   */
10305   pCheck->zPfx = 0;
10306   if( doCoverageCheck && pCheck->mxErr>0 ){
10307     /* For leaf pages, the min-heap has already been initialized and the
10308     ** cells have already been inserted.  But for internal pages, that has
10309     ** not yet been done, so do it now */
10310     if( !pPage->leaf ){
10311       heap = pCheck->heap;
10312       heap[0] = 0;
10313       for(i=nCell-1; i>=0; i--){
10314         u32 size;
10315         pc = get2byteAligned(&data[cellStart+i*2]);
10316         size = pPage->xCellSize(pPage, &data[pc]);
10317         btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
10318       }
10319     }
10320     /* Add the freeblocks to the min-heap
10321     **
10322     ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
10323     ** is the offset of the first freeblock, or zero if there are no
10324     ** freeblocks on the page.
10325     */
10326     i = get2byte(&data[hdr+1]);
10327     while( i>0 ){
10328       int size, j;
10329       assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
10330       size = get2byte(&data[i+2]);
10331       assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */
10332       btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
10333       /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
10334       ** big-endian integer which is the offset in the b-tree page of the next
10335       ** freeblock in the chain, or zero if the freeblock is the last on the
10336       ** chain. */
10337       j = get2byte(&data[i]);
10338       /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
10339       ** increasing offset. */
10340       assert( j==0 || j>i+size );     /* Enforced by btreeComputeFreeSpace() */
10341       assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
10342       i = j;
10343     }
10344     /* Analyze the min-heap looking for overlap between cells and/or
10345     ** freeblocks, and counting the number of untracked bytes in nFrag.
10346     **
10347     ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
10348     ** There is an implied first entry the covers the page header, the cell
10349     ** pointer index, and the gap between the cell pointer index and the start
10350     ** of cell content.
10351     **
10352     ** The loop below pulls entries from the min-heap in order and compares
10353     ** the start_address against the previous end_address.  If there is an
10354     ** overlap, that means bytes are used multiple times.  If there is a gap,
10355     ** that gap is added to the fragmentation count.
10356     */
10357     nFrag = 0;
10358     prev = contentOffset - 1;   /* Implied first min-heap entry */
10359     while( btreeHeapPull(heap,&x) ){
10360       if( (prev&0xffff)>=(x>>16) ){
10361         checkAppendMsg(pCheck,
10362           "Multiple uses for byte %u of page %u", x>>16, iPage);
10363         break;
10364       }else{
10365         nFrag += (x>>16) - (prev&0xffff) - 1;
10366         prev = x;
10367       }
10368     }
10369     nFrag += usableSize - (prev&0xffff) - 1;
10370     /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
10371     ** is stored in the fifth field of the b-tree page header.
10372     ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
10373     ** number of fragmented free bytes within the cell content area.
10374     */
    /* heap[0]==0 means the loop above drained the heap rather than
    ** stopping early at an overlap */
10375     if( heap[0]==0 && nFrag!=data[hdr+7] ){
10376       checkAppendMsg(pCheck,
10377           "Fragmentation of %d bytes reported as %d on page %u",
10378           nFrag, data[hdr+7], iPage);
10379     }
10380   }
10381 
10382 end_of_check:
  /* isInit is put back to its saved value only when a cell failed its
  ** bounds check (doCoverageCheck cleared).  doCoverageCheck is still 1
  ** on every goto taken before the cell loop, so pPage is not
  ** dereferenced here on those paths. */
10383   if( !doCoverageCheck ) pPage->isInit = savedIsInit;
10384   releasePage(pPage);
10385   pCheck->zPfx = saved_zPfx;
10386   pCheck->v1 = saved_v1;
10387   pCheck->v2 = saved_v2;
10388   return depth+1;
10389 }
10390 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
10391 
10392 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
10393 /*
10394 ** This routine does a complete check of the given BTree file.  aRoot[] is
10395 ** an array of page numbers where each page number is the root page of
10396 ** a table.  nRoot is the number of entries in aRoot.
10397 **
10398 ** A read-only or read-write transaction must be opened before calling
10399 ** this function.
10400 **
10401 ** Write the number of errors seen in *pnErr.  Except for some memory
10402 ** allocation errors,  an error message held in memory obtained from
10403 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
10404 ** returned.  If a memory allocation error occurs, NULL is returned.
10405 **
10406 ** If the first entry in aRoot[] is 0, that indicates that the list of
10407 ** root pages is incomplete.  This is a "partial integrity-check".  This
10408 ** happens when performing an integrity check on a single table.  The
10409 ** zero is skipped, of course.  But in addition, the freelist checks
10410 ** and the checks to make sure every page is referenced are also skipped,
10411 ** since obviously it is not possible to know which pages are covered by
10412 ** the unverified btrees.  Except, if aRoot[1] is 1, then the freelist
10413 ** checks are still performed.
10414 */
10415 char *sqlite3BtreeIntegrityCheck(
10416   sqlite3 *db,  /* Database connection that is running the check */
10417   Btree *p,     /* The btree to be checked */
10418   Pgno *aRoot,  /* An array of root pages numbers for individual trees */
10419   int nRoot,    /* Number of entries in aRoot[] */
10420   int mxErr,    /* Stop reporting errors after this many */
10421   int *pnErr    /* Write number of errors seen to this variable */
10422 ){
10423   Pgno i;
10424   IntegrityCk sCheck;
10425   BtShared *pBt = p->pBt;
10426   u64 savedDbFlags = pBt->db->flags;
10427   char zErr[100];
10428   int bPartial = 0;            /* True if not checking all btrees */
10429   int bCkFreelist = 1;         /* True to scan the freelist */
10430   VVA_ONLY( int nRef );
10431   assert( nRoot>0 );
10432 
10433   /* aRoot[0]==0 means this is a partial check */
10434   if( aRoot[0]==0 ){
10435     assert( nRoot>1 );
10436     bPartial = 1;
10437     if( aRoot[1]!=1 ) bCkFreelist = 0;
10438   }
10439 
10440   sqlite3BtreeEnter(p);
10441   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
10442   VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
10443   assert( nRef>=0 );
10444   sCheck.db = db;
10445   sCheck.pBt = pBt;
10446   sCheck.pPager = pBt->pPager;
10447   sCheck.nPage = btreePagecount(sCheck.pBt);
10448   sCheck.mxErr = mxErr;
10449   sCheck.nErr = 0;
10450   sCheck.bOomFault = 0;
10451   sCheck.zPfx = 0;
10452   sCheck.v1 = 0;
10453   sCheck.v2 = 0;
10454   sCheck.aPgRef = 0;
10455   sCheck.heap = 0;
10456   sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
10457   sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
10458   if( sCheck.nPage==0 ){
10459     goto integrity_ck_cleanup;
10460   }
10461 
10462   sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
10463   if( !sCheck.aPgRef ){
10464     sCheck.bOomFault = 1;
10465     goto integrity_ck_cleanup;
10466   }
10467   sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
10468   if( sCheck.heap==0 ){
10469     sCheck.bOomFault = 1;
10470     goto integrity_ck_cleanup;
10471   }
10472 
10473   i = PENDING_BYTE_PAGE(pBt);
10474   if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
10475 
10476   /* Check the integrity of the freelist
10477   */
10478   if( bCkFreelist ){
10479     sCheck.zPfx = "Main freelist: ";
10480     checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
10481               get4byte(&pBt->pPage1->aData[36]));
10482     sCheck.zPfx = 0;
10483   }
10484 
10485   /* Check all the tables.
10486   */
10487 #ifndef SQLITE_OMIT_AUTOVACUUM
10488   if( !bPartial ){
10489     if( pBt->autoVacuum ){
10490       Pgno mx = 0;
10491       Pgno mxInHdr;
10492       for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i];
10493       mxInHdr = get4byte(&pBt->pPage1->aData[52]);
10494       if( mx!=mxInHdr ){
10495         checkAppendMsg(&sCheck,
10496           "max rootpage (%d) disagrees with header (%d)",
10497           mx, mxInHdr
10498         );
10499       }
10500     }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){
10501       checkAppendMsg(&sCheck,
10502         "incremental_vacuum enabled with a max rootpage of zero"
10503       );
10504     }
10505   }
10506 #endif
10507   testcase( pBt->db->flags & SQLITE_CellSizeCk );
10508   pBt->db->flags &= ~(u64)SQLITE_CellSizeCk;
10509   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
10510     i64 notUsed;
10511     if( aRoot[i]==0 ) continue;
10512 #ifndef SQLITE_OMIT_AUTOVACUUM
10513     if( pBt->autoVacuum && aRoot[i]>1 && !bPartial ){
10514       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
10515     }
10516 #endif
10517     checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
10518   }
10519   pBt->db->flags = savedDbFlags;
10520 
10521   /* Make sure every page in the file is referenced
10522   */
10523   if( !bPartial ){
10524     for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
10525 #ifdef SQLITE_OMIT_AUTOVACUUM
10526       if( getPageReferenced(&sCheck, i)==0 ){
10527         checkAppendMsg(&sCheck, "Page %d is never used", i);
10528       }
10529 #else
10530       /* If the database supports auto-vacuum, make sure no tables contain
10531       ** references to pointer-map pages.
10532       */
10533       if( getPageReferenced(&sCheck, i)==0 &&
10534          (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
10535         checkAppendMsg(&sCheck, "Page %d is never used", i);
10536       }
10537       if( getPageReferenced(&sCheck, i)!=0 &&
10538          (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
10539         checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
10540       }
10541 #endif
10542     }
10543   }
10544 
10545   /* Clean  up and report errors.
10546   */
10547 integrity_ck_cleanup:
10548   sqlite3PageFree(sCheck.heap);
10549   sqlite3_free(sCheck.aPgRef);
10550   if( sCheck.bOomFault ){
10551     sqlite3_str_reset(&sCheck.errMsg);
10552     sCheck.nErr++;
10553   }
10554   *pnErr = sCheck.nErr;
10555   if( sCheck.nErr==0 ) sqlite3_str_reset(&sCheck.errMsg);
10556   /* Make sure this analysis did not leave any unref() pages. */
10557   assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
10558   sqlite3BtreeLeave(p);
10559   return sqlite3StrAccumFinish(&sCheck.errMsg);
10560 }
10561 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
10562 
10563 /*
10564 ** Return the full pathname of the underlying database file.  Return
10565 ** an empty string if the database is in-memory or a TEMP database.
10566 **
10567 ** The pager filename is invariant as long as the pager is
10568 ** open so it is safe to access without the BtShared mutex.
10569 */
10570 const char *sqlite3BtreeGetFilename(Btree *p){
10571   assert( p->pBt->pPager!=0 );
10572   return sqlite3PagerFilename(p->pBt->pPager, 1);
10573 }
10574 
10575 /*
10576 ** Return the pathname of the journal file for this database. The return
10577 ** value of this routine is the same regardless of whether the journal file
10578 ** has been created or not.
10579 **
10580 ** The pager journal filename is invariant as long as the pager is
10581 ** open so it is safe to access without the BtShared mutex.
10582 */
10583 const char *sqlite3BtreeGetJournalname(Btree *p){
10584   assert( p->pBt->pPager!=0 );
10585   return sqlite3PagerJournalname(p->pBt->pPager);
10586 }
10587 
10588 /*
10589 ** Return one of SQLITE_TXN_NONE, SQLITE_TXN_READ, or SQLITE_TXN_WRITE
10590 ** to describe the current transaction state of Btree p.
10591 */
10592 int sqlite3BtreeTxnState(Btree *p){
10593   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
10594   return p ? p->inTrans : 0;
10595 }
10596 
10597 #ifndef SQLITE_OMIT_WAL
10598 /*
10599 ** Run a checkpoint on the Btree passed as the first argument.
10600 **
10601 ** Return SQLITE_LOCKED if this or any other connection has an open
10602 ** transaction on the shared-cache the argument Btree is connected to.
10603 **
10604 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
10605 */
10606 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
10607   int rc = SQLITE_OK;
10608   if( p ){
10609     BtShared *pBt = p->pBt;
10610     sqlite3BtreeEnter(p);
10611     if( pBt->inTransaction!=TRANS_NONE ){
10612       rc = SQLITE_LOCKED;
10613     }else{
10614       rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
10615     }
10616     sqlite3BtreeLeave(p);
10617   }
10618   return rc;
10619 }
10620 #endif
10621 
10622 /*
10623 ** Return true if there is currently a backup running on Btree p.
10624 */
10625 int sqlite3BtreeIsInBackup(Btree *p){
10626   assert( p );
10627   assert( sqlite3_mutex_held(p->db->mutex) );
10628   return p->nBackup!=0;
10629 }
10630 
10631 /*
10632 ** This function returns a pointer to a blob of memory associated with
10633 ** a single shared-btree. The memory is used by client code for its own
10634 ** purposes (for example, to store a high-level schema associated with
10635 ** the shared-btree). The btree layer manages reference counting issues.
10636 **
10637 ** The first time this is called on a shared-btree, nBytes bytes of memory
10638 ** are allocated, zeroed, and returned to the caller. For each subsequent
10639 ** call the nBytes parameter is ignored and a pointer to the same blob
10640 ** of memory returned.
10641 **
10642 ** If the nBytes parameter is 0 and the blob of memory has not yet been
10643 ** allocated, a null pointer is returned. If the blob has already been
10644 ** allocated, it is returned as normal.
10645 **
10646 ** Just before the shared-btree is closed, the function passed as the
10647 ** xFree argument when the memory allocation was made is invoked on the
10648 ** blob of allocated memory. The xFree function should not call sqlite3_free()
10649 ** on the memory, the btree layer does that.
10650 */
10651 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
10652   BtShared *pBt = p->pBt;
10653   sqlite3BtreeEnter(p);
10654   if( !pBt->pSchema && nBytes ){
10655     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
10656     pBt->xFreeSchema = xFree;
10657   }
10658   sqlite3BtreeLeave(p);
10659   return pBt->pSchema;
10660 }
10661 
10662 /*
10663 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
10664 ** btree as the argument handle holds an exclusive lock on the
10665 ** sqlite_schema table. Otherwise SQLITE_OK.
10666 */
10667 int sqlite3BtreeSchemaLocked(Btree *p){
10668   int rc;
10669   assert( sqlite3_mutex_held(p->db->mutex) );
10670   sqlite3BtreeEnter(p);
10671   rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK);
10672   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
10673   sqlite3BtreeLeave(p);
10674   return rc;
10675 }
10676 
10677 
10678 #ifndef SQLITE_OMIT_SHARED_CACHE
10679 /*
10680 ** Obtain a lock on the table whose root page is iTab.  The
10681 ** lock is a write lock if isWriteLock is true or a read lock
10682 ** if it is false.
10683 */
10684 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
10685   int rc = SQLITE_OK;
10686   assert( p->inTrans!=TRANS_NONE );
10687   if( p->sharable ){
10688     u8 lockType = READ_LOCK + isWriteLock;
10689     assert( READ_LOCK+1==WRITE_LOCK );
10690     assert( isWriteLock==0 || isWriteLock==1 );
10691 
10692     sqlite3BtreeEnter(p);
10693     rc = querySharedCacheTableLock(p, iTab, lockType);
10694     if( rc==SQLITE_OK ){
10695       rc = setSharedCacheTableLock(p, iTab, lockType);
10696     }
10697     sqlite3BtreeLeave(p);
10698   }
10699   return rc;
10700 }
10701 #endif
10702 
10703 #ifndef SQLITE_OMIT_INCRBLOB
10704 /*
10705 ** Argument pCsr must be a cursor opened for writing on an
10706 ** INTKEY table currently pointing at a valid table entry.
10707 ** This function modifies the data stored as part of that entry.
10708 **
10709 ** Only the data content may be modified; it is not possible to
10710 ** change the length of the data stored. If this function is called with
10711 ** parameters that attempt to write past the end of the existing data,
10712 ** no modifications are made and SQLITE_CORRUPT is returned.
10713 */
10714 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
10715   int rc;
10716   assert( cursorOwnsBtShared(pCsr) );
10717   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
10718   assert( pCsr->curFlags & BTCF_Incrblob );
10719 
10720   rc = restoreCursorPosition(pCsr);
10721   if( rc!=SQLITE_OK ){
10722     return rc;
10723   }
10724   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
10725   if( pCsr->eState!=CURSOR_VALID ){
10726     return SQLITE_ABORT;
10727   }
10728 
10729   /* Save the positions of all other cursors open on this table. This is
10730   ** required in case any of them are holding references to an xFetch
10731   ** version of the b-tree page modified by the accessPayload call below.
10732   **
10733   ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
10734   ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
10735   ** saveAllCursors can only return SQLITE_OK.
10736   */
10737   VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
10738   assert( rc==SQLITE_OK );
10739 
10740   /* Check some assumptions:
10741   **   (a) the cursor is open for writing,
10742   **   (b) there is a read/write transaction open,
10743   **   (c) the connection holds a write-lock on the table (if required),
10744   **   (d) there are no conflicting read-locks, and
10745   **   (e) the cursor points at a valid row of an intKey table.
10746   */
10747   if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
10748     return SQLITE_READONLY;
10749   }
10750   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
10751               && pCsr->pBt->inTransaction==TRANS_WRITE );
10752   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
10753   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
10754   assert( pCsr->pPage->intKey );
10755 
10756   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
10757 }
10758 
10759 /*
10760 ** Mark this cursor as an incremental blob cursor.
10761 */
10762 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
10763   pCur->curFlags |= BTCF_Incrblob;
10764   pCur->pBtree->hasIncrblobCur = 1;
10765 }
10766 #endif
10767 
10768 /*
10769 ** Set both the "read version" (single byte at byte offset 18) and
10770 ** "write version" (single byte at byte offset 19) fields in the database
10771 ** header to iVersion.
10772 */
10773 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
10774   BtShared *pBt = pBtree->pBt;
10775   int rc;                         /* Return code */
10776 
10777   assert( iVersion==1 || iVersion==2 );
10778 
10779   /* If setting the version fields to 1, do not automatically open the
10780   ** WAL connection, even if the version fields are currently set to 2.
10781   */
10782   pBt->btsFlags &= ~BTS_NO_WAL;
10783   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
10784 
10785   rc = sqlite3BtreeBeginTrans(pBtree, 0, 0);
10786   if( rc==SQLITE_OK ){
10787     u8 *aData = pBt->pPage1->aData;
10788     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
10789       rc = sqlite3BtreeBeginTrans(pBtree, 2, 0);
10790       if( rc==SQLITE_OK ){
10791         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
10792         if( rc==SQLITE_OK ){
10793           aData[18] = (u8)iVersion;
10794           aData[19] = (u8)iVersion;
10795         }
10796       }
10797     }
10798   }
10799 
10800   pBt->btsFlags &= ~BTS_NO_WAL;
10801   return rc;
10802 }
10803 
10804 /*
10805 ** Return true if the cursor has a hint specified.  This routine is
10806 ** only used from within assert() statements.
10807 */
10808 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
10809   return (pCsr->hints & mask)!=0;
10810 }
10811 
10812 /*
10813 ** Return true if the given Btree is read-only.
10814 */
10815 int sqlite3BtreeIsReadonly(Btree *p){
10816   return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
10817 }
10818 
10819 /*
10820 ** Return the size of the header added to each page by this module.
10821 */
10822 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
10823 
10824 #if !defined(SQLITE_OMIT_SHARED_CACHE)
10825 /*
10826 ** Return true if the Btree passed as the only argument is sharable.
10827 */
10828 int sqlite3BtreeSharable(Btree *p){
10829   return p->sharable;
10830 }
10831 
10832 /*
10833 ** Return the number of connections to the BtShared object accessed by
10834 ** the Btree handle passed as the only argument. For private caches
10835 ** this is always 1. For shared caches it may be 1 or greater.
10836 */
10837 int sqlite3BtreeConnectionCount(Btree *p){
10838   testcase( p->sharable );
10839   return p->pBt->nRef;
10840 }
10841 #endif
10842