xref: /sqlite-3.40.0/src/btree.c (revision f71a243a)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements an external (disk-based) database using BTrees.
13 ** See the header comment on "btreeInt.h" for additional information.
14 ** Including a description of file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
18 /*
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
21 */
22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
/*
** Debug tracing support.
**
** Change the "#if 0" below to "#if 1" to compile in the global variable
** sqlite3BtreeTrace together with a TRACE() macro that printf()s its
** (parenthesized) argument list and flushes stdout whenever that variable
** is non-zero.  In the default configuration TRACE() expands to nothing.
*/
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
#endif
34 
/*
** Read a 2-byte big-endian integer from an array of unsigned bytes using
** get2byte(), but report a stored value of zero as 65536.  Every other
** value is returned unchanged.
**
** The btree page header keeps the "offset to cell content area" in two
** bytes.  On an empty page of a database whose page size is 65536 the
** true offset is 65536, which does not fit and is therefore stored as 0.
** Subtracting one, masking to 16 bits and adding one back maps 0 to 65536
** and leaves 1..65535 alone.
*/
#define get2byteNotZero(X)  (1+((((int)get2byte(X))-1)&0xffff))
45 
/*
** Allocation modes.  One of these values is passed as the 5th argument
** to allocateBtreePage().
*/
#define BTALLOC_ANY   0           /* Any page will do */
#define BTALLOC_EXACT 1           /* The exact page requested, if possible */
#define BTALLOC_LE    2           /* Any page <= the parameter */
52 
/*
** IfNotOmitAV(x) evaluates to (x) in builds that include auto-vacuum and
** to the constant 0 when SQLITE_OMIT_AUTOVACUUM is defined.  Example:
**
**   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
*/
#ifdef SQLITE_OMIT_AUTOVACUUM
#define IfNotOmitAV(expr) 0
#else
#define IfNotOmitAV(expr) (expr)
#endif
64 
65 #ifndef SQLITE_OMIT_SHARED_CACHE
66 /*
67 ** A list of BtShared objects that are eligible for participation
68 ** in shared cache.  This variable has file scope during normal builds,
69 ** but the test harness needs to access it so we make it global for
70 ** test builds.
71 **
72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
73 */
74 #ifdef SQLITE_TEST
75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
76 #else
77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
78 #endif
79 #endif /* SQLITE_OMIT_SHARED_CACHE */
80 
81 #ifndef SQLITE_OMIT_SHARED_CACHE
82 /*
83 ** Enable or disable the shared pager and schema features.
84 **
85 ** This routine has no effect on existing database connections.
86 ** The shared cache setting effects only future calls to
87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
88 */
89 int sqlite3_enable_shared_cache(int enable){
90   sqlite3GlobalConfig.sharedCacheEnabled = enable;
91   return SQLITE_OK;
92 }
93 #endif
94 
95 
96 
#ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** querySharedCacheTableLock(), setSharedCacheTableLock(),
  ** clearAllSharedCacheTableLocks() and downgradeAllSharedCacheTableLocks()
  ** maintain the BtShared.pLock linked list of shared-cache table-level
  ** locks; hasSharedCacheTableLock() and hasReadConflicts() are checks
  ** meant for use inside assert().  When the library is compiled with the
  ** shared-cache feature disabled there is only ever one user of each
  ** BtShared structure, so no such locking is necessary.  All six names
  ** are therefore defined as macros that either expand to nothing or
  ** evaluate to a constant "success" answer.
  */
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
#endif
114 
/*
** Implementation of the SQLITE_CORRUPT_PAGE() macro.  The macro takes a
** single (MemPage*) argument, which must not be NULL, and evaluates to an
** error code.
**
** If SQLITE_DEBUG is not defined, the macro expands to
** SQLITE_CORRUPT_PGNO() applied to the page number of the MemPage.
**
** If SQLITE_DEBUG is defined, the macro calls corruptPageError() with the
** current source line.  That routine:
**
**   * formats a message containing the page number and the filename
**     returned by sqlite3PagerFilename() for the page's pager.  The
**     allocation is bracketed by sqlite3BeginBenignMalloc() and
**     sqlite3EndBenignMalloc(); if it fails the message is simply skipped,
**
**   * passes the message to sqlite3ReportError() together with
**     SQLITE_CORRUPT and the line number, and
**
**   * returns SQLITE_CORRUPT_BKPT.
**
** The macro argument is parenthesized in both expansions so that any
** pointer-valued expression (for example "&page") may be passed.
*/
#ifdef SQLITE_DEBUG
int corruptPageError(int lineno, MemPage *p){
  char *zMsg;
  sqlite3BeginBenignMalloc();
  zMsg = sqlite3_mprintf("database corruption page %d of %s",
      (int)p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0)
  );
  sqlite3EndBenignMalloc();
  if( zMsg ){
    sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg);
  }
  sqlite3_free(zMsg);   /* Harmless no-op when zMsg is NULL */
  return SQLITE_CORRUPT_BKPT;
}
# define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, (pMemPage))
#else
# define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO((pMemPage)->pgno)
#endif
142 
143 #ifndef SQLITE_OMIT_SHARED_CACHE
144 
#ifdef SQLITE_DEBUG
/*
**** For use inside assert() statements only. ***
**
** Return 1 if pBtree holds the locks needed to read (eLockType==READ_LOCK)
** or write (eLockType==WRITE_LOCK) the b-tree rooted at page iRoot, and 0
** if it does not.  For example, when writing to the table with root-page
** iRoot via pBtree:
**
**    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
**
** Locks are recorded against tables.  A caller that touches an index in a
** sharable database is expected to hold a lock on the root page of the
** table the index belongs to, so when isIndex is true the schema is
** searched for the index whose root page is iRoot and the check is made
** against the root page of its table.
**
** A write-lock on the schema table (root page 1) is accepted in place of
** a lock on the table itself.
**
** The routine answers 1 without looking at the lock list when:
**
**   * pBtree is not sharable,
**   * a read-lock is wanted and the connection has SQLITE_ReadUncommit set,
**   * an index is named but the schema is absent or not yet loaded, or
**   * two or more indexes claim root page iRoot (imposter tables).
*/
static int hasSharedCacheTableLock(
  Btree *pBtree,         /* Handle that must hold lock */
  Pgno iRoot,            /* Root page of b-tree */
  int isIndex,           /* True if iRoot is the root of an index b-tree */
  int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
){
  Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
  Pgno iTab = 0;         /* Root page on which the lock must be held */
  BtLock *pLock;

  /* No lock is required on a database that is not shared */
  if( pBtree->sharable==0 ) return 1;

  /* Nor for a reader running in read-uncommitted mode */
  if( eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit) ){
    return 1;
  }

  if( isIndex ){
    HashElem *pElem;

    /* Without a loaded schema the owning table cannot be found.  This
    ** case is uncommon, so simply report success. */
    if( pSchema==0 || (pSchema->schemaFlags&DB_SchemaLoaded)==0 ) return 1;

    /* Locate the table that owns the index rooted at iRoot */
    pElem = sqliteHashFirst(&pSchema->idxHash);
    while( pElem ){
      Index *pIdx = (Index *)sqliteHashData(pElem);
      if( pIdx->tnum==(int)iRoot ){
        /* A second match means imposter tables are present; the check
        ** is not useful then. */
        if( iTab ) return 1;
        iTab = pIdx->pTable->tnum;
      }
      pElem = sqliteHashNext(pElem);
    }
  }else{
    /* For a table b-tree the lock is held on the b-tree's own root */
    iTab = iRoot;
  }

  /* Accept any lock owned by pBtree that is at least as strong as
  ** eLockType and that is either on iTab or is a write-lock on the
  ** schema table. */
  for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
    if( pLock->pBtree!=pBtree ) continue;
    if( pLock->eLock<eLockType ) continue;
    if( pLock->iTable==iTab
     || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)
    ){
      return 1;
    }
  }

  /* The required lock is not held */
  return 0;
}
#endif /* SQLITE_DEBUG */
235 
#ifdef SQLITE_DEBUG
/*
**** For use inside assert() statements only. ****
**
** Return 1 if pBtree must not write to the table or index rooted at iRoot
** because another connection sharing the same BtShared has a cursor open
** on it, and 0 otherwise.
**
** Every cursor on the BtShared is examined.  A cursor is a conflict when
** its root page is iRoot, it belongs to a Btree other than pBtree, and the
** connection that owns it does not have the read-uncommitted flag set.
**
** For example, before writing to any part of the table or index rooted
** at page iRoot, one should call:
**
**    assert( !hasReadConflicts(pBtree, iRoot) );
*/
static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
  BtCursor *pCsr = pBtree->pBt->pCursor;
  while( pCsr ){
    if( pCsr->pgnoRoot==iRoot && pCsr->pBtree!=pBtree ){
      if( (pCsr->pBtree->db->flags & SQLITE_ReadUncommit)==0 ){
        return 1;
      }
    }
    pCsr = pCsr->pNext;
  }
  return 0;
}
#endif    /* #ifdef SQLITE_DEBUG */
268 
269 /*
270 ** Query to see if Btree handle p may obtain a lock of type eLock
271 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
272 ** SQLITE_OK if the lock may be obtained (by calling
273 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
274 */
275 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
276   BtShared *pBt = p->pBt;
277   BtLock *pIter;
278 
279   assert( sqlite3BtreeHoldsMutex(p) );
280   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
281   assert( p->db!=0 );
282   assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 );
283 
284   /* If requesting a write-lock, then the Btree must have an open write
285   ** transaction on this file. And, obviously, for this to be so there
286   ** must be an open write transaction on the file itself.
287   */
288   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
289   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
290 
291   /* This routine is a no-op if the shared-cache is not enabled */
292   if( !p->sharable ){
293     return SQLITE_OK;
294   }
295 
296   /* If some other connection is holding an exclusive lock, the
297   ** requested lock may not be obtained.
298   */
299   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
300     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
301     return SQLITE_LOCKED_SHAREDCACHE;
302   }
303 
304   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
305     /* The condition (pIter->eLock!=eLock) in the following if(...)
306     ** statement is a simplification of:
307     **
308     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
309     **
310     ** since we know that if eLock==WRITE_LOCK, then no other connection
311     ** may hold a WRITE_LOCK on any table in this file (since there can
312     ** only be a single writer).
313     */
314     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
315     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
316     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
317       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
318       if( eLock==WRITE_LOCK ){
319         assert( p==pBt->pWriter );
320         pBt->btsFlags |= BTS_PENDING;
321       }
322       return SQLITE_LOCKED_SHAREDCACHE;
323     }
324   }
325   return SQLITE_OK;
326 }
327 #endif /* !SQLITE_OMIT_SHARED_CACHE */
328 
329 #ifndef SQLITE_OMIT_SHARED_CACHE
330 /*
331 ** Add a lock on the table with root-page iTable to the shared-btree used
332 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
333 ** WRITE_LOCK.
334 **
335 ** This function assumes the following:
336 **
337 **   (a) The specified Btree object p is connected to a sharable
338 **       database (one with the BtShared.sharable flag set), and
339 **
340 **   (b) No other Btree objects hold a lock that conflicts
341 **       with the requested lock (i.e. querySharedCacheTableLock() has
342 **       already been called and returned SQLITE_OK).
343 **
344 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
345 ** is returned if a malloc attempt fails.
346 */
347 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
348   BtShared *pBt = p->pBt;
349   BtLock *pLock = 0;
350   BtLock *pIter;
351 
352   assert( sqlite3BtreeHoldsMutex(p) );
353   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
354   assert( p->db!=0 );
355 
356   /* A connection with the read-uncommitted flag set will never try to
357   ** obtain a read-lock using this function. The only read-lock obtained
358   ** by a connection in read-uncommitted mode is on the sqlite_master
359   ** table, and that lock is obtained in BtreeBeginTrans().  */
360   assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK );
361 
362   /* This function should only be called on a sharable b-tree after it
363   ** has been determined that no other b-tree holds a conflicting lock.  */
364   assert( p->sharable );
365   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
366 
367   /* First search the list for an existing lock on this table. */
368   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
369     if( pIter->iTable==iTable && pIter->pBtree==p ){
370       pLock = pIter;
371       break;
372     }
373   }
374 
375   /* If the above search did not find a BtLock struct associating Btree p
376   ** with table iTable, allocate one and link it into the list.
377   */
378   if( !pLock ){
379     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
380     if( !pLock ){
381       return SQLITE_NOMEM_BKPT;
382     }
383     pLock->iTable = iTable;
384     pLock->pBtree = p;
385     pLock->pNext = pBt->pLock;
386     pBt->pLock = pLock;
387   }
388 
389   /* Set the BtLock.eLock variable to the maximum of the current lock
390   ** and the requested lock. This means if a write-lock was already held
391   ** and a read-lock requested, we don't incorrectly downgrade the lock.
392   */
393   assert( WRITE_LOCK>READ_LOCK );
394   if( eLock>pLock->eLock ){
395     pLock->eLock = eLock;
396   }
397 
398   return SQLITE_OK;
399 }
400 #endif /* !SQLITE_OMIT_SHARED_CACHE */
401 
402 #ifndef SQLITE_OMIT_SHARED_CACHE
403 /*
404 ** Release all the table locks (locks obtained via calls to
405 ** the setSharedCacheTableLock() procedure) held by Btree object p.
406 **
407 ** This function assumes that Btree p has an open read or write
408 ** transaction. If it does not, then the BTS_PENDING flag
409 ** may be incorrectly cleared.
410 */
411 static void clearAllSharedCacheTableLocks(Btree *p){
412   BtShared *pBt = p->pBt;
413   BtLock **ppIter = &pBt->pLock;
414 
415   assert( sqlite3BtreeHoldsMutex(p) );
416   assert( p->sharable || 0==*ppIter );
417   assert( p->inTrans>0 );
418 
419   while( *ppIter ){
420     BtLock *pLock = *ppIter;
421     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
422     assert( pLock->pBtree->inTrans>=pLock->eLock );
423     if( pLock->pBtree==p ){
424       *ppIter = pLock->pNext;
425       assert( pLock->iTable!=1 || pLock==&p->lock );
426       if( pLock->iTable!=1 ){
427         sqlite3_free(pLock);
428       }
429     }else{
430       ppIter = &pLock->pNext;
431     }
432   }
433 
434   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
435   if( pBt->pWriter==p ){
436     pBt->pWriter = 0;
437     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
438   }else if( pBt->nTransaction==2 ){
439     /* This function is called when Btree p is concluding its
440     ** transaction. If there currently exists a writer, and p is not
441     ** that writer, then the number of locks held by connections other
442     ** than the writer must be about to drop to zero. In this case
443     ** set the BTS_PENDING flag to 0.
444     **
445     ** If there is not currently a writer, then BTS_PENDING must
446     ** be zero already. So this next line is harmless in that case.
447     */
448     pBt->btsFlags &= ~BTS_PENDING;
449   }
450 }
451 
452 /*
453 ** This function changes all write-locks held by Btree p into read-locks.
454 */
455 static void downgradeAllSharedCacheTableLocks(Btree *p){
456   BtShared *pBt = p->pBt;
457   if( pBt->pWriter==p ){
458     BtLock *pLock;
459     pBt->pWriter = 0;
460     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
461     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
462       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
463       pLock->eLock = READ_LOCK;
464     }
465   }
466 }
467 
468 #endif /* SQLITE_OMIT_SHARED_CACHE */
469 
470 static void releasePage(MemPage *pPage);         /* Forward reference */
471 static void releasePageOne(MemPage *pPage);      /* Forward reference */
472 static void releasePageNotNull(MemPage *pPage);  /* Forward reference */
473 
#ifdef SQLITE_DEBUG
/*
***** This routine is used inside of assert() only ****
**
** Verify that the cursor holds the mutex on its BtShared.  The value
** returned is that of sqlite3_mutex_held() for the BtShared mutex.
*/
static int cursorHoldsMutex(BtCursor *p){
  BtShared *pBt = p->pBt;
  return sqlite3_mutex_held(pBt->mutex);
}

/*
***** This routine is used inside of assert() only ****
**
** Verify that the cursor and the BtShared agree about what is the current
** database connection, i.e. that p->pBtree->db and p->pBt->db are the
** same pointer.  This is important in shared-cache mode.  If the database
** connection pointers get out of sync, it is possible for routines like
** btreeInitPage() to reference a stale connection pointer that refers to
** a connection that has already closed.  The check exists to confirm that
** the btree code keeps the connection pointers up to date.
*/
static int cursorOwnsBtShared(BtCursor *p){
  assert( cursorHoldsMutex(p) );
  return (p->pBt->db==p->pBtree->db);
}
#endif
497 
/*
** Invalidate the overflow page-list cache of the cursor passed as the
** only argument by clearing its BTCF_ValidOvfl flag.
**
** The argument is parenthesized in the expansion so that any expression
** yielding a cursor pointer (for example "&cursor") may be passed.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
503 
504 /*
505 ** Invalidate the overflow page-list cache for all cursors opened
506 ** on the shared btree structure pBt.
507 */
508 static void invalidateAllOverflowCache(BtShared *pBt){
509   BtCursor *p;
510   assert( sqlite3_mutex_held(pBt->mutex) );
511   for(p=pBt->pCursor; p; p=p->pNext){
512     invalidateOverflowCache(p);
513   }
514 }
515 
516 #ifndef SQLITE_OMIT_INCRBLOB
517 /*
518 ** This function is called before modifying the contents of a table
519 ** to invalidate any incrblob cursors that are open on the
520 ** row or one of the rows being modified.
521 **
522 ** If argument isClearTable is true, then the entire contents of the
523 ** table is about to be deleted. In this case invalidate all incrblob
524 ** cursors open on any row within the table with root-page pgnoRoot.
525 **
526 ** Otherwise, if argument isClearTable is false, then the row with
527 ** rowid iRow is being replaced or deleted. In this case invalidate
528 ** only those incrblob cursors open on that specific row.
529 */
530 static void invalidateIncrblobCursors(
531   Btree *pBtree,          /* The database file to check */
532   Pgno pgnoRoot,          /* The table that might be changing */
533   i64 iRow,               /* The rowid that might be changing */
534   int isClearTable        /* True if all rows are being deleted */
535 ){
536   BtCursor *p;
537   if( pBtree->hasIncrblobCur==0 ) return;
538   assert( sqlite3BtreeHoldsMutex(pBtree) );
539   pBtree->hasIncrblobCur = 0;
540   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
541     if( (p->curFlags & BTCF_Incrblob)!=0 ){
542       pBtree->hasIncrblobCur = 1;
543       if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){
544         p->eState = CURSOR_INVALID;
545       }
546     }
547   }
548 }
549 
550 #else
551   /* Stub function when INCRBLOB is omitted */
552   #define invalidateIncrblobCursors(w,x,y,z)
553 #endif /* SQLITE_OMIT_INCRBLOB */
554 
555 /*
556 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
557 ** when a page that previously contained data becomes a free-list leaf
558 ** page.
559 **
560 ** The BtShared.pHasContent bitvec exists to work around an obscure
561 ** bug caused by the interaction of two useful IO optimizations surrounding
562 ** free-list leaf pages:
563 **
564 **   1) When all data is deleted from a page and the page becomes
565 **      a free-list leaf page, the page is not written to the database
566 **      (as free-list leaf pages contain no meaningful data). Sometimes
567 **      such a page is not even journalled (as it will not be modified,
568 **      why bother journalling it?).
569 **
570 **   2) When a free-list leaf page is reused, its content is not read
571 **      from the database or written to the journal file (why should it
572 **      be, if it is not at all meaningful?).
573 **
574 ** By themselves, these optimizations work fine and provide a handy
575 ** performance boost to bulk delete or insert operations. However, if
576 ** a page is moved to the free-list and then reused within the same
577 ** transaction, a problem comes up. If the page is not journalled when
578 ** it is moved to the free-list and it is also not journalled when it
579 ** is extracted from the free-list and reused, then the original data
580 ** may be lost. In the event of a rollback, it may not be possible
581 ** to restore the database to its original configuration.
582 **
583 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
584 ** moved to become a free-list leaf page, the corresponding bit is
585 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
586 ** optimization 2 above is omitted if the corresponding bit is already
587 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
588 ** at the end of every transaction.
589 */
590 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
591   int rc = SQLITE_OK;
592   if( !pBt->pHasContent ){
593     assert( pgno<=pBt->nPage );
594     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
595     if( !pBt->pHasContent ){
596       rc = SQLITE_NOMEM_BKPT;
597     }
598   }
599   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
600     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
601   }
602   return rc;
603 }
604 
605 /*
606 ** Query the BtShared.pHasContent vector.
607 **
608 ** This function is called when a free-list leaf page is removed from the
609 ** free-list for reuse. It returns false if it is safe to retrieve the
610 ** page from the pager layer with the 'no-content' flag set. True otherwise.
611 */
612 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
613   Bitvec *p = pBt->pHasContent;
614   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
615 }
616 
617 /*
618 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
619 ** invoked at the conclusion of each write-transaction.
620 */
621 static void btreeClearHasContent(BtShared *pBt){
622   sqlite3BitvecDestroy(pBt->pHasContent);
623   pBt->pHasContent = 0;
624 }
625 
626 /*
627 ** Release all of the apPage[] pages for a cursor.
628 */
629 static void btreeReleaseAllCursorPages(BtCursor *pCur){
630   int i;
631   if( pCur->iPage>=0 ){
632     for(i=0; i<pCur->iPage; i++){
633       releasePageNotNull(pCur->apPage[i]);
634     }
635     releasePageNotNull(pCur->pPage);
636     pCur->iPage = -1;
637   }
638 }
639 
640 /*
641 ** The cursor passed as the only argument must point to a valid entry
642 ** when this function is called (i.e. have eState==CURSOR_VALID). This
643 ** function saves the current cursor key in variables pCur->nKey and
644 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error
645 ** code otherwise.
646 **
647 ** If the cursor is open on an intkey table, then the integer key
648 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
649 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is
650 ** set to point to a malloced buffer pCur->nKey bytes in size containing
651 ** the key.
652 */
653 static int saveCursorKey(BtCursor *pCur){
654   int rc = SQLITE_OK;
655   assert( CURSOR_VALID==pCur->eState );
656   assert( 0==pCur->pKey );
657   assert( cursorHoldsMutex(pCur) );
658 
659   if( pCur->curIntKey ){
660     /* Only the rowid is required for a table btree */
661     pCur->nKey = sqlite3BtreeIntegerKey(pCur);
662   }else{
663     /* For an index btree, save the complete key content. It is possible
664     ** that the current key is corrupt. In that case, it is possible that
665     ** the sqlite3VdbeRecordUnpack() function may overread the buffer by
666     ** up to the size of 1 varint plus 1 8-byte value when the cursor
667     ** position is restored. Hence the 17 bytes of padding allocated
668     ** below. */
669     void *pKey;
670     pCur->nKey = sqlite3BtreePayloadSize(pCur);
671     pKey = sqlite3Malloc( pCur->nKey + 9 + 8 );
672     if( pKey ){
673       rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
674       if( rc==SQLITE_OK ){
675         memset(((u8*)pKey)+pCur->nKey, 0, 9+8);
676         pCur->pKey = pKey;
677       }else{
678         sqlite3_free(pKey);
679       }
680     }else{
681       rc = SQLITE_NOMEM_BKPT;
682     }
683   }
684   assert( !pCur->curIntKey || !pCur->pKey );
685   return rc;
686 }
687 
688 /*
689 ** Save the current cursor position in the variables BtCursor.nKey
690 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
691 **
692 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
693 ** prior to calling this routine.
694 */
695 static int saveCursorPosition(BtCursor *pCur){
696   int rc;
697 
698   assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
699   assert( 0==pCur->pKey );
700   assert( cursorHoldsMutex(pCur) );
701 
702   if( pCur->eState==CURSOR_SKIPNEXT ){
703     pCur->eState = CURSOR_VALID;
704   }else{
705     pCur->skipNext = 0;
706   }
707 
708   rc = saveCursorKey(pCur);
709   if( rc==SQLITE_OK ){
710     btreeReleaseAllCursorPages(pCur);
711     pCur->eState = CURSOR_REQUIRESEEK;
712   }
713 
714   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
715   return rc;
716 }
717 
718 /* Forward reference */
719 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
720 
721 /*
722 ** Save the positions of all cursors (except pExcept) that are open on
723 ** the table with root-page iRoot.  "Saving the cursor position" means that
724 ** the location in the btree is remembered in such a way that it can be
725 ** moved back to the same spot after the btree has been modified.  This
726 ** routine is called just before cursor pExcept is used to modify the
727 ** table, for example in BtreeDelete() or BtreeInsert().
728 **
729 ** If there are two or more cursors on the same btree, then all such
730 ** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
731 ** routine enforces that rule.  This routine only needs to be called in
732 ** the uncommon case when pExpect has the BTCF_Multiple flag set.
733 **
734 ** If pExpect!=NULL and if no other cursors are found on the same root-page,
735 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another
736 ** pointless call to this routine.
737 **
738 ** Implementation note:  This routine merely checks to see if any cursors
739 ** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
740 ** event that cursors are in need to being saved.
741 */
742 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
743   BtCursor *p;
744   assert( sqlite3_mutex_held(pBt->mutex) );
745   assert( pExcept==0 || pExcept->pBt==pBt );
746   for(p=pBt->pCursor; p; p=p->pNext){
747     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
748   }
749   if( p ) return saveCursorsOnList(p, iRoot, pExcept);
750   if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
751   return SQLITE_OK;
752 }
753 
754 /* This helper routine to saveAllCursors does the actual work of saving
755 ** the cursors if and when a cursor is found that actually requires saving.
756 ** The common case is that no cursors need to be saved, so this routine is
757 ** broken out from its caller to avoid unnecessary stack pointer movement.
758 */
759 static int SQLITE_NOINLINE saveCursorsOnList(
760   BtCursor *p,         /* The first cursor that needs saving */
761   Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
762   BtCursor *pExcept    /* Do not save this cursor */
763 ){
764   do{
765     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
766       if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
767         int rc = saveCursorPosition(p);
768         if( SQLITE_OK!=rc ){
769           return rc;
770         }
771       }else{
772         testcase( p->iPage>=0 );
773         btreeReleaseAllCursorPages(p);
774       }
775     }
776     p = p->pNext;
777   }while( p );
778   return SQLITE_OK;
779 }
780 
781 /*
782 ** Clear the current cursor position.
783 */
784 void sqlite3BtreeClearCursor(BtCursor *pCur){
785   assert( cursorHoldsMutex(pCur) );
786   sqlite3_free(pCur->pKey);
787   pCur->pKey = 0;
788   pCur->eState = CURSOR_INVALID;
789 }
790 
791 /*
792 ** In this version of BtreeMoveto, pKey is a packed index record
793 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
794 ** record and then call BtreeMovetoUnpacked() to do the work.
795 */
796 static int btreeMoveto(
797   BtCursor *pCur,     /* Cursor open on the btree to be searched */
798   const void *pKey,   /* Packed key if the btree is an index */
799   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
800   int bias,           /* Bias search to the high end */
801   int *pRes           /* Write search results here */
802 ){
803   int rc;                    /* Status code */
804   UnpackedRecord *pIdxKey;   /* Unpacked index key */
805 
806   if( pKey ){
807     KeyInfo *pKeyInfo = pCur->pKeyInfo;
808     assert( nKey==(i64)(int)nKey );
809     pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo);
810     if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
811     sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey);
812     if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){
813       rc = SQLITE_CORRUPT_BKPT;
814       goto moveto_done;
815     }
816   }else{
817     pIdxKey = 0;
818   }
819   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
820 moveto_done:
821   if( pIdxKey ){
822     sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
823   }
824   return rc;
825 }
826 
827 /*
828 ** Restore the cursor to the position it was in (or as close to as possible)
829 ** when saveCursorPosition() was called. Note that this call deletes the
830 ** saved position info stored by saveCursorPosition(), so there can be
831 ** at most one effective restoreCursorPosition() call after each
832 ** saveCursorPosition().
833 */
834 static int btreeRestoreCursorPosition(BtCursor *pCur){
835   int rc;
836   int skipNext = 0;
837   assert( cursorOwnsBtShared(pCur) );
838   assert( pCur->eState>=CURSOR_REQUIRESEEK );
839   if( pCur->eState==CURSOR_FAULT ){
840     return pCur->skipNext;
841   }
842   pCur->eState = CURSOR_INVALID;
843   if( sqlite3FaultSim(410) ){
844     rc = SQLITE_IOERR;
845   }else{
846     rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
847   }
848   if( rc==SQLITE_OK ){
849     sqlite3_free(pCur->pKey);
850     pCur->pKey = 0;
851     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
852     if( skipNext ) pCur->skipNext = skipNext;
853     if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
854       pCur->eState = CURSOR_SKIPNEXT;
855     }
856   }
857   return rc;
858 }
859 
/*
** Restore the saved position of cursor p if it needs one.
**
** Evaluates to SQLITE_OK without doing any work when p->eState is below
** CURSOR_REQUIRESEEK; otherwise evaluates to the result of
** btreeRestoreCursorPosition(p).  The argument is parenthesized so that
** any pointer-valued expression may be passed.  Note that the argument
** is evaluated twice when a restore is required, so it must be free of
** side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
864 
865 /*
866 ** Determine whether or not a cursor has moved from the position where
867 ** it was last placed, or has been invalidated for any other reason.
868 ** Cursors can move when the row they are pointing at is deleted out
869 ** from under them, for example.  Cursor might also move if a btree
870 ** is rebalanced.
871 **
872 ** Calling this routine with a NULL cursor pointer returns false.
873 **
874 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
875 ** back to where it ought to be if this routine returns true.
876 */
877 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
878   assert( EIGHT_BYTE_ALIGNMENT(pCur)
879        || pCur==sqlite3BtreeFakeValidCursor() );
880   assert( offsetof(BtCursor, eState)==0 );
881   assert( sizeof(pCur->eState)==1 );
882   return CURSOR_VALID != *(u8*)pCur;
883 }
884 
885 /*
886 ** Return a pointer to a fake BtCursor object that will always answer
887 ** false to the sqlite3BtreeCursorHasMoved() routine above.  The fake
888 ** cursor returned must not be used with any other Btree interface.
889 */
890 BtCursor *sqlite3BtreeFakeValidCursor(void){
891   static u8 fakeCursor = CURSOR_VALID;
892   assert( offsetof(BtCursor, eState)==0 );
893   return (BtCursor*)&fakeCursor;
894 }
895 
896 /*
897 ** This routine restores a cursor back to its original position after it
898 ** has been moved by some outside activity (such as a btree rebalance or
899 ** a row having been deleted out from under the cursor).
900 **
901 ** On success, the *pDifferentRow parameter is false if the cursor is left
902 ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
903 ** was pointing to has been deleted, forcing the cursor to point to some
904 ** nearby row.
905 **
906 ** This routine should only be called for a cursor that just returned
907 ** TRUE from sqlite3BtreeCursorHasMoved().
908 */
909 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
910   int rc;
911 
912   assert( pCur!=0 );
913   assert( pCur->eState!=CURSOR_VALID );
914   rc = restoreCursorPosition(pCur);
915   if( rc ){
916     *pDifferentRow = 1;
917     return rc;
918   }
919   if( pCur->eState!=CURSOR_VALID ){
920     *pDifferentRow = 1;
921   }else{
922     *pDifferentRow = 0;
923   }
924   return SQLITE_OK;
925 }
926 
927 #ifdef SQLITE_ENABLE_CURSOR_HINTS
/*
** Provide hints to the cursor.  The particular hint given (and the type
** and number of the varargs parameters) is determined by the eHintType
** parameter.  See the definitions of the BTREE_HINT_* macros for details.
**
** This implementation is intentionally empty: every hint is accepted
** and ignored.  The routine is compiled only when
** SQLITE_ENABLE_CURSOR_HINTS is defined.
*/
void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
  /* Used only by systems that substitute their own storage engine */
}
936 #endif
937 
938 /*
939 ** Provide flag hints to the cursor.
940 */
941 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
942   assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
943   pCur->hints = x;
944 }
945 
946 
947 #ifndef SQLITE_OMIT_AUTOVACUUM
948 /*
949 ** Given a page number of a regular database page, return the page
950 ** number for the pointer-map page that contains the entry for the
951 ** input page number.
952 **
953 ** Return 0 (not a valid page) for pgno==1 since there is
954 ** no pointer map associated with page 1.  The integrity_check logic
955 ** requires that ptrmapPageno(*,1)!=1.
956 */
957 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
958   int nPagesPerMapPage;
959   Pgno iPtrMap, ret;
960   assert( sqlite3_mutex_held(pBt->mutex) );
961   if( pgno<2 ) return 0;
962   nPagesPerMapPage = (pBt->usableSize/5)+1;
963   iPtrMap = (pgno-2)/nPagesPerMapPage;
964   ret = (iPtrMap*nPagesPerMapPage) + 2;
965   if( ret==PENDING_BYTE_PAGE(pBt) ){
966     ret++;
967   }
968   return ret;
969 }
970 
971 /*
972 ** Write an entry into the pointer map.
973 **
974 ** This routine updates the pointer map entry for page number 'key'
975 ** so that it maps to type 'eType' and parent page number 'pgno'.
976 **
977 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
978 ** a no-op.  If an error occurs, the appropriate error code is written
979 ** into *pRC.
980 */
981 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
982   DbPage *pDbPage;  /* The pointer map page */
983   u8 *pPtrmap;      /* The pointer map data */
984   Pgno iPtrmap;     /* The pointer map page number */
985   int offset;       /* Offset in pointer map page */
986   int rc;           /* Return code from subfunctions */
987 
988   if( *pRC ) return;
989 
990   assert( sqlite3_mutex_held(pBt->mutex) );
991   /* The master-journal page number must never be used as a pointer map page */
992   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
993 
994   assert( pBt->autoVacuum );
995   if( key==0 ){
996     *pRC = SQLITE_CORRUPT_BKPT;
997     return;
998   }
999   iPtrmap = PTRMAP_PAGENO(pBt, key);
1000   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
1001   if( rc!=SQLITE_OK ){
1002     *pRC = rc;
1003     return;
1004   }
1005   if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){
1006     /* The first byte of the extra data is the MemPage.isInit byte.
1007     ** If that byte is set, it means this page is also being used
1008     ** as a btree page. */
1009     *pRC = SQLITE_CORRUPT_BKPT;
1010     goto ptrmap_exit;
1011   }
1012   offset = PTRMAP_PTROFFSET(iPtrmap, key);
1013   if( offset<0 ){
1014     *pRC = SQLITE_CORRUPT_BKPT;
1015     goto ptrmap_exit;
1016   }
1017   assert( offset <= (int)pBt->usableSize-5 );
1018   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1019 
1020   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
1021     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
1022     *pRC= rc = sqlite3PagerWrite(pDbPage);
1023     if( rc==SQLITE_OK ){
1024       pPtrmap[offset] = eType;
1025       put4byte(&pPtrmap[offset+1], parent);
1026     }
1027   }
1028 
1029 ptrmap_exit:
1030   sqlite3PagerUnref(pDbPage);
1031 }
1032 
1033 /*
1034 ** Read an entry from the pointer map.
1035 **
1036 ** This routine retrieves the pointer map entry for page 'key', writing
1037 ** the type and parent page number to *pEType and *pPgno respectively.
1038 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
1039 */
1040 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
1041   DbPage *pDbPage;   /* The pointer map page */
1042   int iPtrmap;       /* Pointer map page index */
1043   u8 *pPtrmap;       /* Pointer map page data */
1044   int offset;        /* Offset of entry in pointer map */
1045   int rc;
1046 
1047   assert( sqlite3_mutex_held(pBt->mutex) );
1048 
1049   iPtrmap = PTRMAP_PAGENO(pBt, key);
1050   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
1051   if( rc!=0 ){
1052     return rc;
1053   }
1054   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1055 
1056   offset = PTRMAP_PTROFFSET(iPtrmap, key);
1057   if( offset<0 ){
1058     sqlite3PagerUnref(pDbPage);
1059     return SQLITE_CORRUPT_BKPT;
1060   }
1061   assert( offset <= (int)pBt->usableSize-5 );
1062   assert( pEType!=0 );
1063   *pEType = pPtrmap[offset];
1064   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
1065 
1066   sqlite3PagerUnref(pDbPage);
1067   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap);
1068   return SQLITE_OK;
1069 }
1070 
1071 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* Autovacuum support is compiled out, so no pointer map is maintained.
  ** ptrmapPut() and ptrmapPutOvflPtr() expand to nothing, and ptrmapGet()
  ** expands to SQLITE_OK without evaluating any of its arguments. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, z, rc)
1075 #endif
1076 
/*
** Given a btree page and a cell index (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** findCellPastPtr() does the same except it skips past the initial
** 4-byte child pointer found on interior pages, if there is one.
**
** This routine works only for pages that do not contain overflow cells.
**
** Both macros fetch the two-byte entry at index I of the cell pointer
** array (P)->aCellIdx, AND it with (P)->maskPage, and add the result to
** a base pointer: (P)->aData for findCell() and (P)->aDataOfst for
** findCellPastPtr().  The index I is not range-checked here, and P is
** evaluated more than once.
** NOTE(review): aDataOfst is presumably aData advanced by the child
** pointer size - confirm against the MemPage definition.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
#define findCellPastPtr(P,I) \
  ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
1091 
1092 
1093 /*
1094 ** This is common tail processing for btreeParseCellPtr() and
1095 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
1096 ** on a single B-tree page.  Make necessary adjustments to the CellInfo
1097 ** structure.
1098 */
1099 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
1100   MemPage *pPage,         /* Page containing the cell */
1101   u8 *pCell,              /* Pointer to the cell text. */
1102   CellInfo *pInfo         /* Fill in this structure */
1103 ){
1104   /* If the payload will not fit completely on the local page, we have
1105   ** to decide how much to store locally and how much to spill onto
1106   ** overflow pages.  The strategy is to minimize the amount of unused
1107   ** space on overflow pages while keeping the amount of local storage
1108   ** in between minLocal and maxLocal.
1109   **
1110   ** Warning:  changing the way overflow payload is distributed in any
1111   ** way will result in an incompatible file format.
1112   */
1113   int minLocal;  /* Minimum amount of payload held locally */
1114   int maxLocal;  /* Maximum amount of payload held locally */
1115   int surplus;   /* Overflow payload available for local storage */
1116 
1117   minLocal = pPage->minLocal;
1118   maxLocal = pPage->maxLocal;
1119   surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
1120   testcase( surplus==maxLocal );
1121   testcase( surplus==maxLocal+1 );
1122   if( surplus <= maxLocal ){
1123     pInfo->nLocal = (u16)surplus;
1124   }else{
1125     pInfo->nLocal = (u16)minLocal;
1126   }
1127   pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
1128 }
1129 
1130 /*
1131 ** The following routines are implementations of the MemPage.xParseCell()
1132 ** method.
1133 **
1134 ** Parse a cell content block and fill in the CellInfo structure.
1135 **
1136 ** btreeParseCellPtr()        =>   table btree leaf nodes
1137 ** btreeParseCellNoPayload()  =>   table btree internal nodes
1138 ** btreeParseCellPtrIndex()   =>   index btree nodes
1139 **
1140 ** There is also a wrapper function btreeParseCell() that works for
1141 ** all MemPage types and that references the cell by index rather than
1142 ** by pointer.
1143 */
1144 static void btreeParseCellPtrNoPayload(
1145   MemPage *pPage,         /* Page containing the cell */
1146   u8 *pCell,              /* Pointer to the cell text. */
1147   CellInfo *pInfo         /* Fill in this structure */
1148 ){
1149   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1150   assert( pPage->leaf==0 );
1151   assert( pPage->childPtrSize==4 );
1152 #ifndef SQLITE_DEBUG
1153   UNUSED_PARAMETER(pPage);
1154 #endif
1155   pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
1156   pInfo->nPayload = 0;
1157   pInfo->nLocal = 0;
1158   pInfo->pPayload = 0;
1159   return;
1160 }
/*
** xParseCell() implementation used for table btree leaf pages.
**
** The cell at pCell is decoded as a payload-size varint, followed by an
** integer-key varint, followed by the payload itself.  On return the
** nKey, nPayload, pPayload, nLocal and nSize fields of *pInfo have all
** been filled in.  If the payload is larger than pPage->maxLocal the
** nLocal and nSize fields are computed by
** btreeParseCellAdjustSizeForOverflow().
*/
static void btreeParseCellPtr(
  MemPage *pPage,         /* Page containing the cell */
  u8 *pCell,              /* Pointer to the cell text. */
  CellInfo *pInfo         /* Fill in this structure */
){
  u8 *pIter;              /* For scanning through pCell */
  u32 nPayload;           /* Number of bytes of cell payload */
  u64 iKey;               /* Extracted Key value */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->leaf==0 || pPage->leaf==1 );
  assert( pPage->intKeyLeaf );
  assert( pPage->childPtrSize==0 );
  pIter = pCell;

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint32(pIter, nPayload);
  **
  ** The code is inlined to avoid a function call.
  */
  nPayload = *pIter;
  if( nPayload>=0x80 ){
    /* Multi-byte varint.  pEnd bounds the scan so that a malformed cell
    ** cannot make the loop run on indefinitely. */
    u8 *pEnd = &pIter[8];
    nPayload &= 0x7f;
    do{
      nPayload = (nPayload<<7) | (*++pIter & 0x7f);
    }while( (*pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
  **
  ** The code is inlined to avoid a function call.
  */
  iKey = *pIter;
  if( iKey>=0x80 ){
    u8 *pEnd = &pIter[7];
    iKey &= 0x7f;
    while(1){
      iKey = (iKey<<7) | (*++pIter & 0x7f);
      if( (*pIter)<0x80 ) break;
      if( pIter>=pEnd ){
        /* Eight bytes have been consumed and the high bit is still set.
        ** The ninth byte contributes all eight of its bits. */
        iKey = (iKey<<8) | *++pIter;
        break;
      }
    }
  }
  pIter++;

  /* Reinterpret the decoded 64 bits as a signed integer key */
  pInfo->nKey = *(i64*)&iKey;
  pInfo->nPayload = nPayload;
  pInfo->pPayload = pIter;
  testcase( nPayload==pPage->maxLocal );
  testcase( nPayload==pPage->maxLocal+1 );
  if( nPayload<=pPage->maxLocal ){
    /* This is the (easy) common case where the entire payload fits
    ** on the local page.  No overflow is required.
    */
    pInfo->nSize = nPayload + (u16)(pIter - pCell);
    /* A cell is never reported as being smaller than 4 bytes */
    if( pInfo->nSize<4 ) pInfo->nSize = 4;
    pInfo->nLocal = (u16)nPayload;
  }else{
    btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
  }
}
1229 static void btreeParseCellPtrIndex(
1230   MemPage *pPage,         /* Page containing the cell */
1231   u8 *pCell,              /* Pointer to the cell text. */
1232   CellInfo *pInfo         /* Fill in this structure */
1233 ){
1234   u8 *pIter;              /* For scanning through pCell */
1235   u32 nPayload;           /* Number of bytes of cell payload */
1236 
1237   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1238   assert( pPage->leaf==0 || pPage->leaf==1 );
1239   assert( pPage->intKeyLeaf==0 );
1240   pIter = pCell + pPage->childPtrSize;
1241   nPayload = *pIter;
1242   if( nPayload>=0x80 ){
1243     u8 *pEnd = &pIter[8];
1244     nPayload &= 0x7f;
1245     do{
1246       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1247     }while( *(pIter)>=0x80 && pIter<pEnd );
1248   }
1249   pIter++;
1250   pInfo->nKey = nPayload;
1251   pInfo->nPayload = nPayload;
1252   pInfo->pPayload = pIter;
1253   testcase( nPayload==pPage->maxLocal );
1254   testcase( nPayload==pPage->maxLocal+1 );
1255   if( nPayload<=pPage->maxLocal ){
1256     /* This is the (easy) common case where the entire payload fits
1257     ** on the local page.  No overflow is required.
1258     */
1259     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1260     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1261     pInfo->nLocal = (u16)nPayload;
1262   }else{
1263     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1264   }
1265 }
1266 static void btreeParseCell(
1267   MemPage *pPage,         /* Page containing the cell */
1268   int iCell,              /* The cell index.  First cell is 0 */
1269   CellInfo *pInfo         /* Fill in this structure */
1270 ){
1271   pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
1272 }
1273 
/*
** The following routines are implementations of the MemPage.xCellSize
** method.
**
** Compute the total number of bytes that a Cell needs in the cell
** data area of the btree-page.  The return number includes the cell
** data header and the local payload, but not any overflow page or
** the space used by the cell pointer.
**
** cellSizePtrNoPayload()    =>   table internal nodes
** cellSizePtr()             =>   all index nodes & table leaf nodes
**
** cellSizePtr() skips the child pointer (if any), decodes the payload
** size varint, skips the integer key varint on intKey pages, and then
** applies the same local/overflow split that the xParseCell() routines
** use, so that the result agrees with CellInfo.nSize.
*/
static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
  u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
  u8 *pEnd;                                /* End mark for a varint */
  u32 nSize;                               /* Size value to return */

#ifdef SQLITE_DEBUG
  /* The value returned by this function should always be the same as
  ** the (CellInfo.nSize) value found by doing a full parse of the
  ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
  ** this function verifies that this invariant is not violated. */
  CellInfo debuginfo;
  pPage->xParseCell(pPage, pCell, &debuginfo);
#endif

  /* Inlined decode of the payload-size varint.  Until the adjustment
  ** further down, nSize holds the payload size. */
  nSize = *pIter;
  if( nSize>=0x80 ){
    pEnd = &pIter[8];
    nSize &= 0x7f;
    do{
      nSize = (nSize<<7) | (*++pIter & 0x7f);
    }while( *(pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;
  if( pPage->intKey ){
    /* pIter now points at the 64-bit integer key value, a variable length
    ** integer. The following block moves pIter to point at the first byte
    ** past the end of the key value. */
    pEnd = &pIter[9];
    while( (*pIter++)&0x80 && pIter<pEnd );
  }
  testcase( nSize==pPage->maxLocal );
  testcase( nSize==pPage->maxLocal+1 );
  if( nSize<=pPage->maxLocal ){
    /* Entire payload is local: header bytes plus payload, minimum 4 */
    nSize += (u32)(pIter - pCell);
    if( nSize<4 ) nSize = 4;
  }else{
    /* Payload overflows.  Work out the local portion, then add the
    ** header bytes and the 4-byte overflow page number. */
    int minLocal = pPage->minLocal;
    nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
    testcase( nSize==pPage->maxLocal );
    testcase( nSize==pPage->maxLocal+1 );
    if( nSize>pPage->maxLocal ){
      nSize = minLocal;
    }
    nSize += 4 + (u16)(pIter - pCell);
  }
  assert( nSize==debuginfo.nSize || CORRUPT_DB );
  return (u16)nSize;
}
1334 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
1335   u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
1336   u8 *pEnd;              /* End mark for a varint */
1337 
1338 #ifdef SQLITE_DEBUG
1339   /* The value returned by this function should always be the same as
1340   ** the (CellInfo.nSize) value found by doing a full parse of the
1341   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1342   ** this function verifies that this invariant is not violated. */
1343   CellInfo debuginfo;
1344   pPage->xParseCell(pPage, pCell, &debuginfo);
1345 #else
1346   UNUSED_PARAMETER(pPage);
1347 #endif
1348 
1349   assert( pPage->childPtrSize==4 );
1350   pEnd = pIter + 9;
1351   while( (*pIter++)&0x80 && pIter<pEnd );
1352   assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
1353   return (u16)(pIter - pCell);
1354 }
1355 
1356 
1357 #ifdef SQLITE_DEBUG
1358 /* This variation on cellSizePtr() is used inside of assert() statements
1359 ** only. */
1360 static u16 cellSize(MemPage *pPage, int iCell){
1361   return pPage->xCellSize(pPage, findCell(pPage, iCell));
1362 }
1363 #endif
1364 
1365 #ifndef SQLITE_OMIT_AUTOVACUUM
1366 /*
1367 ** The cell pCell is currently part of page pSrc but will ultimately be part
1368 ** of pPage.  (pSrc and pPager are often the same.)  If pCell contains a
1369 ** pointer to an overflow page, insert an entry into the pointer-map for
1370 ** the overflow page that will be valid after pCell has been moved to pPage.
1371 */
1372 static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){
1373   CellInfo info;
1374   if( *pRC ) return;
1375   assert( pCell!=0 );
1376   pPage->xParseCell(pPage, pCell, &info);
1377   if( info.nLocal<info.nPayload ){
1378     Pgno ovfl;
1379     if( SQLITE_WITHIN(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){
1380       testcase( pSrc!=pPage );
1381       *pRC = SQLITE_CORRUPT_BKPT;
1382       return;
1383     }
1384     ovfl = get4byte(&pCell[info.nSize-4]);
1385     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1386   }
1387 }
1388 #endif
1389 
1390 
/*
** Defragment the page given. This routine reorganizes cells within the
** page so that there are no free-blocks on the free-block list.
**
** Parameter nMaxFrag is the maximum amount of fragmented space that may be
** present in the page after this routine returns.
**
** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
** b-tree page so that there are no freeblocks or fragment bytes, all
** unused bytes are contained in the unallocated space region, and all
** cells are packed tightly at the end of the page.
**
** Two strategies are used.  If the fragment count already satisfies
** nMaxFrag and the freeblock list holds at most two blocks, the cell
** content below the freeblocks is slid upward with memmove() and the
** affected cell pointers are adjusted.  Otherwise every cell is copied,
** in cell-pointer order, to the end of the page.
**
** Returns SQLITE_OK on success, or SQLITE_CORRUPT if any offset or size
** read from the page is inconsistent, or if the free space found after
** defragmenting does not match pPage->nFree.
*/
static int defragmentPage(MemPage *pPage, int nMaxFrag){
  int i;                     /* Loop counter */
  int pc;                    /* Address of the i-th cell */
  int hdr;                   /* Offset to the page header */
  int size;                  /* Size of a cell */
  int usableSize;            /* Number of usable bytes on a page */
  int cellOffset;            /* Offset to the cell pointer array */
  int cbrk;                  /* Offset to the cell content area */
  int nCell;                 /* Number of cells on the page */
  unsigned char *data;       /* The page data */
  unsigned char *temp;       /* Temp area for cell content */
  unsigned char *src;        /* Source of content */
  int iCellFirst;            /* First allowable cell index */
  int iCellLast;             /* Last possible cell index */

  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  temp = 0;
  src = data = pPage->aData;
  hdr = pPage->hdrOffset;
  cellOffset = pPage->cellOffset;
  nCell = pPage->nCell;
  assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB );
  iCellFirst = cellOffset + 2*nCell;
  usableSize = pPage->pBt->usableSize;

  /* This block handles pages with two or fewer free blocks and nMaxFrag
  ** or fewer fragmented bytes. In this case it is faster to move the
  ** two (or one) blocks of cells using memmove() and add the required
  ** offsets to each pointer in the cell-pointer array than it is to
  ** reconstruct the entire page.  */
  if( (int)data[hdr+7]<=nMaxFrag ){
    int iFree = get2byte(&data[hdr+1]);
    if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
    if( iFree ){
      int iFree2 = get2byte(&data[iFree]);
      if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
      /* Proceed only if there is no second freeblock, or the second
      ** freeblock is the last one on the list. */
      if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){
        u8 *pEnd = &data[cellOffset + nCell*2];
        u8 *pAddr;
        int sz2 = 0;
        int sz = get2byte(&data[iFree+2]);
        int top = get2byte(&data[hdr+5]);
        if( top>=iFree ){
          return SQLITE_CORRUPT_PAGE(pPage);
        }
        if( iFree2 ){
          if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage);
          sz2 = get2byte(&data[iFree2+2]);
          if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage);
          /* Slide the content lying between the two freeblocks upward so
          ** that it covers the second freeblock.  sz then becomes the
          ** combined size of both freeblocks. */
          memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz));
          sz += sz2;
        }else if( iFree+sz>usableSize ){
          return SQLITE_CORRUPT_PAGE(pPage);
        }

        /* Slide everything between the start of the content area and the
        ** first freeblock upward by sz bytes. */
        cbrk = top+sz;
        assert( cbrk+(iFree-top) <= usableSize );
        memmove(&data[cbrk], &data[top], iFree-top);
        /* Cells that were below the first freeblock moved by sz; cells
        ** that were between the two freeblocks moved by sz2. */
        for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){
          pc = get2byte(pAddr);
          if( pc<iFree ){ put2byte(pAddr, pc+sz); }
          else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); }
        }
        goto defragment_out;
      }
    }
  }

  /* General case: rebuild the content area by copying each cell, in
  ** cell-pointer order, to successively lower addresses starting from
  ** the end of the usable space. */
  cbrk = usableSize;
  iCellLast = usableSize - 4;
  for(i=0; i<nCell; i++){
    u8 *pAddr;     /* The i-th cell pointer */
    pAddr = &data[cellOffset + i*2];
    pc = get2byte(pAddr);
    testcase( pc==iCellFirst );
    testcase( pc==iCellLast );
    /* These conditions have already been verified in btreeInitPage()
    ** if PRAGMA cell_size_check=ON.
    */
    if( pc<iCellFirst || pc>iCellLast ){
      return SQLITE_CORRUPT_PAGE(pPage);
    }
    assert( pc>=iCellFirst && pc<=iCellLast );
    size = pPage->xCellSize(pPage, &src[pc]);
    cbrk -= size;
    if( cbrk<iCellFirst || pc+size>usableSize ){
      return SQLITE_CORRUPT_PAGE(pPage);
    }
    assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
    testcase( cbrk+size==usableSize );
    testcase( pc+size==usableSize );
    put2byte(pAddr, cbrk);
    if( temp==0 ){
      int x;
      /* While cells are already sitting at their final position nothing
      ** needs to be copied.  The first cell that has to move triggers a
      ** snapshot of the not-yet-processed content into the pager's temp
      ** space, which then serves as the copy source for the remaining
      ** cells. */
      if( cbrk==pc ) continue;
      temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
      x = get2byte(&data[hdr+5]);
      memcpy(&temp[x], &data[x], (cbrk+size) - x);
      src = temp;
    }
    memcpy(&data[cbrk], &src[pc], size);
  }
  /* A full rebuild leaves no fragmented bytes behind */
  data[hdr+7] = 0;

 defragment_out:
  assert( pPage->nFree>=0 );
  /* Fragment bytes plus the gap between the cell pointer array and the
  ** content area must account for exactly the free space recorded for
  ** the page. */
  if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  assert( cbrk>=iCellFirst );
  /* Record the new start of the content area, empty the freeblock list
  ** and zero the unallocated region. */
  put2byte(&data[hdr+5], cbrk);
  data[hdr+1] = 0;
  data[hdr+2] = 0;
  memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  return SQLITE_OK;
}
1524 
/*
** Search the free-list on page pPg for space to store a cell nByte bytes in
** size. If one can be found, return a pointer to the space and remove it
** from the free-list.
**
** If no suitable space can be found on the free-list, return NULL.
**
** This function may detect corruption within pPg.  If corruption is
** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
** *pRc is left untouched otherwise, so the caller must initialize it.
**
** Slots on the free list that are between 1 and 3 bytes larger than nByte
** will be ignored if adding the extra space to the fragmentation count
** causes the fragmentation count to exceed 60.
**
** The first slot that is large enough is used.  The allocation is taken
** from the tail end of that slot; when the slot is more than 3 bytes
** larger than nByte the front part stays on the free-list with a
** reduced size.
*/
static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
  const int hdr = pPg->hdrOffset;            /* Offset to page header */
  u8 * const aData = pPg->aData;             /* Page data */
  int iAddr = hdr + 1;                       /* Address of ptr to pc */
  int pc = get2byte(&aData[iAddr]);          /* Address of a free slot */
  int x;                                     /* Excess size of the slot */
  int maxPC = pPg->pBt->usableSize - nByte;  /* Max address for a usable slot */
  int size;                                  /* Size of the free slot */

  assert( pc>0 );
  while( pc<=maxPC ){
    /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
    ** freeblock form a big-endian integer which is the size of the freeblock
    ** in bytes, including the 4-byte header. */
    size = get2byte(&aData[pc+2]);
    if( (x = size - nByte)>=0 ){
      testcase( x==4 );
      testcase( x==3 );
      if( x<4 ){
        /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
        ** number of bytes in fragments may not exceed 60. */
        if( aData[hdr+7]>57 ) return 0;

        /* Remove the slot from the free-list. Update the number of
        ** fragmented bytes within the page. */
        memcpy(&aData[iAddr], &aData[pc], 2);
        aData[hdr+7] += (u8)x;
      }else if( x+pc > maxPC ){
        /* This slot extends off the end of the usable part of the page */
        *pRc = SQLITE_CORRUPT_PAGE(pPg);
        return 0;
      }else{
        /* The slot remains on the free-list. Reduce its size to account
        ** for the portion used by the new allocation. */
        put2byte(&aData[pc+2], x);
      }
      /* The allocation is the last nByte bytes of the slot */
      return &aData[pc + x];
    }
    /* Slot too small.  Advance to the next freeblock in the chain. */
    iAddr = pc;
    pc = get2byte(&aData[pc]);
    if( pc<=iAddr+size ){
      /* A next-pointer of zero is the normal end of the list */
      if( pc ){
        /* The next slot in the chain is not past the end of the current slot */
        *pRc = SQLITE_CORRUPT_PAGE(pPg);
      }
      return 0;
    }
  }
  if( pc>maxPC+nByte-4 ){
    /* The free slot chain extends off the end of the page */
    *pRc = SQLITE_CORRUPT_PAGE(pPg);
  }
  return 0;
}
1593 
/*
** Allocate nByte bytes of space from within the B-Tree page passed
** as the first argument. Write into *pIdx the index into pPage->aData[]
** of the first byte of allocated space. Return either SQLITE_OK or
** an error code (usually SQLITE_CORRUPT).
**
** The caller guarantees that there is sufficient space to make the
** allocation.  This routine might need to defragment in order to bring
** all the space together, however.  This routine will avoid using
** the first two bytes past the cell pointer area since presumably this
** allocation is being made in order to insert a new cell, so we will
** also end up needing a new cell pointer.
**
** The freeblock list is tried first, via pageFindSlot().  If that does
** not produce a slot, the space is carved from the gap between the cell
** pointer array and the cell content area, after calling
** defragmentPage() if the gap is too small.
*/
static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
  const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
  u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
  int top;                             /* First byte of cell content area */
  int rc = SQLITE_OK;                  /* Integer return code */
  int gap;        /* First byte of gap between cell pointers and cell content */

  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( nByte>=0 );  /* Minimum cell size is 4 */
  assert( pPage->nFree>=nByte );
  assert( pPage->nOverflow==0 );
  assert( nByte < (int)(pPage->pBt->usableSize-8) );

  assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
  gap = pPage->cellOffset + 2*pPage->nCell;
  assert( gap<=65536 );
  /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
  ** and the reserved space is zero (the usual value for reserved space)
  ** then the cell content offset of an empty page wants to be 65536.
  ** However, that integer is too large to be stored in a 2-byte unsigned
  ** integer, so a value of 0 is used in its place. */
  top = get2byte(&data[hdr+5]);
  assert( top<=(int)pPage->pBt->usableSize ); /* by btreeComputeFreeSpace() */
  if( gap>top ){
    /* The content area appears to start inside the cell pointer array.
    ** That is legitimate only for the stored-as-zero 65536 case. */
    if( top==0 && pPage->pBt->usableSize==65536 ){
      top = 65536;
    }else{
      return SQLITE_CORRUPT_PAGE(pPage);
    }
  }

  /* If there is enough space between gap and top for one more cell pointer,
  ** and if the freelist is not empty, then search the
  ** freelist looking for a slot big enough to satisfy the request.
  */
  testcase( gap+2==top );
  testcase( gap+1==top );
  testcase( gap==top );
  if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
    u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
    if( pSpace ){
      assert( pSpace>=data && (pSpace - data)<65536 );
      *pIdx = (int)(pSpace - data);
      return SQLITE_OK;
    }else if( rc ){
      /* pageFindSlot() detected corruption */
      return rc;
    }
  }

  /* The request could not be fulfilled using a freelist slot.  Check
  ** to see if defragmentation is necessary.
  */
  testcase( gap+2+nByte==top );
  if( gap+2+nByte>top ){
    assert( pPage->nCell>0 || CORRUPT_DB );
    assert( pPage->nFree>=0 );
    /* Permit up to 4 bytes of fragmentation to remain, but no more than
    ** the free space left over once this allocation and its 2-byte cell
    ** pointer have been accounted for. */
    rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
    if( rc ) return rc;
    top = get2byteNotZero(&data[hdr+5]);
    assert( gap+2+nByte<=top );
  }


  /* Allocate memory from the gap in between the cell pointer array
  ** and the cell content area.  The btreeComputeFreeSpace() call has already
  ** validated the freelist.  Given that the freelist is valid, there
  ** is no way that the allocation can extend off the end of the page.
  ** The assert() below verifies the previous sentence.
  */
  top -= nByte;
  put2byte(&data[hdr+5], top);
  assert( top+nByte <= (int)pPage->pBt->usableSize );
  *pIdx = top;
  return SQLITE_OK;
}
1684 
1685 /*
1686 ** Return a section of the pPage->aData to the freelist.
1687 ** The first byte of the new free block is pPage->aData[iStart]
1688 ** and the size of the block is iSize bytes.
1689 **
1690 ** Adjacent freeblocks are coalesced.
1691 **
1692 ** Even though the freeblock list was checked by btreeComputeFreeSpace(),
1693 ** that routine will not detect overlap between cells or freeblocks.  Nor
1694 ** does it detect cells or freeblocks that encrouch into the reserved bytes
1695 ** at the end of the page.  So do additional corruption checks inside this
1696 ** routine and return SQLITE_CORRUPT if any problems are found.
1697 */
1698 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
1699   u16 iPtr;                             /* Address of ptr to next freeblock */
1700   u16 iFreeBlk;                         /* Address of the next freeblock */
1701   u8 hdr;                               /* Page header size.  0 or 100 */
1702   u8 nFrag = 0;                         /* Reduction in fragmentation */
1703   u16 iOrigSize = iSize;                /* Original value of iSize */
1704   u16 x;                                /* Offset to cell content area */
1705   u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
1706   unsigned char *data = pPage->aData;   /* Page content */
1707 
1708   assert( pPage->pBt!=0 );
1709   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1710   assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
1711   assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
1712   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1713   assert( iSize>=4 );   /* Minimum cell size is 4 */
1714   assert( iStart<=pPage->pBt->usableSize-4 );
1715 
1716   /* The list of freeblocks must be in ascending order.  Find the
1717   ** spot on the list where iStart should be inserted.
1718   */
1719   hdr = pPage->hdrOffset;
1720   iPtr = hdr + 1;
1721   if( data[iPtr+1]==0 && data[iPtr]==0 ){
1722     iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
1723   }else{
1724     while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
1725       if( iFreeBlk<iPtr+4 ){
1726         if( iFreeBlk==0 ) break;
1727         return SQLITE_CORRUPT_PAGE(pPage);
1728       }
1729       iPtr = iFreeBlk;
1730     }
1731     if( iFreeBlk>pPage->pBt->usableSize-4 ){
1732       return SQLITE_CORRUPT_PAGE(pPage);
1733     }
1734     assert( iFreeBlk>iPtr || iFreeBlk==0 );
1735 
1736     /* At this point:
1737     **    iFreeBlk:   First freeblock after iStart, or zero if none
1738     **    iPtr:       The address of a pointer to iFreeBlk
1739     **
1740     ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
1741     */
1742     if( iFreeBlk && iEnd+3>=iFreeBlk ){
1743       nFrag = iFreeBlk - iEnd;
1744       if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage);
1745       iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
1746       if( iEnd > pPage->pBt->usableSize ){
1747         return SQLITE_CORRUPT_PAGE(pPage);
1748       }
1749       iSize = iEnd - iStart;
1750       iFreeBlk = get2byte(&data[iFreeBlk]);
1751     }
1752 
1753     /* If iPtr is another freeblock (that is, if iPtr is not the freelist
1754     ** pointer in the page header) then check to see if iStart should be
1755     ** coalesced onto the end of iPtr.
1756     */
1757     if( iPtr>hdr+1 ){
1758       int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
1759       if( iPtrEnd+3>=iStart ){
1760         if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage);
1761         nFrag += iStart - iPtrEnd;
1762         iSize = iEnd - iPtr;
1763         iStart = iPtr;
1764       }
1765     }
1766     if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage);
1767     data[hdr+7] -= nFrag;
1768   }
1769   x = get2byte(&data[hdr+5]);
1770   if( iStart<=x ){
1771     /* The new freeblock is at the beginning of the cell content area,
1772     ** so just extend the cell content area rather than create another
1773     ** freelist entry */
1774     if( iStart<x || iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage);
1775     put2byte(&data[hdr+1], iFreeBlk);
1776     put2byte(&data[hdr+5], iEnd);
1777   }else{
1778     /* Insert the new freeblock into the freelist */
1779     put2byte(&data[iPtr], iStart);
1780   }
1781   if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){
1782     /* Overwrite deleted information with zeros when the secure_delete
1783     ** option is enabled */
1784     memset(&data[iStart], 0, iSize);
1785   }
1786   put2byte(&data[iStart], iFreeBlk);
1787   put2byte(&data[iStart+2], iSize);
1788   pPage->nFree += iOrigSize;
1789   return SQLITE_OK;
1790 }
1791 
1792 /*
1793 ** Decode the flags byte (the first byte of the header) for a page
1794 ** and initialize fields of the MemPage structure accordingly.
1795 **
1796 ** Only the following combinations are supported.  Anything different
1797 ** indicates a corrupt database files:
1798 **
1799 **         PTF_ZERODATA
1800 **         PTF_ZERODATA | PTF_LEAF
1801 **         PTF_LEAFDATA | PTF_INTKEY
1802 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1803 */
1804 static int decodeFlags(MemPage *pPage, int flagByte){
1805   BtShared *pBt;     /* A copy of pPage->pBt */
1806 
1807   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1808   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1809   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1810   flagByte &= ~PTF_LEAF;
1811   pPage->childPtrSize = 4-4*pPage->leaf;
1812   pPage->xCellSize = cellSizePtr;
1813   pBt = pPage->pBt;
1814   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1815     /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an
1816     ** interior table b-tree page. */
1817     assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
1818     /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a
1819     ** leaf table b-tree page. */
1820     assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
1821     pPage->intKey = 1;
1822     if( pPage->leaf ){
1823       pPage->intKeyLeaf = 1;
1824       pPage->xParseCell = btreeParseCellPtr;
1825     }else{
1826       pPage->intKeyLeaf = 0;
1827       pPage->xCellSize = cellSizePtrNoPayload;
1828       pPage->xParseCell = btreeParseCellPtrNoPayload;
1829     }
1830     pPage->maxLocal = pBt->maxLeaf;
1831     pPage->minLocal = pBt->minLeaf;
1832   }else if( flagByte==PTF_ZERODATA ){
1833     /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an
1834     ** interior index b-tree page. */
1835     assert( (PTF_ZERODATA)==2 );
1836     /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a
1837     ** leaf index b-tree page. */
1838     assert( (PTF_ZERODATA|PTF_LEAF)==10 );
1839     pPage->intKey = 0;
1840     pPage->intKeyLeaf = 0;
1841     pPage->xParseCell = btreeParseCellPtrIndex;
1842     pPage->maxLocal = pBt->maxLocal;
1843     pPage->minLocal = pBt->minLocal;
1844   }else{
1845     /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
1846     ** an error. */
1847     return SQLITE_CORRUPT_PAGE(pPage);
1848   }
1849   pPage->max1bytePayload = pBt->max1bytePayload;
1850   return SQLITE_OK;
1851 }
1852 
1853 /*
1854 ** Compute the amount of freespace on the page.  In other words, fill
1855 ** in the pPage->nFree field.
1856 */
1857 static int btreeComputeFreeSpace(MemPage *pPage){
1858   int pc;            /* Address of a freeblock within pPage->aData[] */
1859   u8 hdr;            /* Offset to beginning of page header */
1860   u8 *data;          /* Equal to pPage->aData */
1861   int usableSize;    /* Amount of usable space on each page */
1862   int nFree;         /* Number of unused bytes on the page */
1863   int top;           /* First byte of the cell content area */
1864   int iCellFirst;    /* First allowable cell or freeblock offset */
1865   int iCellLast;     /* Last possible cell or freeblock offset */
1866 
1867   assert( pPage->pBt!=0 );
1868   assert( pPage->pBt->db!=0 );
1869   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1870   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1871   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1872   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1873   assert( pPage->isInit==1 );
1874   assert( pPage->nFree<0 );
1875 
1876   usableSize = pPage->pBt->usableSize;
1877   hdr = pPage->hdrOffset;
1878   data = pPage->aData;
1879   /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
1880   ** the start of the cell content area. A zero value for this integer is
1881   ** interpreted as 65536. */
1882   top = get2byteNotZero(&data[hdr+5]);
1883   iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell;
1884   iCellLast = usableSize - 4;
1885 
1886   /* Compute the total free space on the page
1887   ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
1888   ** start of the first freeblock on the page, or is zero if there are no
1889   ** freeblocks. */
1890   pc = get2byte(&data[hdr+1]);
1891   nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
1892   if( pc>0 ){
1893     u32 next, size;
1894     if( pc<iCellFirst ){
1895       /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
1896       ** always be at least one cell before the first freeblock.
1897       */
1898       return SQLITE_CORRUPT_PAGE(pPage);
1899     }
1900     while( 1 ){
1901       if( pc>iCellLast ){
1902         /* Freeblock off the end of the page */
1903         return SQLITE_CORRUPT_PAGE(pPage);
1904       }
1905       next = get2byte(&data[pc]);
1906       size = get2byte(&data[pc+2]);
1907       nFree = nFree + size;
1908       if( next<=pc+size+3 ) break;
1909       pc = next;
1910     }
1911     if( next>0 ){
1912       /* Freeblock not in ascending order */
1913       return SQLITE_CORRUPT_PAGE(pPage);
1914     }
1915     if( pc+size>(unsigned int)usableSize ){
1916       /* Last freeblock extends past page end */
1917       return SQLITE_CORRUPT_PAGE(pPage);
1918     }
1919   }
1920 
1921   /* At this point, nFree contains the sum of the offset to the start
1922   ** of the cell-content area plus the number of free bytes within
1923   ** the cell-content area. If this is greater than the usable-size
1924   ** of the page, then the page must be corrupted. This check also
1925   ** serves to verify that the offset to the start of the cell-content
1926   ** area, according to the page header, lies within the page.
1927   */
1928   if( nFree>usableSize || nFree<iCellFirst ){
1929     return SQLITE_CORRUPT_PAGE(pPage);
1930   }
1931   pPage->nFree = (u16)(nFree - iCellFirst);
1932   return SQLITE_OK;
1933 }
1934 
1935 /*
1936 ** Do additional sanity check after btreeInitPage() if
1937 ** PRAGMA cell_size_check=ON
1938 */
1939 static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){
1940   int iCellFirst;    /* First allowable cell or freeblock offset */
1941   int iCellLast;     /* Last possible cell or freeblock offset */
1942   int i;             /* Index into the cell pointer array */
1943   int sz;            /* Size of a cell */
1944   int pc;            /* Address of a freeblock within pPage->aData[] */
1945   u8 *data;          /* Equal to pPage->aData */
1946   int usableSize;    /* Maximum usable space on the page */
1947   int cellOffset;    /* Start of cell content area */
1948 
1949   iCellFirst = pPage->cellOffset + 2*pPage->nCell;
1950   usableSize = pPage->pBt->usableSize;
1951   iCellLast = usableSize - 4;
1952   data = pPage->aData;
1953   cellOffset = pPage->cellOffset;
1954   if( !pPage->leaf ) iCellLast--;
1955   for(i=0; i<pPage->nCell; i++){
1956     pc = get2byteAligned(&data[cellOffset+i*2]);
1957     testcase( pc==iCellFirst );
1958     testcase( pc==iCellLast );
1959     if( pc<iCellFirst || pc>iCellLast ){
1960       return SQLITE_CORRUPT_PAGE(pPage);
1961     }
1962     sz = pPage->xCellSize(pPage, &data[pc]);
1963     testcase( pc+sz==usableSize );
1964     if( pc+sz>usableSize ){
1965       return SQLITE_CORRUPT_PAGE(pPage);
1966     }
1967   }
1968   return SQLITE_OK;
1969 }
1970 
1971 /*
1972 ** Initialize the auxiliary information for a disk block.
1973 **
1974 ** Return SQLITE_OK on success.  If we see that the page does
1975 ** not contain a well-formed database page, then return
1976 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1977 ** guarantee that the page is well-formed.  It only shows that
1978 ** we failed to detect any corruption.
1979 */
1980 static int btreeInitPage(MemPage *pPage){
1981   u8 *data;          /* Equal to pPage->aData */
1982   BtShared *pBt;        /* The main btree structure */
1983 
1984   assert( pPage->pBt!=0 );
1985   assert( pPage->pBt->db!=0 );
1986   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1987   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1988   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1989   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1990   assert( pPage->isInit==0 );
1991 
1992   pBt = pPage->pBt;
1993   data = pPage->aData + pPage->hdrOffset;
1994   /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
1995   ** the b-tree page type. */
1996   if( decodeFlags(pPage, data[0]) ){
1997     return SQLITE_CORRUPT_PAGE(pPage);
1998   }
1999   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
2000   pPage->maskPage = (u16)(pBt->pageSize - 1);
2001   pPage->nOverflow = 0;
2002   pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize;
2003   pPage->aCellIdx = data + pPage->childPtrSize + 8;
2004   pPage->aDataEnd = pPage->aData + pBt->usableSize;
2005   pPage->aDataOfst = pPage->aData + pPage->childPtrSize;
2006   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
2007   ** number of cells on the page. */
2008   pPage->nCell = get2byte(&data[3]);
2009   if( pPage->nCell>MX_CELL(pBt) ){
2010     /* To many cells for a single page.  The page must be corrupt */
2011     return SQLITE_CORRUPT_PAGE(pPage);
2012   }
2013   testcase( pPage->nCell==MX_CELL(pBt) );
2014   /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
2015   ** possible for a root page of a table that contains no rows) then the
2016   ** offset to the cell content area will equal the page size minus the
2017   ** bytes of reserved space. */
2018   assert( pPage->nCell>0
2019        || get2byteNotZero(&data[5])==(int)pBt->usableSize
2020        || CORRUPT_DB );
2021   pPage->nFree = -1;  /* Indicate that this value is yet uncomputed */
2022   pPage->isInit = 1;
2023   if( pBt->db->flags & SQLITE_CellSizeCk ){
2024     return btreeCellSizeCheck(pPage);
2025   }
2026   return SQLITE_OK;
2027 }
2028 
2029 /*
2030 ** Set up a raw page so that it looks like a database page holding
2031 ** no entries.
2032 */
2033 static void zeroPage(MemPage *pPage, int flags){
2034   unsigned char *data = pPage->aData;
2035   BtShared *pBt = pPage->pBt;
2036   u8 hdr = pPage->hdrOffset;
2037   u16 first;
2038 
2039   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
2040   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2041   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
2042   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2043   assert( sqlite3_mutex_held(pBt->mutex) );
2044   if( pBt->btsFlags & BTS_FAST_SECURE ){
2045     memset(&data[hdr], 0, pBt->usableSize - hdr);
2046   }
2047   data[hdr] = (char)flags;
2048   first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
2049   memset(&data[hdr+1], 0, 4);
2050   data[hdr+7] = 0;
2051   put2byte(&data[hdr+5], pBt->usableSize);
2052   pPage->nFree = (u16)(pBt->usableSize - first);
2053   decodeFlags(pPage, flags);
2054   pPage->cellOffset = first;
2055   pPage->aDataEnd = &data[pBt->usableSize];
2056   pPage->aCellIdx = &data[first];
2057   pPage->aDataOfst = &data[pPage->childPtrSize];
2058   pPage->nOverflow = 0;
2059   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
2060   pPage->maskPage = (u16)(pBt->pageSize - 1);
2061   pPage->nCell = 0;
2062   pPage->isInit = 1;
2063 }
2064 
2065 
2066 /*
2067 ** Convert a DbPage obtained from the pager into a MemPage used by
2068 ** the btree layer.
2069 */
2070 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
2071   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2072   if( pgno!=pPage->pgno ){
2073     pPage->aData = sqlite3PagerGetData(pDbPage);
2074     pPage->pDbPage = pDbPage;
2075     pPage->pBt = pBt;
2076     pPage->pgno = pgno;
2077     pPage->hdrOffset = pgno==1 ? 100 : 0;
2078   }
2079   assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
2080   return pPage;
2081 }
2082 
2083 /*
2084 ** Get a page from the pager.  Initialize the MemPage.pBt and
2085 ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
2086 **
2087 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
2088 ** about the content of the page at this time.  So do not go to the disk
2089 ** to fetch the content.  Just fill in the content with zeros for now.
2090 ** If in the future we call sqlite3PagerWrite() on this page, that
2091 ** means we have started to be concerned about content and the disk
2092 ** read should occur at that point.
2093 */
2094 static int btreeGetPage(
2095   BtShared *pBt,       /* The btree */
2096   Pgno pgno,           /* Number of the page to fetch */
2097   MemPage **ppPage,    /* Return the page in this parameter */
2098   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2099 ){
2100   int rc;
2101   DbPage *pDbPage;
2102 
2103   assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
2104   assert( sqlite3_mutex_held(pBt->mutex) );
2105   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
2106   if( rc ) return rc;
2107   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
2108   return SQLITE_OK;
2109 }
2110 
2111 /*
2112 ** Retrieve a page from the pager cache. If the requested page is not
2113 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
2114 ** MemPage.aData elements if needed.
2115 */
2116 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
2117   DbPage *pDbPage;
2118   assert( sqlite3_mutex_held(pBt->mutex) );
2119   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
2120   if( pDbPage ){
2121     return btreePageFromDbPage(pDbPage, pgno, pBt);
2122   }
2123   return 0;
2124 }
2125 
2126 /*
2127 ** Return the size of the database file in pages. If there is any kind of
2128 ** error, return ((unsigned int)-1).
2129 */
2130 static Pgno btreePagecount(BtShared *pBt){
2131   return pBt->nPage;
2132 }
2133 u32 sqlite3BtreeLastPage(Btree *p){
2134   assert( sqlite3BtreeHoldsMutex(p) );
2135   assert( ((p->pBt->nPage)&0x80000000)==0 );
2136   return btreePagecount(p->pBt);
2137 }
2138 
2139 /*
2140 ** Get a page from the pager and initialize it.
2141 **
2142 ** If pCur!=0 then the page is being fetched as part of a moveToChild()
2143 ** call.  Do additional sanity checking on the page in this case.
2144 ** And if the fetch fails, this routine must decrement pCur->iPage.
2145 **
2146 ** The page is fetched as read-write unless pCur is not NULL and is
2147 ** a read-only cursor.
2148 **
2149 ** If an error occurs, then *ppPage is undefined. It
2150 ** may remain unchanged, or it may be set to an invalid value.
2151 */
2152 static int getAndInitPage(
2153   BtShared *pBt,                  /* The database file */
2154   Pgno pgno,                      /* Number of the page to get */
2155   MemPage **ppPage,               /* Write the page pointer here */
2156   BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
2157   int bReadOnly                   /* True for a read-only page */
2158 ){
2159   int rc;
2160   DbPage *pDbPage;
2161   assert( sqlite3_mutex_held(pBt->mutex) );
2162   assert( pCur==0 || ppPage==&pCur->pPage );
2163   assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
2164   assert( pCur==0 || pCur->iPage>0 );
2165 
2166   if( pgno>btreePagecount(pBt) ){
2167     rc = SQLITE_CORRUPT_BKPT;
2168     goto getAndInitPage_error1;
2169   }
2170   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
2171   if( rc ){
2172     goto getAndInitPage_error1;
2173   }
2174   *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2175   if( (*ppPage)->isInit==0 ){
2176     btreePageFromDbPage(pDbPage, pgno, pBt);
2177     rc = btreeInitPage(*ppPage);
2178     if( rc!=SQLITE_OK ){
2179       goto getAndInitPage_error2;
2180     }
2181   }
2182   assert( (*ppPage)->pgno==pgno );
2183   assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
2184 
2185   /* If obtaining a child page for a cursor, we must verify that the page is
2186   ** compatible with the root page. */
2187   if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
2188     rc = SQLITE_CORRUPT_PGNO(pgno);
2189     goto getAndInitPage_error2;
2190   }
2191   return SQLITE_OK;
2192 
2193 getAndInitPage_error2:
2194   releasePage(*ppPage);
2195 getAndInitPage_error1:
2196   if( pCur ){
2197     pCur->iPage--;
2198     pCur->pPage = pCur->apPage[pCur->iPage];
2199   }
2200   testcase( pgno==0 );
2201   assert( pgno!=0 || rc==SQLITE_CORRUPT );
2202   return rc;
2203 }
2204 
2205 /*
2206 ** Release a MemPage.  This should be called once for each prior
2207 ** call to btreeGetPage.
2208 **
2209 ** Page1 is a special case and must be released using releasePageOne().
2210 */
2211 static void releasePageNotNull(MemPage *pPage){
2212   assert( pPage->aData );
2213   assert( pPage->pBt );
2214   assert( pPage->pDbPage!=0 );
2215   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2216   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2217   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2218   sqlite3PagerUnrefNotNull(pPage->pDbPage);
2219 }
2220 static void releasePage(MemPage *pPage){
2221   if( pPage ) releasePageNotNull(pPage);
2222 }
2223 static void releasePageOne(MemPage *pPage){
2224   assert( pPage!=0 );
2225   assert( pPage->aData );
2226   assert( pPage->pBt );
2227   assert( pPage->pDbPage!=0 );
2228   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2229   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2230   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2231   sqlite3PagerUnrefPageOne(pPage->pDbPage);
2232 }
2233 
2234 /*
2235 ** Get an unused page.
2236 **
2237 ** This works just like btreeGetPage() with the addition:
2238 **
2239 **   *  If the page is already in use for some other purpose, immediately
2240 **      release it and return an SQLITE_CURRUPT error.
2241 **   *  Make sure the isInit flag is clear
2242 */
2243 static int btreeGetUnusedPage(
2244   BtShared *pBt,       /* The btree */
2245   Pgno pgno,           /* Number of the page to fetch */
2246   MemPage **ppPage,    /* Return the page in this parameter */
2247   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2248 ){
2249   int rc = btreeGetPage(pBt, pgno, ppPage, flags);
2250   if( rc==SQLITE_OK ){
2251     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
2252       releasePage(*ppPage);
2253       *ppPage = 0;
2254       return SQLITE_CORRUPT_BKPT;
2255     }
2256     (*ppPage)->isInit = 0;
2257   }else{
2258     *ppPage = 0;
2259   }
2260   return rc;
2261 }
2262 
2263 
2264 /*
2265 ** During a rollback, when the pager reloads information into the cache
2266 ** so that the cache is restored to its original state at the start of
2267 ** the transaction, for each page restored this routine is called.
2268 **
2269 ** This routine needs to reset the extra data section at the end of the
2270 ** page to agree with the restored data.
2271 */
2272 static void pageReinit(DbPage *pData){
2273   MemPage *pPage;
2274   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
2275   assert( sqlite3PagerPageRefcount(pData)>0 );
2276   if( pPage->isInit ){
2277     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2278     pPage->isInit = 0;
2279     if( sqlite3PagerPageRefcount(pData)>1 ){
2280       /* pPage might not be a btree page;  it might be an overflow page
2281       ** or ptrmap page or a free page.  In those cases, the following
2282       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
2283       ** But no harm is done by this.  And it is very important that
2284       ** btreeInitPage() be called on every btree page so we make
2285       ** the call for every page that comes in for re-initing. */
2286       btreeInitPage(pPage);
2287     }
2288   }
2289 }
2290 
2291 /*
2292 ** Invoke the busy handler for a btree.
2293 */
2294 static int btreeInvokeBusyHandler(void *pArg){
2295   BtShared *pBt = (BtShared*)pArg;
2296   assert( pBt->db );
2297   assert( sqlite3_mutex_held(pBt->db->mutex) );
2298   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler,
2299                                   sqlite3PagerFile(pBt->pPager));
2300 }
2301 
2302 /*
2303 ** Open a database file.
2304 **
2305 ** zFilename is the name of the database file.  If zFilename is NULL
2306 ** then an ephemeral database is created.  The ephemeral database might
2307 ** be exclusively in memory, or it might use a disk-based memory cache.
2308 ** Either way, the ephemeral database will be automatically deleted
2309 ** when sqlite3BtreeClose() is called.
2310 **
2311 ** If zFilename is ":memory:" then an in-memory database is created
2312 ** that is automatically destroyed when it is closed.
2313 **
2314 ** The "flags" parameter is a bitmask that might contain bits like
2315 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
2316 **
2317 ** If the database is already opened in the same database connection
2318 ** and we are in shared cache mode, then the open will fail with an
2319 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
2320 ** objects in the same database connection since doing so will lead
2321 ** to problems with locking.
2322 */
2323 int sqlite3BtreeOpen(
2324   sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
2325   const char *zFilename,  /* Name of the file containing the BTree database */
2326   sqlite3 *db,            /* Associated database handle */
2327   Btree **ppBtree,        /* Pointer to new Btree object written here */
2328   int flags,              /* Options */
2329   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
2330 ){
2331   BtShared *pBt = 0;             /* Shared part of btree structure */
2332   Btree *p;                      /* Handle to return */
2333   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
2334   int rc = SQLITE_OK;            /* Result code from this function */
2335   u8 nReserve;                   /* Byte of unused space on each page */
2336   unsigned char zDbHeader[100];  /* Database header content */
2337 
2338   /* True if opening an ephemeral, temporary database */
2339   const int isTempDb = zFilename==0 || zFilename[0]==0;
2340 
2341   /* Set the variable isMemdb to true for an in-memory database, or
2342   ** false for a file-based database.
2343   */
2344 #ifdef SQLITE_OMIT_MEMORYDB
2345   const int isMemdb = 0;
2346 #else
2347   const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
2348                        || (isTempDb && sqlite3TempInMemory(db))
2349                        || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
2350 #endif
2351 
2352   assert( db!=0 );
2353   assert( pVfs!=0 );
2354   assert( sqlite3_mutex_held(db->mutex) );
2355   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
2356 
2357   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
2358   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
2359 
2360   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
2361   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
2362 
2363   if( isMemdb ){
2364     flags |= BTREE_MEMORY;
2365   }
2366   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
2367     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
2368   }
2369   p = sqlite3MallocZero(sizeof(Btree));
2370   if( !p ){
2371     return SQLITE_NOMEM_BKPT;
2372   }
2373   p->inTrans = TRANS_NONE;
2374   p->db = db;
2375 #ifndef SQLITE_OMIT_SHARED_CACHE
2376   p->lock.pBtree = p;
2377   p->lock.iTable = 1;
2378 #endif
2379 
2380 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2381   /*
2382   ** If this Btree is a candidate for shared cache, try to find an
2383   ** existing BtShared object that we can share with
2384   */
2385   if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
2386     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
2387       int nFilename = sqlite3Strlen30(zFilename)+1;
2388       int nFullPathname = pVfs->mxPathname+1;
2389       char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
2390       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2391 
2392       p->sharable = 1;
2393       if( !zFullPathname ){
2394         sqlite3_free(p);
2395         return SQLITE_NOMEM_BKPT;
2396       }
2397       if( isMemdb ){
2398         memcpy(zFullPathname, zFilename, nFilename);
2399       }else{
2400         rc = sqlite3OsFullPathname(pVfs, zFilename,
2401                                    nFullPathname, zFullPathname);
2402         if( rc ){
2403           sqlite3_free(zFullPathname);
2404           sqlite3_free(p);
2405           return rc;
2406         }
2407       }
2408 #if SQLITE_THREADSAFE
2409       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
2410       sqlite3_mutex_enter(mutexOpen);
2411       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
2412       sqlite3_mutex_enter(mutexShared);
2413 #endif
2414       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
2415         assert( pBt->nRef>0 );
2416         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
2417                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
2418           int iDb;
2419           for(iDb=db->nDb-1; iDb>=0; iDb--){
2420             Btree *pExisting = db->aDb[iDb].pBt;
2421             if( pExisting && pExisting->pBt==pBt ){
2422               sqlite3_mutex_leave(mutexShared);
2423               sqlite3_mutex_leave(mutexOpen);
2424               sqlite3_free(zFullPathname);
2425               sqlite3_free(p);
2426               return SQLITE_CONSTRAINT;
2427             }
2428           }
2429           p->pBt = pBt;
2430           pBt->nRef++;
2431           break;
2432         }
2433       }
2434       sqlite3_mutex_leave(mutexShared);
2435       sqlite3_free(zFullPathname);
2436     }
2437 #ifdef SQLITE_DEBUG
2438     else{
2439       /* In debug mode, we mark all persistent databases as sharable
2440       ** even when they are not.  This exercises the locking code and
2441       ** gives more opportunity for asserts(sqlite3_mutex_held())
2442       ** statements to find locking problems.
2443       */
2444       p->sharable = 1;
2445     }
2446 #endif
2447   }
2448 #endif
2449   if( pBt==0 ){
2450     /*
2451     ** The following asserts make sure that structures used by the btree are
2452     ** the right size.  This is to guard against size changes that result
2453     ** when compiling on a different architecture.
2454     */
2455     assert( sizeof(i64)==8 );
2456     assert( sizeof(u64)==8 );
2457     assert( sizeof(u32)==4 );
2458     assert( sizeof(u16)==2 );
2459     assert( sizeof(Pgno)==4 );
2460 
2461     pBt = sqlite3MallocZero( sizeof(*pBt) );
2462     if( pBt==0 ){
2463       rc = SQLITE_NOMEM_BKPT;
2464       goto btree_open_out;
2465     }
2466     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
2467                           sizeof(MemPage), flags, vfsFlags, pageReinit);
2468     if( rc==SQLITE_OK ){
2469       sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
2470       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
2471     }
2472     if( rc!=SQLITE_OK ){
2473       goto btree_open_out;
2474     }
2475     pBt->openFlags = (u8)flags;
2476     pBt->db = db;
2477     sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
2478     p->pBt = pBt;
2479 
2480     pBt->pCursor = 0;
2481     pBt->pPage1 = 0;
2482     if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
2483 #if defined(SQLITE_SECURE_DELETE)
2484     pBt->btsFlags |= BTS_SECURE_DELETE;
2485 #elif defined(SQLITE_FAST_SECURE_DELETE)
2486     pBt->btsFlags |= BTS_OVERWRITE;
2487 #endif
2488     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2489     ** determined by the 2-byte integer located at an offset of 16 bytes from
2490     ** the beginning of the database file. */
2491     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
2492     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
2493          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
2494       pBt->pageSize = 0;
2495 #ifndef SQLITE_OMIT_AUTOVACUUM
2496       /* If the magic name ":memory:" will create an in-memory database, then
2497       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
2498       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
2499       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
2500       ** regular file-name. In this case the auto-vacuum applies as per normal.
2501       */
2502       if( zFilename && !isMemdb ){
2503         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
2504         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
2505       }
2506 #endif
2507       nReserve = 0;
2508     }else{
2509       /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2510       ** determined by the one-byte unsigned integer found at an offset of 20
2511       ** into the database file header. */
2512       nReserve = zDbHeader[20];
2513       pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2514 #ifndef SQLITE_OMIT_AUTOVACUUM
2515       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
2516       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
2517 #endif
2518     }
2519     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2520     if( rc ) goto btree_open_out;
2521     pBt->usableSize = pBt->pageSize - nReserve;
2522     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
2523 
2524 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2525     /* Add the new BtShared object to the linked list sharable BtShareds.
2526     */
2527     pBt->nRef = 1;
2528     if( p->sharable ){
2529       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2530       MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
2531       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
2532         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
2533         if( pBt->mutex==0 ){
2534           rc = SQLITE_NOMEM_BKPT;
2535           goto btree_open_out;
2536         }
2537       }
2538       sqlite3_mutex_enter(mutexShared);
2539       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
2540       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
2541       sqlite3_mutex_leave(mutexShared);
2542     }
2543 #endif
2544   }
2545 
2546 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2547   /* If the new Btree uses a sharable pBtShared, then link the new
2548   ** Btree into the list of all sharable Btrees for the same connection.
2549   ** The list is kept in ascending order by pBt address.
2550   */
2551   if( p->sharable ){
2552     int i;
2553     Btree *pSib;
2554     for(i=0; i<db->nDb; i++){
2555       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
2556         while( pSib->pPrev ){ pSib = pSib->pPrev; }
2557         if( (uptr)p->pBt<(uptr)pSib->pBt ){
2558           p->pNext = pSib;
2559           p->pPrev = 0;
2560           pSib->pPrev = p;
2561         }else{
2562           while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
2563             pSib = pSib->pNext;
2564           }
2565           p->pNext = pSib->pNext;
2566           p->pPrev = pSib;
2567           if( p->pNext ){
2568             p->pNext->pPrev = p;
2569           }
2570           pSib->pNext = p;
2571         }
2572         break;
2573       }
2574     }
2575   }
2576 #endif
2577   *ppBtree = p;
2578 
2579 btree_open_out:
2580   if( rc!=SQLITE_OK ){
2581     if( pBt && pBt->pPager ){
2582       sqlite3PagerClose(pBt->pPager, 0);
2583     }
2584     sqlite3_free(pBt);
2585     sqlite3_free(p);
2586     *ppBtree = 0;
2587   }else{
2588     sqlite3_file *pFile;
2589 
2590     /* If the B-Tree was successfully opened, set the pager-cache size to the
2591     ** default value. Except, when opening on an existing shared pager-cache,
2592     ** do not change the pager-cache size.
2593     */
2594     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
2595       sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
2596     }
2597 
2598     pFile = sqlite3PagerFile(pBt->pPager);
2599     if( pFile->pMethods ){
2600       sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
2601     }
2602   }
2603   if( mutexOpen ){
2604     assert( sqlite3_mutex_held(mutexOpen) );
2605     sqlite3_mutex_leave(mutexOpen);
2606   }
2607   assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
2608   return rc;
2609 }
2610 
2611 /*
2612 ** Decrement the BtShared.nRef counter.  When it reaches zero,
2613 ** remove the BtShared structure from the sharing list.  Return
2614 ** true if the BtShared.nRef counter reaches zero and return
2615 ** false if it is still positive.
2616 */
2617 static int removeFromSharingList(BtShared *pBt){
2618 #ifndef SQLITE_OMIT_SHARED_CACHE
2619   MUTEX_LOGIC( sqlite3_mutex *pMaster; )
2620   BtShared *pList;
2621   int removed = 0;
2622 
2623   assert( sqlite3_mutex_notheld(pBt->mutex) );
2624   MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
2625   sqlite3_mutex_enter(pMaster);
2626   pBt->nRef--;
2627   if( pBt->nRef<=0 ){
2628     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
2629       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
2630     }else{
2631       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
2632       while( ALWAYS(pList) && pList->pNext!=pBt ){
2633         pList=pList->pNext;
2634       }
2635       if( ALWAYS(pList) ){
2636         pList->pNext = pBt->pNext;
2637       }
2638     }
2639     if( SQLITE_THREADSAFE ){
2640       sqlite3_mutex_free(pBt->mutex);
2641     }
2642     removed = 1;
2643   }
2644   sqlite3_mutex_leave(pMaster);
2645   return removed;
2646 #else
2647   return 1;
2648 #endif
2649 }
2650 
2651 /*
2652 ** Make sure pBt->pTmpSpace points to an allocation of
2653 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
2654 ** pointer.
2655 */
2656 static void allocateTempSpace(BtShared *pBt){
2657   if( !pBt->pTmpSpace ){
2658     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2659 
2660     /* One of the uses of pBt->pTmpSpace is to format cells before
2661     ** inserting them into a leaf page (function fillInCell()). If
2662     ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
2663     ** by the various routines that manipulate binary cells. Which
2664     ** can mean that fillInCell() only initializes the first 2 or 3
2665     ** bytes of pTmpSpace, but that the first 4 bytes are copied from
2666     ** it into a database page. This is not actually a problem, but it
2667     ** does cause a valgrind error when the 1 or 2 bytes of unitialized
2668     ** data is passed to system call write(). So to avoid this error,
2669     ** zero the first 4 bytes of temp space here.
2670     **
2671     ** Also:  Provide four bytes of initialized space before the
2672     ** beginning of pTmpSpace as an area available to prepend the
2673     ** left-child pointer to the beginning of a cell.
2674     */
2675     if( pBt->pTmpSpace ){
2676       memset(pBt->pTmpSpace, 0, 8);
2677       pBt->pTmpSpace += 4;
2678     }
2679   }
2680 }
2681 
2682 /*
2683 ** Free the pBt->pTmpSpace allocation
2684 */
2685 static void freeTempSpace(BtShared *pBt){
2686   if( pBt->pTmpSpace ){
2687     pBt->pTmpSpace -= 4;
2688     sqlite3PageFree(pBt->pTmpSpace);
2689     pBt->pTmpSpace = 0;
2690   }
2691 }
2692 
2693 /*
2694 ** Close an open database and invalidate all cursors.
2695 */
2696 int sqlite3BtreeClose(Btree *p){
2697   BtShared *pBt = p->pBt;
2698   BtCursor *pCur;
2699 
2700   /* Close all cursors opened via this handle.  */
2701   assert( sqlite3_mutex_held(p->db->mutex) );
2702   sqlite3BtreeEnter(p);
2703   pCur = pBt->pCursor;
2704   while( pCur ){
2705     BtCursor *pTmp = pCur;
2706     pCur = pCur->pNext;
2707     if( pTmp->pBtree==p ){
2708       sqlite3BtreeCloseCursor(pTmp);
2709     }
2710   }
2711 
2712   /* Rollback any active transaction and free the handle structure.
2713   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2714   ** this handle.
2715   */
2716   sqlite3BtreeRollback(p, SQLITE_OK, 0);
2717   sqlite3BtreeLeave(p);
2718 
2719   /* If there are still other outstanding references to the shared-btree
2720   ** structure, return now. The remainder of this procedure cleans
2721   ** up the shared-btree.
2722   */
2723   assert( p->wantToLock==0 && p->locked==0 );
2724   if( !p->sharable || removeFromSharingList(pBt) ){
2725     /* The pBt is no longer on the sharing list, so we can access
2726     ** it without having to hold the mutex.
2727     **
2728     ** Clean out and delete the BtShared object.
2729     */
2730     assert( !pBt->pCursor );
2731     sqlite3PagerClose(pBt->pPager, p->db);
2732     if( pBt->xFreeSchema && pBt->pSchema ){
2733       pBt->xFreeSchema(pBt->pSchema);
2734     }
2735     sqlite3DbFree(0, pBt->pSchema);
2736     freeTempSpace(pBt);
2737     sqlite3_free(pBt);
2738   }
2739 
2740 #ifndef SQLITE_OMIT_SHARED_CACHE
2741   assert( p->wantToLock==0 );
2742   assert( p->locked==0 );
2743   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2744   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2745 #endif
2746 
2747   sqlite3_free(p);
2748   return SQLITE_OK;
2749 }
2750 
2751 /*
2752 ** Change the "soft" limit on the number of pages in the cache.
2753 ** Unused and unmodified pages will be recycled when the number of
2754 ** pages in the cache exceeds this soft limit.  But the size of the
2755 ** cache is allowed to grow larger than this limit if it contains
2756 ** dirty pages or pages still in active use.
2757 */
2758 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2759   BtShared *pBt = p->pBt;
2760   assert( sqlite3_mutex_held(p->db->mutex) );
2761   sqlite3BtreeEnter(p);
2762   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2763   sqlite3BtreeLeave(p);
2764   return SQLITE_OK;
2765 }
2766 
2767 /*
2768 ** Change the "spill" limit on the number of pages in the cache.
2769 ** If the number of pages exceeds this limit during a write transaction,
2770 ** the pager might attempt to "spill" pages to the journal early in
2771 ** order to free up memory.
2772 **
2773 ** The value returned is the current spill size.  If zero is passed
2774 ** as an argument, no changes are made to the spill size setting, so
2775 ** using mxPage of 0 is a way to query the current spill size.
2776 */
2777 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
2778   BtShared *pBt = p->pBt;
2779   int res;
2780   assert( sqlite3_mutex_held(p->db->mutex) );
2781   sqlite3BtreeEnter(p);
2782   res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
2783   sqlite3BtreeLeave(p);
2784   return res;
2785 }
2786 
#if SQLITE_MAX_MMAP_SIZE>0
/*
** Set the limit on the amount of the database file that may be
** memory mapped to szMmap.  The value is forwarded to
** sqlite3PagerSetMmapLimit() while the btree mutex is held.
**
** The caller must hold p->db->mutex.  Always returns SQLITE_OK.
*/
int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
  assert( sqlite3_mutex_held(p->db->mutex) );
  sqlite3BtreeEnter(p);
  sqlite3PagerSetMmapLimit(p->pBt->pPager, szMmap);
  sqlite3BtreeLeave(p);
  return SQLITE_OK;
}
#endif /* SQLITE_MAX_MMAP_SIZE>0 */
2801 
2802 /*
2803 ** Change the way data is synced to disk in order to increase or decrease
2804 ** how well the database resists damage due to OS crashes and power
2805 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2806 ** there is a high probability of damage)  Level 2 is the default.  There
2807 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2808 ** probability of damage to near zero but with a write performance reduction.
2809 */
2810 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2811 int sqlite3BtreeSetPagerFlags(
2812   Btree *p,              /* The btree to set the safety level on */
2813   unsigned pgFlags       /* Various PAGER_* flags */
2814 ){
2815   BtShared *pBt = p->pBt;
2816   assert( sqlite3_mutex_held(p->db->mutex) );
2817   sqlite3BtreeEnter(p);
2818   sqlite3PagerSetFlags(pBt->pPager, pgFlags);
2819   sqlite3BtreeLeave(p);
2820   return SQLITE_OK;
2821 }
2822 #endif
2823 
2824 /*
2825 ** Change the default pages size and the number of reserved bytes per page.
2826 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2827 ** without changing anything.
2828 **
2829 ** The page size must be a power of 2 between 512 and 65536.  If the page
2830 ** size supplied does not meet this constraint then the page size is not
2831 ** changed.
2832 **
2833 ** Page sizes are constrained to be a power of two so that the region
2834 ** of the database file used for locking (beginning at PENDING_BYTE,
2835 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2836 ** at the beginning of a page.
2837 **
2838 ** If parameter nReserve is less than zero, then the number of reserved
2839 ** bytes per page is left unchanged.
2840 **
2841 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2842 ** and autovacuum mode can no longer be changed.
2843 */
2844 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2845   int rc = SQLITE_OK;
2846   BtShared *pBt = p->pBt;
2847   assert( nReserve>=-1 && nReserve<=255 );
2848   sqlite3BtreeEnter(p);
2849 #if SQLITE_HAS_CODEC
2850   if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;
2851 #endif
2852   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2853     sqlite3BtreeLeave(p);
2854     return SQLITE_READONLY;
2855   }
2856   if( nReserve<0 ){
2857     nReserve = pBt->pageSize - pBt->usableSize;
2858   }
2859   assert( nReserve>=0 && nReserve<=255 );
2860   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2861         ((pageSize-1)&pageSize)==0 ){
2862     assert( (pageSize & 7)==0 );
2863     assert( !pBt->pCursor );
2864     pBt->pageSize = (u32)pageSize;
2865     freeTempSpace(pBt);
2866   }
2867   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2868   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2869   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2870   sqlite3BtreeLeave(p);
2871   return rc;
2872 }
2873 
2874 /*
2875 ** Return the currently defined page size
2876 */
2877 int sqlite3BtreeGetPageSize(Btree *p){
2878   return p->pBt->pageSize;
2879 }
2880 
2881 /*
2882 ** This function is similar to sqlite3BtreeGetReserve(), except that it
2883 ** may only be called if it is guaranteed that the b-tree mutex is already
2884 ** held.
2885 **
2886 ** This is useful in one special case in the backup API code where it is
2887 ** known that the shared b-tree mutex is held, but the mutex on the
2888 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
2889 ** were to be called, it might collide with some other operation on the
2890 ** database handle that owns *p, causing undefined behavior.
2891 */
2892 int sqlite3BtreeGetReserveNoMutex(Btree *p){
2893   int n;
2894   assert( sqlite3_mutex_held(p->pBt->mutex) );
2895   n = p->pBt->pageSize - p->pBt->usableSize;
2896   return n;
2897 }
2898 
2899 /*
2900 ** Return the number of bytes of space at the end of every page that
2901 ** are intentually left unused.  This is the "reserved" space that is
2902 ** sometimes used by extensions.
2903 **
2904 ** If SQLITE_HAS_MUTEX is defined then the number returned is the
2905 ** greater of the current reserved space and the maximum requested
2906 ** reserve space.
2907 */
2908 int sqlite3BtreeGetOptimalReserve(Btree *p){
2909   int n;
2910   sqlite3BtreeEnter(p);
2911   n = sqlite3BtreeGetReserveNoMutex(p);
2912 #ifdef SQLITE_HAS_CODEC
2913   if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;
2914 #endif
2915   sqlite3BtreeLeave(p);
2916   return n;
2917 }
2918 
2919 
2920 /*
2921 ** Set the maximum page count for a database if mxPage is positive.
2922 ** No changes are made if mxPage is 0 or negative.
2923 ** Regardless of the value of mxPage, return the maximum page count.
2924 */
2925 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2926   int n;
2927   sqlite3BtreeEnter(p);
2928   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2929   sqlite3BtreeLeave(p);
2930   return n;
2931 }
2932 
2933 /*
2934 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags:
2935 **
2936 **    newFlag==0       Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared
2937 **    newFlag==1       BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared
2938 **    newFlag==2       BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set
2939 **    newFlag==(-1)    No changes
2940 **
2941 ** This routine acts as a query if newFlag is less than zero
2942 **
2943 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but
2944 ** freelist leaf pages are not written back to the database.  Thus in-page
2945 ** deleted content is cleared, but freelist deleted content is not.
2946 **
2947 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition
2948 ** that freelist leaf pages are written back into the database, increasing
2949 ** the amount of disk I/O.
2950 */
2951 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2952   int b;
2953   if( p==0 ) return 0;
2954   sqlite3BtreeEnter(p);
2955   assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 );
2956   assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) );
2957   if( newFlag>=0 ){
2958     p->pBt->btsFlags &= ~BTS_FAST_SECURE;
2959     p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag;
2960   }
2961   b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE;
2962   sqlite3BtreeLeave(p);
2963   return b;
2964 }
2965 
2966 /*
2967 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2968 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2969 ** is disabled. The default value for the auto-vacuum property is
2970 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2971 */
2972 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2973 #ifdef SQLITE_OMIT_AUTOVACUUM
2974   return SQLITE_READONLY;
2975 #else
2976   BtShared *pBt = p->pBt;
2977   int rc = SQLITE_OK;
2978   u8 av = (u8)autoVacuum;
2979 
2980   sqlite3BtreeEnter(p);
2981   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
2982     rc = SQLITE_READONLY;
2983   }else{
2984     pBt->autoVacuum = av ?1:0;
2985     pBt->incrVacuum = av==2 ?1:0;
2986   }
2987   sqlite3BtreeLeave(p);
2988   return rc;
2989 #endif
2990 }
2991 
2992 /*
2993 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2994 ** enabled 1 is returned. Otherwise 0.
2995 */
2996 int sqlite3BtreeGetAutoVacuum(Btree *p){
2997 #ifdef SQLITE_OMIT_AUTOVACUUM
2998   return BTREE_AUTOVACUUM_NONE;
2999 #else
3000   int rc;
3001   sqlite3BtreeEnter(p);
3002   rc = (
3003     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
3004     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
3005     BTREE_AUTOVACUUM_INCR
3006   );
3007   sqlite3BtreeLeave(p);
3008   return rc;
3009 #endif
3010 }
3011 
/*
** If the user has not set the safety-level for this database connection
** using "PRAGMA synchronous", and if the safety-level is not already
** set to the value passed to this function as the second parameter,
** set it so and push the new level (combined with the connection's
** PAGER_FLAGS_MASK bits) down to the pager.
**
** Nothing is done if pBt has no database connection, if the connection
** has no aDb array, or if the matching entry is db->aDb[1].
**
** The search for the Db entry that uses pBt has no upper bound; it
** relies on pBt being attached to one of the entries of db->aDb.
**
** This routine only exists when the rollback-mode and WAL-mode default
** synchronous settings differ and WAL is available.  Otherwise it is
** a no-op macro.
*/
#if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \
    && !defined(SQLITE_OMIT_WAL)
static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){
  sqlite3 *db = pBt->db;
  Db *pDb;

  if( db==0 ) return;
  pDb = db->aDb;
  if( pDb==0 ) return;

  /* Advance to the Db entry whose Btree handle refers to pBt */
  while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; }

  if( pDb->bSyncSet!=0 ) return;                  /* Explicitly set already */
  if( pDb->safety_level==safety_level ) return;   /* Nothing to change */
  if( pDb==&db->aDb[1] ) return;                  /* Entry 1 is left alone */

  pDb->safety_level = safety_level;
  sqlite3PagerSetFlags(pBt->pPager,
      pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
}
#else
# define setDefaultSyncFlag(pBt,safety_level)
#endif
3038 
3039 /* Forward declaration */
3040 static int newDatabase(BtShared*);
3041 
3042 
3043 /*
3044 ** Get a reference to pPage1 of the database file.  This will
3045 ** also acquire a readlock on that file.
3046 **
3047 ** SQLITE_OK is returned on success.  If the file is not a
3048 ** well-formed database file, then SQLITE_CORRUPT is returned.
3049 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
3050 ** is returned if we run out of memory.
3051 */
3052 static int lockBtree(BtShared *pBt){
3053   int rc;              /* Result code from subfunctions */
3054   MemPage *pPage1;     /* Page 1 of the database file */
3055   u32 nPage;           /* Number of pages in the database */
3056   u32 nPageFile = 0;   /* Number of pages in the database file */
3057   u32 nPageHeader;     /* Number of pages in the database according to hdr */
3058 
3059   assert( sqlite3_mutex_held(pBt->mutex) );
3060   assert( pBt->pPage1==0 );
3061   rc = sqlite3PagerSharedLock(pBt->pPager);
3062   if( rc!=SQLITE_OK ) return rc;
3063   rc = btreeGetPage(pBt, 1, &pPage1, 0);
3064   if( rc!=SQLITE_OK ) return rc;
3065 
3066   /* Do some checking to help insure the file we opened really is
3067   ** a valid database file.
3068   */
3069   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
3070   sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile);
3071   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
3072     nPage = nPageFile;
3073   }
3074   if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){
3075     nPage = 0;
3076   }
3077   if( nPage>0 ){
3078     u32 pageSize;
3079     u32 usableSize;
3080     u8 *page1 = pPage1->aData;
3081     rc = SQLITE_NOTADB;
3082     /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
3083     ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
3084     ** 61 74 20 33 00. */
3085     if( memcmp(page1, zMagicHeader, 16)!=0 ){
3086       goto page1_init_failed;
3087     }
3088 
3089 #ifdef SQLITE_OMIT_WAL
3090     if( page1[18]>1 ){
3091       pBt->btsFlags |= BTS_READ_ONLY;
3092     }
3093     if( page1[19]>1 ){
3094       goto page1_init_failed;
3095     }
3096 #else
3097     if( page1[18]>2 ){
3098       pBt->btsFlags |= BTS_READ_ONLY;
3099     }
3100     if( page1[19]>2 ){
3101       goto page1_init_failed;
3102     }
3103 
3104     /* If the write version is set to 2, this database should be accessed
3105     ** in WAL mode. If the log is not already open, open it now. Then
3106     ** return SQLITE_OK and return without populating BtShared.pPage1.
3107     ** The caller detects this and calls this function again. This is
3108     ** required as the version of page 1 currently in the page1 buffer
3109     ** may not be the latest version - there may be a newer one in the log
3110     ** file.
3111     */
3112     if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
3113       int isOpen = 0;
3114       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
3115       if( rc!=SQLITE_OK ){
3116         goto page1_init_failed;
3117       }else{
3118         setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1);
3119         if( isOpen==0 ){
3120           releasePageOne(pPage1);
3121           return SQLITE_OK;
3122         }
3123       }
3124       rc = SQLITE_NOTADB;
3125     }else{
3126       setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1);
3127     }
3128 #endif
3129 
3130     /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
3131     ** fractions and the leaf payload fraction values must be 64, 32, and 32.
3132     **
3133     ** The original design allowed these amounts to vary, but as of
3134     ** version 3.6.0, we require them to be fixed.
3135     */
3136     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
3137       goto page1_init_failed;
3138     }
3139     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
3140     ** determined by the 2-byte integer located at an offset of 16 bytes from
3141     ** the beginning of the database file. */
3142     pageSize = (page1[16]<<8) | (page1[17]<<16);
3143     /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
3144     ** between 512 and 65536 inclusive. */
3145     if( ((pageSize-1)&pageSize)!=0
3146      || pageSize>SQLITE_MAX_PAGE_SIZE
3147      || pageSize<=256
3148     ){
3149       goto page1_init_failed;
3150     }
3151     pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3152     assert( (pageSize & 7)==0 );
3153     /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
3154     ** integer at offset 20 is the number of bytes of space at the end of
3155     ** each page to reserve for extensions.
3156     **
3157     ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
3158     ** determined by the one-byte unsigned integer found at an offset of 20
3159     ** into the database file header. */
3160     usableSize = pageSize - page1[20];
3161     if( (u32)pageSize!=pBt->pageSize ){
3162       /* After reading the first page of the database assuming a page size
3163       ** of BtShared.pageSize, we have discovered that the page-size is
3164       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
3165       ** zero and return SQLITE_OK. The caller will call this function
3166       ** again with the correct page-size.
3167       */
3168       releasePageOne(pPage1);
3169       pBt->usableSize = usableSize;
3170       pBt->pageSize = pageSize;
3171       freeTempSpace(pBt);
3172       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
3173                                    pageSize-usableSize);
3174       return rc;
3175     }
3176     if( sqlite3WritableSchema(pBt->db)==0 && nPage>nPageFile ){
3177       rc = SQLITE_CORRUPT_BKPT;
3178       goto page1_init_failed;
3179     }
3180     /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
3181     ** be less than 480. In other words, if the page size is 512, then the
3182     ** reserved space size cannot exceed 32. */
3183     if( usableSize<480 ){
3184       goto page1_init_failed;
3185     }
3186     pBt->pageSize = pageSize;
3187     pBt->usableSize = usableSize;
3188 #ifndef SQLITE_OMIT_AUTOVACUUM
3189     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
3190     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
3191 #endif
3192   }
3193 
3194   /* maxLocal is the maximum amount of payload to store locally for
3195   ** a cell.  Make sure it is small enough so that at least minFanout
3196   ** cells can will fit on one page.  We assume a 10-byte page header.
3197   ** Besides the payload, the cell must store:
3198   **     2-byte pointer to the cell
3199   **     4-byte child pointer
3200   **     9-byte nKey value
3201   **     4-byte nData value
3202   **     4-byte overflow page pointer
3203   ** So a cell consists of a 2-byte pointer, a header which is as much as
3204   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
3205   ** page pointer.
3206   */
3207   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
3208   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
3209   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
3210   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
3211   if( pBt->maxLocal>127 ){
3212     pBt->max1bytePayload = 127;
3213   }else{
3214     pBt->max1bytePayload = (u8)pBt->maxLocal;
3215   }
3216   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
3217   pBt->pPage1 = pPage1;
3218   pBt->nPage = nPage;
3219   return SQLITE_OK;
3220 
3221 page1_init_failed:
3222   releasePageOne(pPage1);
3223   pBt->pPage1 = 0;
3224   return rc;
3225 }
3226 
3227 #ifndef NDEBUG
3228 /*
3229 ** Return the number of cursors open on pBt. This is for use
3230 ** in assert() expressions, so it is only compiled if NDEBUG is not
3231 ** defined.
3232 **
3233 ** Only write cursors are counted if wrOnly is true.  If wrOnly is
3234 ** false then all cursors are counted.
3235 **
3236 ** For the purposes of this routine, a cursor is any cursor that
3237 ** is capable of reading or writing to the database.  Cursors that
3238 ** have been tripped into the CURSOR_FAULT state are not counted.
3239 */
3240 static int countValidCursors(BtShared *pBt, int wrOnly){
3241   BtCursor *pCur;
3242   int r = 0;
3243   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3244     if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
3245      && pCur->eState!=CURSOR_FAULT ) r++;
3246   }
3247   return r;
3248 }
3249 #endif
3250 
3251 /*
3252 ** If there are no outstanding cursors and we are not in the middle
3253 ** of a transaction but there is a read lock on the database, then
3254 ** this routine unrefs the first page of the database file which
3255 ** has the effect of releasing the read lock.
3256 **
3257 ** If there is a transaction in progress, this routine is a no-op.
3258 */
3259 static void unlockBtreeIfUnused(BtShared *pBt){
3260   assert( sqlite3_mutex_held(pBt->mutex) );
3261   assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
3262   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
3263     MemPage *pPage1 = pBt->pPage1;
3264     assert( pPage1->aData );
3265     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
3266     pBt->pPage1 = 0;
3267     releasePageOne(pPage1);
3268   }
3269 }
3270 
3271 /*
3272 ** If pBt points to an empty file then convert that empty file
3273 ** into a new empty database by initializing the first page of
3274 ** the database.
3275 */
3276 static int newDatabase(BtShared *pBt){
3277   MemPage *pP1;
3278   unsigned char *data;
3279   int rc;
3280 
3281   assert( sqlite3_mutex_held(pBt->mutex) );
3282   if( pBt->nPage>0 ){
3283     return SQLITE_OK;
3284   }
3285   pP1 = pBt->pPage1;
3286   assert( pP1!=0 );
3287   data = pP1->aData;
3288   rc = sqlite3PagerWrite(pP1->pDbPage);
3289   if( rc ) return rc;
3290   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
3291   assert( sizeof(zMagicHeader)==16 );
3292   data[16] = (u8)((pBt->pageSize>>8)&0xff);
3293   data[17] = (u8)((pBt->pageSize>>16)&0xff);
3294   data[18] = 1;
3295   data[19] = 1;
3296   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
3297   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
3298   data[21] = 64;
3299   data[22] = 32;
3300   data[23] = 32;
3301   memset(&data[24], 0, 100-24);
3302   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
3303   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3304 #ifndef SQLITE_OMIT_AUTOVACUUM
3305   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
3306   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
3307   put4byte(&data[36 + 4*4], pBt->autoVacuum);
3308   put4byte(&data[36 + 7*4], pBt->incrVacuum);
3309 #endif
3310   pBt->nPage = 1;
3311   data[31] = 1;
3312   return SQLITE_OK;
3313 }
3314 
3315 /*
3316 ** Initialize the first page of the database file (creating a database
3317 ** consisting of a single page and no schema objects). Return SQLITE_OK
3318 ** if successful, or an SQLite error code otherwise.
3319 */
3320 int sqlite3BtreeNewDb(Btree *p){
3321   int rc;
3322   sqlite3BtreeEnter(p);
3323   p->pBt->nPage = 0;
3324   rc = newDatabase(p->pBt);
3325   sqlite3BtreeLeave(p);
3326   return rc;
3327 }
3328 
/*
** Attempt to start a new transaction. A write-transaction
** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more, an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any
** changes to the database.  None of the following routines
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
**
** Parameters:
**
**   p               The btree handle on which to open the transaction.
**   wrflag          0 for a read transaction, 1 for a write transaction,
**                   2 or more for an exclusive write transaction.
**   pSchemaVersion  If not NULL and the call succeeds, *pSchemaVersion is
**                   set to the 4-byte value stored at offset 40 of page 1.
**
** Returns SQLITE_OK on success.  Among the error codes produced directly
** by this routine are SQLITE_READONLY (write requested on a read-only
** database), SQLITE_LOCKED_SHAREDCACHE (blocked by another shared-cache
** connection) and SQLITE_BUSY.
*/
int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){
  BtShared *pBt = p->pBt;
  int rc = SQLITE_OK;

  sqlite3BtreeEnter(p);
  btreeIntegrity(p);

  /* If the btree is already in a write-transaction, or it
  ** is already in a read-transaction and a read-transaction
  ** is requested, this is a no-op.
  */
  if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
    goto trans_begun;
  }
  assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );

  /* When the connection has SQLITE_ResetDatabase set and the pager itself
  ** is writable, clear the btree-level read-only flag. */
  if( (p->db->flags & SQLITE_ResetDatabase)
   && sqlite3PagerIsreadonly(pBt->pPager)==0
  ){
    pBt->btsFlags &= ~BTS_READ_ONLY;
  }

  /* Write transactions are not possible on a read-only database */
  if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
    rc = SQLITE_READONLY;
    goto trans_begun;
  }

#ifndef SQLITE_OMIT_SHARED_CACHE
  {
    sqlite3 *pBlock = 0;
    /* If another database handle has already opened a write transaction
    ** on this shared-btree structure and a second write transaction is
    ** requested, or if the BTS_PENDING flag is set, the connection that
    ** owns the writer is the blocker.  For an exclusive request, any other
    ** Btree holding a lock on the shared structure is the blocker.  In
    ** either case return SQLITE_LOCKED_SHAREDCACHE.
    */
    if( (wrflag && pBt->inTransaction==TRANS_WRITE)
     || (pBt->btsFlags & BTS_PENDING)!=0
    ){
      pBlock = pBt->pWriter->db;
    }else if( wrflag>1 ){
      BtLock *pIter;
      for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
        if( pIter->pBtree!=p ){
          pBlock = pIter->pBtree->db;
          break;
        }
      }
    }
    if( pBlock ){
      sqlite3ConnectionBlocked(p->db, pBlock);
      rc = SQLITE_LOCKED_SHAREDCACHE;
      goto trans_begun;
    }
  }
#endif

  /* Any read-only or read-write transaction implies a read-lock on
  ** page 1. So if some other shared-cache client already has a write-lock
  ** on page 1, the transaction cannot be opened. */
  rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
  if( SQLITE_OK!=rc ) goto trans_begun;

  /* Remember whether the database had zero pages when the transaction
  ** was opened. */
  pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
  if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
  do {
    /* Call lockBtree() until either pBt->pPage1 is populated or
    ** lockBtree() returns something other than SQLITE_OK. lockBtree()
    ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
    ** reading page 1 it discovers that the page-size of the database
    ** file is not pBt->pageSize. In this case lockBtree() will update
    ** pBt->pageSize to the page-size of the file on disk.
    */
    while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

    if( rc==SQLITE_OK && wrflag ){
      if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
        rc = SQLITE_READONLY;
      }else{
        rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
        if( rc==SQLITE_OK ){
          rc = newDatabase(pBt);
        }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){
          /* if there was no transaction opened when this function was
          ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error
          ** code to SQLITE_BUSY. */
          rc = SQLITE_BUSY;
        }
      }
    }

    /* On failure, give back page 1 (and with it the read lock) if nothing
    ** else is using it, so that a retry starts from a clean state. */
    if( rc!=SQLITE_OK ){
      unlockBtreeIfUnused(pBt);
    }
  }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
          btreeInvokeBusyHandler(pBt) );
  sqlite3PagerResetLockTimeout(pBt->pPager);

  if( rc==SQLITE_OK ){
    if( p->inTrans==TRANS_NONE ){
      /* This handle is joining the set of handles with an open
      ** transaction on the shared btree. */
      pBt->nTransaction++;
#ifndef SQLITE_OMIT_SHARED_CACHE
      if( p->sharable ){
        assert( p->lock.pBtree==p && p->lock.iTable==1 );
        p->lock.eLock = READ_LOCK;
        p->lock.pNext = pBt->pLock;
        pBt->pLock = &p->lock;
      }
#endif
    }
    p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
    /* The shared state only ever moves up to the strongest transaction
    ** level held by any handle. */
    if( p->inTrans>pBt->inTransaction ){
      pBt->inTransaction = p->inTrans;
    }
    if( wrflag ){
      MemPage *pPage1 = pBt->pPage1;
#ifndef SQLITE_OMIT_SHARED_CACHE
      assert( !pBt->pWriter );
      pBt->pWriter = p;
      pBt->btsFlags &= ~BTS_EXCLUSIVE;
      if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
#endif

      /* If the db-size header field is incorrect (as it may be if an old
      ** client has been writing the database file), update it now. Doing
      ** this sooner rather than later means the database size can safely
      ** be re-read from page 1 if a savepoint or transaction rollback
      ** occurs within the transaction.
      */
      if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
        rc = sqlite3PagerWrite(pPage1->pDbPage);
        if( rc==SQLITE_OK ){
          put4byte(&pPage1->aData[28], pBt->nPage);
        }
      }
    }
  }

trans_begun:
  if( rc==SQLITE_OK ){
    /* Report the value stored at offset 40 of page 1 to the caller, if
    ** requested. */
    if( pSchemaVersion ){
      *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]);
    }
    if( wrflag ){
      /* This call makes sure that the pager has the correct number of
      ** open savepoints. If the second parameter is greater than 0 and
      ** the sub-journal is not already open, then it will be opened here.
      */
      rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
    }
  }

  btreeIntegrity(p);
  sqlite3BtreeLeave(p);
  return rc;
}
3519 
3520 #ifndef SQLITE_OMIT_AUTOVACUUM
3521 
3522 /*
3523 ** Set the pointer-map entries for all children of page pPage. Also, if
3524 ** pPage contains cells that point to overflow pages, set the pointer
3525 ** map entries for the overflow pages as well.
3526 */
3527 static int setChildPtrmaps(MemPage *pPage){
3528   int i;                             /* Counter variable */
3529   int nCell;                         /* Number of cells in page pPage */
3530   int rc;                            /* Return code */
3531   BtShared *pBt = pPage->pBt;
3532   Pgno pgno = pPage->pgno;
3533 
3534   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3535   rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3536   if( rc!=SQLITE_OK ) return rc;
3537   nCell = pPage->nCell;
3538 
3539   for(i=0; i<nCell; i++){
3540     u8 *pCell = findCell(pPage, i);
3541 
3542     ptrmapPutOvflPtr(pPage, pPage, pCell, &rc);
3543 
3544     if( !pPage->leaf ){
3545       Pgno childPgno = get4byte(pCell);
3546       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3547     }
3548   }
3549 
3550   if( !pPage->leaf ){
3551     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3552     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3553   }
3554 
3555   return rc;
3556 }
3557 
3558 /*
3559 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
3560 ** that it points to iTo. Parameter eType describes the type of pointer to
3561 ** be modified, as  follows:
3562 **
3563 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
3564 **                   page of pPage.
3565 **
3566 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
3567 **                   page pointed to by one of the cells on pPage.
3568 **
3569 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
3570 **                   overflow page in the list.
3571 */
3572 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
3573   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3574   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
3575   if( eType==PTRMAP_OVERFLOW2 ){
3576     /* The pointer is always the first 4 bytes of the page in this case.  */
3577     if( get4byte(pPage->aData)!=iFrom ){
3578       return SQLITE_CORRUPT_PAGE(pPage);
3579     }
3580     put4byte(pPage->aData, iTo);
3581   }else{
3582     int i;
3583     int nCell;
3584     int rc;
3585 
3586     rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3587     if( rc ) return rc;
3588     nCell = pPage->nCell;
3589 
3590     for(i=0; i<nCell; i++){
3591       u8 *pCell = findCell(pPage, i);
3592       if( eType==PTRMAP_OVERFLOW1 ){
3593         CellInfo info;
3594         pPage->xParseCell(pPage, pCell, &info);
3595         if( info.nLocal<info.nPayload ){
3596           if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
3597             return SQLITE_CORRUPT_PAGE(pPage);
3598           }
3599           if( iFrom==get4byte(pCell+info.nSize-4) ){
3600             put4byte(pCell+info.nSize-4, iTo);
3601             break;
3602           }
3603         }
3604       }else{
3605         if( get4byte(pCell)==iFrom ){
3606           put4byte(pCell, iTo);
3607           break;
3608         }
3609       }
3610     }
3611 
3612     if( i==nCell ){
3613       if( eType!=PTRMAP_BTREE ||
3614           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
3615         return SQLITE_CORRUPT_PAGE(pPage);
3616       }
3617       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
3618     }
3619   }
3620   return SQLITE_OK;
3621 }
3622 
3623 
3624 /*
3625 ** Move the open database page pDbPage to location iFreePage in the
3626 ** database. The pDbPage reference remains valid.
3627 **
3628 ** The isCommit flag indicates that there is no need to remember that
3629 ** the journal needs to be sync()ed before database page pDbPage->pgno
3630 ** can be written to. The caller has already promised not to write to that
3631 ** page.
3632 */
3633 static int relocatePage(
3634   BtShared *pBt,           /* Btree */
3635   MemPage *pDbPage,        /* Open page to move */
3636   u8 eType,                /* Pointer map 'type' entry for pDbPage */
3637   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
3638   Pgno iFreePage,          /* The location to move pDbPage to */
3639   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
3640 ){
3641   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
3642   Pgno iDbPage = pDbPage->pgno;
3643   Pager *pPager = pBt->pPager;
3644   int rc;
3645 
3646   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
3647       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
3648   assert( sqlite3_mutex_held(pBt->mutex) );
3649   assert( pDbPage->pBt==pBt );
3650   if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT;
3651 
3652   /* Move page iDbPage from its current location to page number iFreePage */
3653   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
3654       iDbPage, iFreePage, iPtrPage, eType));
3655   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
3656   if( rc!=SQLITE_OK ){
3657     return rc;
3658   }
3659   pDbPage->pgno = iFreePage;
3660 
3661   /* If pDbPage was a btree-page, then it may have child pages and/or cells
3662   ** that point to overflow pages. The pointer map entries for all these
3663   ** pages need to be changed.
3664   **
3665   ** If pDbPage is an overflow page, then the first 4 bytes may store a
3666   ** pointer to a subsequent overflow page. If this is the case, then
3667   ** the pointer map needs to be updated for the subsequent overflow page.
3668   */
3669   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
3670     rc = setChildPtrmaps(pDbPage);
3671     if( rc!=SQLITE_OK ){
3672       return rc;
3673     }
3674   }else{
3675     Pgno nextOvfl = get4byte(pDbPage->aData);
3676     if( nextOvfl!=0 ){
3677       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
3678       if( rc!=SQLITE_OK ){
3679         return rc;
3680       }
3681     }
3682   }
3683 
3684   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
3685   ** that it points at iFreePage. Also fix the pointer map entry for
3686   ** iPtrPage.
3687   */
3688   if( eType!=PTRMAP_ROOTPAGE ){
3689     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
3690     if( rc!=SQLITE_OK ){
3691       return rc;
3692     }
3693     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
3694     if( rc!=SQLITE_OK ){
3695       releasePage(pPtrPage);
3696       return rc;
3697     }
3698     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
3699     releasePage(pPtrPage);
3700     if( rc==SQLITE_OK ){
3701       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
3702     }
3703   }
3704   return rc;
3705 }
3706 
3707 /* Forward declaration required by incrVacuumStep(). */
3708 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
3709 
3710 /*
3711 ** Perform a single step of an incremental-vacuum. If successful, return
3712 ** SQLITE_OK. If there is no work to do (and therefore no point in
3713 ** calling this function again), return SQLITE_DONE. Or, if an error
3714 ** occurs, return some other error code.
3715 **
3716 ** More specifically, this function attempts to re-organize the database so
3717 ** that the last page of the file currently in use is no longer in use.
3718 **
3719 ** Parameter nFin is the number of pages that this database would contain
3720 ** were this function called until it returns SQLITE_DONE.
3721 **
3722 ** If the bCommit parameter is non-zero, this function assumes that the
3723 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
3724 ** or an error. bCommit is passed true for an auto-vacuum-on-commit
3725 ** operation, or false for an incremental vacuum.
3726 */
3727 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
3728   Pgno nFreeList;           /* Number of pages still on the free-list */
3729   int rc;
3730 
3731   assert( sqlite3_mutex_held(pBt->mutex) );
3732   assert( iLastPg>nFin );
3733 
3734   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
3735     u8 eType;
3736     Pgno iPtrPage;
3737 
3738     nFreeList = get4byte(&pBt->pPage1->aData[36]);
3739     if( nFreeList==0 ){
3740       return SQLITE_DONE;
3741     }
3742 
3743     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
3744     if( rc!=SQLITE_OK ){
3745       return rc;
3746     }
3747     if( eType==PTRMAP_ROOTPAGE ){
3748       return SQLITE_CORRUPT_BKPT;
3749     }
3750 
3751     if( eType==PTRMAP_FREEPAGE ){
3752       if( bCommit==0 ){
3753         /* Remove the page from the files free-list. This is not required
3754         ** if bCommit is non-zero. In that case, the free-list will be
3755         ** truncated to zero after this function returns, so it doesn't
3756         ** matter if it still contains some garbage entries.
3757         */
3758         Pgno iFreePg;
3759         MemPage *pFreePg;
3760         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
3761         if( rc!=SQLITE_OK ){
3762           return rc;
3763         }
3764         assert( iFreePg==iLastPg );
3765         releasePage(pFreePg);
3766       }
3767     } else {
3768       Pgno iFreePg;             /* Index of free page to move pLastPg to */
3769       MemPage *pLastPg;
3770       u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
3771       Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
3772 
3773       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
3774       if( rc!=SQLITE_OK ){
3775         return rc;
3776       }
3777 
3778       /* If bCommit is zero, this loop runs exactly once and page pLastPg
3779       ** is swapped with the first free page pulled off the free list.
3780       **
3781       ** On the other hand, if bCommit is greater than zero, then keep
3782       ** looping until a free-page located within the first nFin pages
3783       ** of the file is found.
3784       */
3785       if( bCommit==0 ){
3786         eMode = BTALLOC_LE;
3787         iNear = nFin;
3788       }
3789       do {
3790         MemPage *pFreePg;
3791         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
3792         if( rc!=SQLITE_OK ){
3793           releasePage(pLastPg);
3794           return rc;
3795         }
3796         releasePage(pFreePg);
3797       }while( bCommit && iFreePg>nFin );
3798       assert( iFreePg<iLastPg );
3799 
3800       rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
3801       releasePage(pLastPg);
3802       if( rc!=SQLITE_OK ){
3803         return rc;
3804       }
3805     }
3806   }
3807 
3808   if( bCommit==0 ){
3809     do {
3810       iLastPg--;
3811     }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
3812     pBt->bDoTruncate = 1;
3813     pBt->nPage = iLastPg;
3814   }
3815   return SQLITE_OK;
3816 }
3817 
3818 /*
3819 ** The database opened by the first argument is an auto-vacuum database
3820 ** nOrig pages in size containing nFree free pages. Return the expected
3821 ** size of the database in pages following an auto-vacuum operation.
3822 */
3823 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
3824   int nEntry;                     /* Number of entries on one ptrmap page */
3825   Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
3826   Pgno nFin;                      /* Return value */
3827 
3828   nEntry = pBt->usableSize/5;
3829   nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3830   nFin = nOrig - nFree - nPtrmap;
3831   if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3832     nFin--;
3833   }
3834   while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3835     nFin--;
3836   }
3837 
3838   return nFin;
3839 }
3840 
3841 /*
3842 ** A write-transaction must be opened before calling this function.
3843 ** It performs a single unit of work towards an incremental vacuum.
3844 **
3845 ** If the incremental vacuum is finished after this function has run,
3846 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3847 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3848 */
3849 int sqlite3BtreeIncrVacuum(Btree *p){
3850   int rc;
3851   BtShared *pBt = p->pBt;
3852 
3853   sqlite3BtreeEnter(p);
3854   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3855   if( !pBt->autoVacuum ){
3856     rc = SQLITE_DONE;
3857   }else{
3858     Pgno nOrig = btreePagecount(pBt);
3859     Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
3860     Pgno nFin = finalDbSize(pBt, nOrig, nFree);
3861 
3862     if( nOrig<nFin ){
3863       rc = SQLITE_CORRUPT_BKPT;
3864     }else if( nFree>0 ){
3865       rc = saveAllCursors(pBt, 0, 0);
3866       if( rc==SQLITE_OK ){
3867         invalidateAllOverflowCache(pBt);
3868         rc = incrVacuumStep(pBt, nFin, nOrig, 0);
3869       }
3870       if( rc==SQLITE_OK ){
3871         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3872         put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3873       }
3874     }else{
3875       rc = SQLITE_DONE;
3876     }
3877   }
3878   sqlite3BtreeLeave(p);
3879   return rc;
3880 }
3881 
3882 /*
3883 ** This routine is called prior to sqlite3PagerCommit when a transaction
3884 ** is committed for an auto-vacuum database.
3885 **
3886 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3887 ** the database file should be truncated to during the commit process.
3888 ** i.e. the database has been reorganized so that only the first *pnTrunc
3889 ** pages are in use.
3890 */
3891 static int autoVacuumCommit(BtShared *pBt){
3892   int rc = SQLITE_OK;
3893   Pager *pPager = pBt->pPager;
3894   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
3895 
3896   assert( sqlite3_mutex_held(pBt->mutex) );
3897   invalidateAllOverflowCache(pBt);
3898   assert(pBt->autoVacuum);
3899   if( !pBt->incrVacuum ){
3900     Pgno nFin;         /* Number of pages in database after autovacuuming */
3901     Pgno nFree;        /* Number of pages on the freelist initially */
3902     Pgno iFree;        /* The next page to be freed */
3903     Pgno nOrig;        /* Database size before freeing */
3904 
3905     nOrig = btreePagecount(pBt);
3906     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3907       /* It is not possible to create a database for which the final page
3908       ** is either a pointer-map page or the pending-byte page. If one
3909       ** is encountered, this indicates corruption.
3910       */
3911       return SQLITE_CORRUPT_BKPT;
3912     }
3913 
3914     nFree = get4byte(&pBt->pPage1->aData[36]);
3915     nFin = finalDbSize(pBt, nOrig, nFree);
3916     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3917     if( nFin<nOrig ){
3918       rc = saveAllCursors(pBt, 0, 0);
3919     }
3920     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3921       rc = incrVacuumStep(pBt, nFin, iFree, 1);
3922     }
3923     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3924       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3925       put4byte(&pBt->pPage1->aData[32], 0);
3926       put4byte(&pBt->pPage1->aData[36], 0);
3927       put4byte(&pBt->pPage1->aData[28], nFin);
3928       pBt->bDoTruncate = 1;
3929       pBt->nPage = nFin;
3930     }
3931     if( rc!=SQLITE_OK ){
3932       sqlite3PagerRollback(pPager);
3933     }
3934   }
3935 
3936   assert( nRef>=sqlite3PagerRefcount(pPager) );
3937   return rc;
3938 }
3939 
3940 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
/* When SQLITE_OMIT_AUTOVACUUM is defined the real setChildPtrmaps() is not
** compiled, so any use of it expands to the constant SQLITE_OK. */
# define setChildPtrmaps(x) SQLITE_OK
3942 #endif
3943 
3944 /*
3945 ** This routine does the first phase of a two-phase commit.  This routine
3946 ** causes a rollback journal to be created (if it does not already exist)
3947 ** and populated with enough information so that if a power loss occurs
3948 ** the database can be restored to its original state by playing back
3949 ** the journal.  Then the contents of the journal are flushed out to
3950 ** the disk.  After the journal is safely on oxide, the changes to the
3951 ** database are written into the database file and flushed to oxide.
3952 ** At the end of this call, the rollback journal still exists on the
3953 ** disk and we are still holding all locks, so the transaction has not
3954 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3955 ** commit process.
3956 **
3957 ** This call is a no-op if no write-transaction is currently active on pBt.
3958 **
3959 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3960 ** the name of a master journal file that should be written into the
3961 ** individual journal file, or is NULL, indicating no master journal file
3962 ** (single database transaction).
3963 **
3964 ** When this is called, the master journal should already have been
3965 ** created, populated with this journal pointer and synced to disk.
3966 **
3967 ** Once this is routine has returned, the only thing required to commit
3968 ** the write-transaction for this database file is to delete the journal.
3969 */
3970 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3971   int rc = SQLITE_OK;
3972   if( p->inTrans==TRANS_WRITE ){
3973     BtShared *pBt = p->pBt;
3974     sqlite3BtreeEnter(p);
3975 #ifndef SQLITE_OMIT_AUTOVACUUM
3976     if( pBt->autoVacuum ){
3977       rc = autoVacuumCommit(pBt);
3978       if( rc!=SQLITE_OK ){
3979         sqlite3BtreeLeave(p);
3980         return rc;
3981       }
3982     }
3983     if( pBt->bDoTruncate ){
3984       sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
3985     }
3986 #endif
3987     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3988     sqlite3BtreeLeave(p);
3989   }
3990   return rc;
3991 }
3992 
3993 /*
3994 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3995 ** at the conclusion of a transaction.
3996 */
3997 static void btreeEndTransaction(Btree *p){
3998   BtShared *pBt = p->pBt;
3999   sqlite3 *db = p->db;
4000   assert( sqlite3BtreeHoldsMutex(p) );
4001 
4002 #ifndef SQLITE_OMIT_AUTOVACUUM
4003   pBt->bDoTruncate = 0;
4004 #endif
4005   if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
4006     /* If there are other active statements that belong to this database
4007     ** handle, downgrade to a read-only transaction. The other statements
4008     ** may still be reading from the database.  */
4009     downgradeAllSharedCacheTableLocks(p);
4010     p->inTrans = TRANS_READ;
4011   }else{
4012     /* If the handle had any kind of transaction open, decrement the
4013     ** transaction count of the shared btree. If the transaction count
4014     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
4015     ** call below will unlock the pager.  */
4016     if( p->inTrans!=TRANS_NONE ){
4017       clearAllSharedCacheTableLocks(p);
4018       pBt->nTransaction--;
4019       if( 0==pBt->nTransaction ){
4020         pBt->inTransaction = TRANS_NONE;
4021       }
4022     }
4023 
4024     /* Set the current transaction state to TRANS_NONE and unlock the
4025     ** pager if this call closed the only read or write transaction.  */
4026     p->inTrans = TRANS_NONE;
4027     unlockBtreeIfUnused(pBt);
4028   }
4029 
4030   btreeIntegrity(p);
4031 }
4032 
4033 /*
4034 ** Commit the transaction currently in progress.
4035 **
4036 ** This routine implements the second phase of a 2-phase commit.  The
4037 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
4038 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
4039 ** routine did all the work of writing information out to disk and flushing the
4040 ** contents so that they are written onto the disk platter.  All this
4041 ** routine has to do is delete or truncate or zero the header in the
4042 ** the rollback journal (which causes the transaction to commit) and
4043 ** drop locks.
4044 **
4045 ** Normally, if an error occurs while the pager layer is attempting to
4046 ** finalize the underlying journal file, this function returns an error and
4047 ** the upper layer will attempt a rollback. However, if the second argument
4048 ** is non-zero then this b-tree transaction is part of a multi-file
4049 ** transaction. In this case, the transaction has already been committed
4050 ** (by deleting a master journal file) and the caller will ignore this
4051 ** functions return code. So, even if an error occurs in the pager layer,
4052 ** reset the b-tree objects internal state to indicate that the write
4053 ** transaction has been closed. This is quite safe, as the pager will have
4054 ** transitioned to the error state.
4055 **
4056 ** This will release the write lock on the database file.  If there
4057 ** are no active cursors, it also releases the read lock.
4058 */
4059 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
4060 
4061   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
4062   sqlite3BtreeEnter(p);
4063   btreeIntegrity(p);
4064 
4065   /* If the handle has a write-transaction open, commit the shared-btrees
4066   ** transaction and set the shared state to TRANS_READ.
4067   */
4068   if( p->inTrans==TRANS_WRITE ){
4069     int rc;
4070     BtShared *pBt = p->pBt;
4071     assert( pBt->inTransaction==TRANS_WRITE );
4072     assert( pBt->nTransaction>0 );
4073     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
4074     if( rc!=SQLITE_OK && bCleanup==0 ){
4075       sqlite3BtreeLeave(p);
4076       return rc;
4077     }
4078     p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
4079     pBt->inTransaction = TRANS_READ;
4080     btreeClearHasContent(pBt);
4081   }
4082 
4083   btreeEndTransaction(p);
4084   sqlite3BtreeLeave(p);
4085   return SQLITE_OK;
4086 }
4087 
4088 /*
4089 ** Do both phases of a commit.
4090 */
4091 int sqlite3BtreeCommit(Btree *p){
4092   int rc;
4093   sqlite3BtreeEnter(p);
4094   rc = sqlite3BtreeCommitPhaseOne(p, 0);
4095   if( rc==SQLITE_OK ){
4096     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
4097   }
4098   sqlite3BtreeLeave(p);
4099   return rc;
4100 }
4101 
4102 /*
4103 ** This routine sets the state to CURSOR_FAULT and the error
4104 ** code to errCode for every cursor on any BtShared that pBtree
4105 ** references.  Or if the writeOnly flag is set to 1, then only
4106 ** trip write cursors and leave read cursors unchanged.
4107 **
4108 ** Every cursor is a candidate to be tripped, including cursors
4109 ** that belong to other database connections that happen to be
4110 ** sharing the cache with pBtree.
4111 **
4112 ** This routine gets called when a rollback occurs. If the writeOnly
4113 ** flag is true, then only write-cursors need be tripped - read-only
4114 ** cursors save their current positions so that they may continue
4115 ** following the rollback. Or, if writeOnly is false, all cursors are
4116 ** tripped. In general, writeOnly is false if the transaction being
4117 ** rolled back modified the database schema. In this case b-tree root
4118 ** pages may be moved or deleted from the database altogether, making
4119 ** it unsafe for read cursors to continue.
4120 **
4121 ** If the writeOnly flag is true and an error is encountered while
4122 ** saving the current position of a read-only cursor, all cursors,
4123 ** including all read-cursors are tripped.
4124 **
4125 ** SQLITE_OK is returned if successful, or if an error occurs while
4126 ** saving a cursor position, an SQLite error code.
4127 */
4128 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
4129   BtCursor *p;
4130   int rc = SQLITE_OK;
4131 
4132   assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
4133   if( pBtree ){
4134     sqlite3BtreeEnter(pBtree);
4135     for(p=pBtree->pBt->pCursor; p; p=p->pNext){
4136       if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
4137         if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
4138           rc = saveCursorPosition(p);
4139           if( rc!=SQLITE_OK ){
4140             (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
4141             break;
4142           }
4143         }
4144       }else{
4145         sqlite3BtreeClearCursor(p);
4146         p->eState = CURSOR_FAULT;
4147         p->skipNext = errCode;
4148       }
4149       btreeReleaseAllCursorPages(p);
4150     }
4151     sqlite3BtreeLeave(pBtree);
4152   }
4153   return rc;
4154 }
4155 
4156 /*
4157 ** Set the pBt->nPage field correctly, according to the current
4158 ** state of the database.  Assume pBt->pPage1 is valid.
4159 */
4160 static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){
4161   int nPage = get4byte(&pPage1->aData[28]);
4162   testcase( nPage==0 );
4163   if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
4164   testcase( pBt->nPage!=nPage );
4165   pBt->nPage = nPage;
4166 }
4167 
4168 /*
4169 ** Rollback the transaction in progress.
4170 **
4171 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
4172 ** Only write cursors are tripped if writeOnly is true but all cursors are
4173 ** tripped if writeOnly is false.  Any attempt to use
4174 ** a tripped cursor will result in an error.
4175 **
4176 ** This will release the write lock on the database file.  If there
4177 ** are no active cursors, it also releases the read lock.
4178 */
4179 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
4180   int rc;
4181   BtShared *pBt = p->pBt;
4182   MemPage *pPage1;
4183 
4184   assert( writeOnly==1 || writeOnly==0 );
4185   assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
4186   sqlite3BtreeEnter(p);
4187   if( tripCode==SQLITE_OK ){
4188     rc = tripCode = saveAllCursors(pBt, 0, 0);
4189     if( rc ) writeOnly = 0;
4190   }else{
4191     rc = SQLITE_OK;
4192   }
4193   if( tripCode ){
4194     int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
4195     assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
4196     if( rc2!=SQLITE_OK ) rc = rc2;
4197   }
4198   btreeIntegrity(p);
4199 
4200   if( p->inTrans==TRANS_WRITE ){
4201     int rc2;
4202 
4203     assert( TRANS_WRITE==pBt->inTransaction );
4204     rc2 = sqlite3PagerRollback(pBt->pPager);
4205     if( rc2!=SQLITE_OK ){
4206       rc = rc2;
4207     }
4208 
4209     /* The rollback may have destroyed the pPage1->aData value.  So
4210     ** call btreeGetPage() on page 1 again to make
4211     ** sure pPage1->aData is set correctly. */
4212     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
4213       btreeSetNPage(pBt, pPage1);
4214       releasePageOne(pPage1);
4215     }
4216     assert( countValidCursors(pBt, 1)==0 );
4217     pBt->inTransaction = TRANS_READ;
4218     btreeClearHasContent(pBt);
4219   }
4220 
4221   btreeEndTransaction(p);
4222   sqlite3BtreeLeave(p);
4223   return rc;
4224 }
4225 
4226 /*
4227 ** Start a statement subtransaction. The subtransaction can be rolled
4228 ** back independently of the main transaction. You must start a transaction
4229 ** before starting a subtransaction. The subtransaction is ended automatically
4230 ** if the main transaction commits or rolls back.
4231 **
4232 ** Statement subtransactions are used around individual SQL statements
4233 ** that are contained within a BEGIN...COMMIT block.  If a constraint
4234 ** error occurs within the statement, the effect of that one statement
4235 ** can be rolled back without having to rollback the entire transaction.
4236 **
4237 ** A statement sub-transaction is implemented as an anonymous savepoint. The
4238 ** value passed as the second parameter is the total number of savepoints,
4239 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
4240 ** are no active savepoints and no other statement-transactions open,
4241 ** iStatement is 1. This anonymous savepoint can be released or rolled back
4242 ** using the sqlite3BtreeSavepoint() function.
4243 */
4244 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
4245   int rc;
4246   BtShared *pBt = p->pBt;
4247   sqlite3BtreeEnter(p);
4248   assert( p->inTrans==TRANS_WRITE );
4249   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
4250   assert( iStatement>0 );
4251   assert( iStatement>p->db->nSavepoint );
4252   assert( pBt->inTransaction==TRANS_WRITE );
4253   /* At the pager level, a statement transaction is a savepoint with
4254   ** an index greater than all savepoints created explicitly using
4255   ** SQL statements. It is illegal to open, release or rollback any
4256   ** such savepoints while the statement transaction savepoint is active.
4257   */
4258   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
4259   sqlite3BtreeLeave(p);
4260   return rc;
4261 }
4262 
4263 /*
4264 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
4265 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
4266 ** savepoint identified by parameter iSavepoint, depending on the value
4267 ** of op.
4268 **
4269 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
4270 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
4271 ** contents of the entire transaction are rolled back. This is different
4272 ** from a normal transaction rollback, as no locks are released and the
4273 ** transaction remains open.
4274 */
4275 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
4276   int rc = SQLITE_OK;
4277   if( p && p->inTrans==TRANS_WRITE ){
4278     BtShared *pBt = p->pBt;
4279     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
4280     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
4281     sqlite3BtreeEnter(p);
4282     if( op==SAVEPOINT_ROLLBACK ){
4283       rc = saveAllCursors(pBt, 0, 0);
4284     }
4285     if( rc==SQLITE_OK ){
4286       rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
4287     }
4288     if( rc==SQLITE_OK ){
4289       if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
4290         pBt->nPage = 0;
4291       }
4292       rc = newDatabase(pBt);
4293       btreeSetNPage(pBt, pBt->pPage1);
4294 
4295       /* pBt->nPage might be zero if the database was corrupt when
4296       ** the transaction was started. Otherwise, it must be at least 1.  */
4297       assert( CORRUPT_DB || pBt->nPage>0 );
4298     }
4299     sqlite3BtreeLeave(p);
4300   }
4301   return rc;
4302 }
4303 
4304 /*
4305 ** Create a new cursor for the BTree whose root is on the page
4306 ** iTable. If a read-only cursor is requested, it is assumed that
4307 ** the caller already has at least a read-only transaction open
4308 ** on the database already. If a write-cursor is requested, then
4309 ** the caller is assumed to have an open write transaction.
4310 **
4311 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
4312 ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
4313 ** can be used for reading or for writing if other conditions for writing
4314 ** are also met.  These are the conditions that must be met in order
4315 ** for writing to be allowed:
4316 **
4317 ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
4318 **
4319 ** 2:  Other database connections that share the same pager cache
4320 **     but which are not in the READ_UNCOMMITTED state may not have
4321 **     cursors open with wrFlag==0 on the same table.  Otherwise
4322 **     the changes made by this write cursor would be visible to
4323 **     the read cursors in the other database connection.
4324 **
4325 ** 3:  The database must be writable (not on read-only media)
4326 **
4327 ** 4:  There must be an active transaction.
4328 **
4329 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
4330 ** is set.  If FORDELETE is set, that is a hint to the implementation that
4331 ** this cursor will only be used to seek to and delete entries of an index
4332 ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
4333 ** this implementation.  But in a hypothetical alternative storage engine
4334 ** in which index entries are automatically deleted when corresponding table
4335 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
4336 ** operations on this cursor can be no-ops and all READ operations can
4337 ** return a null row (2-bytes: 0x01 0x00).
4338 **
4339 ** No checking is done to make sure that page iTable really is the
4340 ** root page of a b-tree.  If it is not, then the cursor acquired
4341 ** will not work correctly.
4342 **
4343 ** It is assumed that the sqlite3BtreeCursorZero() has been called
4344 ** on pCur to initialize the memory space prior to invoking this routine.
4345 */
4346 static int btreeCursor(
4347   Btree *p,                              /* The btree */
4348   int iTable,                            /* Root page of table to open */
4349   int wrFlag,                            /* 1 to write. 0 read-only */
4350   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
4351   BtCursor *pCur                         /* Space for new cursor */
4352 ){
4353   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
4354   BtCursor *pX;                          /* Looping over other all cursors */
4355 
4356   assert( sqlite3BtreeHoldsMutex(p) );
4357   assert( wrFlag==0
4358        || wrFlag==BTREE_WRCSR
4359        || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE)
4360   );
4361 
4362   /* The following assert statements verify that if this is a sharable
4363   ** b-tree database, the connection is holding the required table locks,
4364   ** and that no other connection has any open cursor that conflicts with
4365   ** this lock.  */
4366   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) );
4367   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
4368 
4369   /* Assert that the caller has opened the required transaction. */
4370   assert( p->inTrans>TRANS_NONE );
4371   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
4372   assert( pBt->pPage1 && pBt->pPage1->aData );
4373   assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
4374 
4375   if( wrFlag ){
4376     allocateTempSpace(pBt);
4377     if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
4378   }
4379   if( iTable==1 && btreePagecount(pBt)==0 ){
4380     assert( wrFlag==0 );
4381     iTable = 0;
4382   }
4383 
4384   /* Now that no other errors can occur, finish filling in the BtCursor
4385   ** variables and link the cursor into the BtShared list.  */
4386   pCur->pgnoRoot = (Pgno)iTable;
4387   pCur->iPage = -1;
4388   pCur->pKeyInfo = pKeyInfo;
4389   pCur->pBtree = p;
4390   pCur->pBt = pBt;
4391   pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;
4392   pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
4393   /* If there are two or more cursors on the same btree, then all such
4394   ** cursors *must* have the BTCF_Multiple flag set. */
4395   for(pX=pBt->pCursor; pX; pX=pX->pNext){
4396     if( pX->pgnoRoot==(Pgno)iTable ){
4397       pX->curFlags |= BTCF_Multiple;
4398       pCur->curFlags |= BTCF_Multiple;
4399     }
4400   }
4401   pCur->pNext = pBt->pCursor;
4402   pBt->pCursor = pCur;
4403   pCur->eState = CURSOR_INVALID;
4404   return SQLITE_OK;
4405 }
4406 int sqlite3BtreeCursor(
4407   Btree *p,                                   /* The btree */
4408   int iTable,                                 /* Root page of table to open */
4409   int wrFlag,                                 /* 1 to write. 0 read-only */
4410   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
4411   BtCursor *pCur                              /* Write new cursor here */
4412 ){
4413   int rc;
4414   if( iTable<1 ){
4415     rc = SQLITE_CORRUPT_BKPT;
4416   }else{
4417     sqlite3BtreeEnter(p);
4418     rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4419     sqlite3BtreeLeave(p);
4420   }
4421   return rc;
4422 }
4423 
4424 /*
4425 ** Return the size of a BtCursor object in bytes.
4426 **
4427 ** This interfaces is needed so that users of cursors can preallocate
4428 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
4429 ** to users so they cannot do the sizeof() themselves - they must call
4430 ** this routine.
4431 */
4432 int sqlite3BtreeCursorSize(void){
4433   return ROUND8(sizeof(BtCursor));
4434 }
4435 
4436 /*
4437 ** Initialize memory that will be converted into a BtCursor object.
4438 **
4439 ** The simple approach here would be to memset() the entire object
4440 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
4441 ** do not need to be zeroed and they are large, so we can save a lot
4442 ** of run-time by skipping the initialization of those elements.
4443 */
4444 void sqlite3BtreeCursorZero(BtCursor *p){
4445   memset(p, 0, offsetof(BtCursor, BTCURSOR_FIRST_UNINIT));
4446 }
4447 
4448 /*
4449 ** Close a cursor.  The read lock on the database file is released
4450 ** when the last cursor is closed.
4451 */
4452 int sqlite3BtreeCloseCursor(BtCursor *pCur){
4453   Btree *pBtree = pCur->pBtree;
4454   if( pBtree ){
4455     BtShared *pBt = pCur->pBt;
4456     sqlite3BtreeEnter(pBtree);
4457     assert( pBt->pCursor!=0 );
4458     if( pBt->pCursor==pCur ){
4459       pBt->pCursor = pCur->pNext;
4460     }else{
4461       BtCursor *pPrev = pBt->pCursor;
4462       do{
4463         if( pPrev->pNext==pCur ){
4464           pPrev->pNext = pCur->pNext;
4465           break;
4466         }
4467         pPrev = pPrev->pNext;
4468       }while( ALWAYS(pPrev) );
4469     }
4470     btreeReleaseAllCursorPages(pCur);
4471     unlockBtreeIfUnused(pBt);
4472     sqlite3_free(pCur->aOverflow);
4473     sqlite3_free(pCur->pKey);
4474     sqlite3BtreeLeave(pBtree);
4475     pCur->pBtree = 0;
4476   }
4477   return SQLITE_OK;
4478 }
4479 
4480 /*
4481 ** Make sure the BtCursor* given in the argument has a valid
4482 ** BtCursor.info structure.  If it is not already valid, call
4483 ** btreeParseCell() to fill it in.
4484 **
4485 ** BtCursor.info is a cache of the information in the current cell.
4486 ** Using this cache reduces the number of calls to btreeParseCell().
4487 */
4488 #ifndef NDEBUG
4489   static int cellInfoEqual(CellInfo *a, CellInfo *b){
4490     if( a->nKey!=b->nKey ) return 0;
4491     if( a->pPayload!=b->pPayload ) return 0;
4492     if( a->nPayload!=b->nPayload ) return 0;
4493     if( a->nLocal!=b->nLocal ) return 0;
4494     if( a->nSize!=b->nSize ) return 0;
4495     return 1;
4496   }
4497   static void assertCellInfo(BtCursor *pCur){
4498     CellInfo info;
4499     memset(&info, 0, sizeof(info));
4500     btreeParseCell(pCur->pPage, pCur->ix, &info);
4501     assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) );
4502   }
4503 #else
4504   #define assertCellInfo(x)
4505 #endif
4506 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
4507   if( pCur->info.nSize==0 ){
4508     pCur->curFlags |= BTCF_ValidNKey;
4509     btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
4510   }else{
4511     assertCellInfo(pCur);
4512   }
4513 }
4514 
4515 #ifndef NDEBUG  /* The next routine used only within assert() statements */
4516 /*
4517 ** Return true if the given BtCursor is valid.  A valid cursor is one
4518 ** that is currently pointing to a row in a (non-empty) table.
4519 ** This is a verification routine is used only within assert() statements.
4520 */
4521 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
4522   return pCur && pCur->eState==CURSOR_VALID;
4523 }
4524 #endif /* NDEBUG */
4525 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
4526   assert( pCur!=0 );
4527   return pCur->eState==CURSOR_VALID;
4528 }
4529 
4530 /*
4531 ** Return the value of the integer key or "rowid" for a table btree.
4532 ** This routine is only valid for a cursor that is pointing into a
4533 ** ordinary table btree.  If the cursor points to an index btree or
4534 ** is invalid, the result of this routine is undefined.
4535 */
4536 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
4537   assert( cursorHoldsMutex(pCur) );
4538   assert( pCur->eState==CURSOR_VALID );
4539   assert( pCur->curIntKey );
4540   getCellInfo(pCur);
4541   return pCur->info.nKey;
4542 }
4543 
#ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC
/*
** Return the byte offset, measured from the start of the database file,
** of the first byte of payload of the entry pCur points at.
**
** The result is the file offset of the cursor's current page, computed
** as pageSize*(pgno-1), plus the distance of the payload from the start
** of that page's data.
*/
i64 sqlite3BtreeOffset(BtCursor *pCur){
  i64 iPageStart;             /* File offset of the start of the page */
  i64 iWithinPage;            /* Offset of the payload inside the page */

  assert( cursorHoldsMutex(pCur) );
  assert( pCur->eState==CURSOR_VALID );
  getCellInfo(pCur);
  iPageStart = (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1);
  iWithinPage = (i64)(pCur->info.pPayload - pCur->pPage->aData);
  return iPageStart + iWithinPage;
}
#endif /* SQLITE_ENABLE_OFFSET_SQL_FUNC */
4557 
4558 /*
4559 ** Return the number of bytes of payload for the entry that pCur is
4560 ** currently pointing to.  For table btrees, this will be the amount
4561 ** of data.  For index btrees, this will be the size of the key.
4562 **
4563 ** The caller must guarantee that the cursor is pointing to a non-NULL
4564 ** valid entry.  In other words, the calling procedure must guarantee
4565 ** that the cursor has Cursor.eState==CURSOR_VALID.
4566 */
4567 u32 sqlite3BtreePayloadSize(BtCursor *pCur){
4568   assert( cursorHoldsMutex(pCur) );
4569   assert( pCur->eState==CURSOR_VALID );
4570   getCellInfo(pCur);
4571   return pCur->info.nPayload;
4572 }
4573 
4574 /*
4575 ** Return an upper bound on the size of any record for the table
4576 ** that the cursor is pointing into.
4577 **
4578 ** This is an optimization.  Everything will still work if this
4579 ** routine always returns 2147483647 (which is the largest record
4580 ** that SQLite can handle) or more.  But returning a smaller value might
4581 ** prevent large memory allocations when trying to interpret a
4582 ** corrupt datrabase.
4583 **
4584 ** The current implementation merely returns the size of the underlying
4585 ** database file.
4586 */
4587 sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){
4588   assert( cursorHoldsMutex(pCur) );
4589   assert( pCur->eState==CURSOR_VALID );
4590   return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage;
4591 }
4592 
4593 /*
4594 ** Given the page number of an overflow page in the database (parameter
4595 ** ovfl), this function finds the page number of the next page in the
4596 ** linked list of overflow pages. If possible, it uses the auto-vacuum
4597 ** pointer-map data instead of reading the content of page ovfl to do so.
4598 **
4599 ** If an error occurs an SQLite error code is returned. Otherwise:
4600 **
4601 ** The page number of the next overflow page in the linked list is
4602 ** written to *pPgnoNext. If page ovfl is the last page in its linked
4603 ** list, *pPgnoNext is set to zero.
4604 **
4605 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
4606 ** to page number pOvfl was obtained, then *ppPage is set to point to that
4607 ** reference. It is the responsibility of the caller to call releasePage()
4608 ** on *ppPage to free the reference. In no reference was obtained (because
4609 ** the pointer-map was used to obtain the value for *pPgnoNext), then
4610 ** *ppPage is set to zero.
4611 */
4612 static int getOverflowPage(
4613   BtShared *pBt,               /* The database file */
4614   Pgno ovfl,                   /* Current overflow page number */
4615   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
4616   Pgno *pPgnoNext              /* OUT: Next overflow page number */
4617 ){
4618   Pgno next = 0;
4619   MemPage *pPage = 0;
4620   int rc = SQLITE_OK;
4621 
4622   assert( sqlite3_mutex_held(pBt->mutex) );
4623   assert(pPgnoNext);
4624 
4625 #ifndef SQLITE_OMIT_AUTOVACUUM
4626   /* Try to find the next page in the overflow list using the
4627   ** autovacuum pointer-map pages. Guess that the next page in
4628   ** the overflow list is page number (ovfl+1). If that guess turns
4629   ** out to be wrong, fall back to loading the data of page
4630   ** number ovfl to determine the next page number.
4631   */
4632   if( pBt->autoVacuum ){
4633     Pgno pgno;
4634     Pgno iGuess = ovfl+1;
4635     u8 eType;
4636 
4637     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
4638       iGuess++;
4639     }
4640 
4641     if( iGuess<=btreePagecount(pBt) ){
4642       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
4643       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
4644         next = iGuess;
4645         rc = SQLITE_DONE;
4646       }
4647     }
4648   }
4649 #endif
4650 
4651   assert( next==0 || rc==SQLITE_DONE );
4652   if( rc==SQLITE_OK ){
4653     rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
4654     assert( rc==SQLITE_OK || pPage==0 );
4655     if( rc==SQLITE_OK ){
4656       next = get4byte(pPage->aData);
4657     }
4658   }
4659 
4660   *pPgnoNext = next;
4661   if( ppPage ){
4662     *ppPage = pPage;
4663   }else{
4664     releasePage(pPage);
4665   }
4666   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
4667 }
4668 
4669 /*
4670 ** Copy data from a buffer to a page, or from a page to a buffer.
4671 **
4672 ** pPayload is a pointer to data stored on database page pDbPage.
4673 ** If argument eOp is false, then nByte bytes of data are copied
4674 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
4675 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
4676 ** of data are copied from the buffer pBuf to pPayload.
4677 **
4678 ** SQLITE_OK is returned on success, otherwise an error code.
4679 */
4680 static int copyPayload(
4681   void *pPayload,           /* Pointer to page data */
4682   void *pBuf,               /* Pointer to buffer */
4683   int nByte,                /* Number of bytes to copy */
4684   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
4685   DbPage *pDbPage           /* Page containing pPayload */
4686 ){
4687   if( eOp ){
4688     /* Copy data from buffer to page (a write operation) */
4689     int rc = sqlite3PagerWrite(pDbPage);
4690     if( rc!=SQLITE_OK ){
4691       return rc;
4692     }
4693     memcpy(pPayload, pBuf, nByte);
4694   }else{
4695     /* Copy data from page to buffer (a read operation) */
4696     memcpy(pBuf, pPayload, nByte);
4697   }
4698   return SQLITE_OK;
4699 }
4700 
4701 /*
4702 ** This function is used to read or overwrite payload information
4703 ** for the entry that the pCur cursor is pointing to. The eOp
4704 ** argument is interpreted as follows:
4705 **
4706 **   0: The operation is a read. Populate the overflow cache.
4707 **   1: The operation is a write. Populate the overflow cache.
4708 **
4709 ** A total of "amt" bytes are read or written beginning at "offset".
4710 ** Data is read to or from the buffer pBuf.
4711 **
4712 ** The content being read or written might appear on the main page
4713 ** or be scattered out on multiple overflow pages.
4714 **
4715 ** If the current cursor entry uses one or more overflow pages
4716 ** this function may allocate space for and lazily populate
4717 ** the overflow page-list cache array (BtCursor.aOverflow).
4718 ** Subsequent calls use this cache to make seeking to the supplied offset
4719 ** more efficient.
4720 **
4721 ** Once an overflow page-list cache has been allocated, it must be
4722 ** invalidated if some other cursor writes to the same table, or if
4723 ** the cursor is moved to a different row. Additionally, in auto-vacuum
4724 ** mode, the following events may invalidate an overflow page-list cache.
4725 **
4726 **   * An incremental vacuum,
4727 **   * A commit in auto_vacuum="full" mode,
4728 **   * Creating a table (may require moving an overflow page).
4729 */
4730 static int accessPayload(
4731   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4732   u32 offset,          /* Begin reading this far into payload */
4733   u32 amt,             /* Read this many bytes */
4734   unsigned char *pBuf, /* Write the bytes into this buffer */
4735   int eOp              /* zero to read. non-zero to write. */
4736 ){
4737   unsigned char *aPayload;
4738   int rc = SQLITE_OK;
4739   int iIdx = 0;
4740   MemPage *pPage = pCur->pPage;               /* Btree page of current entry */
4741   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
4742 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4743   unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
4744 #endif
4745 
4746   assert( pPage );
4747   assert( eOp==0 || eOp==1 );
4748   assert( pCur->eState==CURSOR_VALID );
4749   assert( pCur->ix<pPage->nCell );
4750   assert( cursorHoldsMutex(pCur) );
4751 
4752   getCellInfo(pCur);
4753   aPayload = pCur->info.pPayload;
4754   assert( offset+amt <= pCur->info.nPayload );
4755 
4756   assert( aPayload > pPage->aData );
4757   if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
4758     /* Trying to read or write past the end of the data is an error.  The
4759     ** conditional above is really:
4760     **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
4761     ** but is recast into its current form to avoid integer overflow problems
4762     */
4763     return SQLITE_CORRUPT_PAGE(pPage);
4764   }
4765 
4766   /* Check if data must be read/written to/from the btree page itself. */
4767   if( offset<pCur->info.nLocal ){
4768     int a = amt;
4769     if( a+offset>pCur->info.nLocal ){
4770       a = pCur->info.nLocal - offset;
4771     }
4772     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
4773     offset = 0;
4774     pBuf += a;
4775     amt -= a;
4776   }else{
4777     offset -= pCur->info.nLocal;
4778   }
4779 
4780 
4781   if( rc==SQLITE_OK && amt>0 ){
4782     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
4783     Pgno nextPage;
4784 
4785     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
4786 
4787     /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
4788     **
4789     ** The aOverflow[] array is sized at one entry for each overflow page
4790     ** in the overflow chain. The page number of the first overflow page is
4791     ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
4792     ** means "not yet known" (the cache is lazily populated).
4793     */
4794     if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
4795       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
4796       if( pCur->aOverflow==0
4797        || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow)
4798       ){
4799         Pgno *aNew = (Pgno*)sqlite3Realloc(
4800             pCur->aOverflow, nOvfl*2*sizeof(Pgno)
4801         );
4802         if( aNew==0 ){
4803           return SQLITE_NOMEM_BKPT;
4804         }else{
4805           pCur->aOverflow = aNew;
4806         }
4807       }
4808       memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
4809       pCur->curFlags |= BTCF_ValidOvfl;
4810     }else{
4811       /* If the overflow page-list cache has been allocated and the
4812       ** entry for the first required overflow page is valid, skip
4813       ** directly to it.
4814       */
4815       if( pCur->aOverflow[offset/ovflSize] ){
4816         iIdx = (offset/ovflSize);
4817         nextPage = pCur->aOverflow[iIdx];
4818         offset = (offset%ovflSize);
4819       }
4820     }
4821 
4822     assert( rc==SQLITE_OK && amt>0 );
4823     while( nextPage ){
4824       /* If required, populate the overflow page-list cache. */
4825       assert( pCur->aOverflow[iIdx]==0
4826               || pCur->aOverflow[iIdx]==nextPage
4827               || CORRUPT_DB );
4828       pCur->aOverflow[iIdx] = nextPage;
4829 
4830       if( offset>=ovflSize ){
4831         /* The only reason to read this page is to obtain the page
4832         ** number for the next page in the overflow chain. The page
4833         ** data is not required. So first try to lookup the overflow
4834         ** page-list cache, if any, then fall back to the getOverflowPage()
4835         ** function.
4836         */
4837         assert( pCur->curFlags & BTCF_ValidOvfl );
4838         assert( pCur->pBtree->db==pBt->db );
4839         if( pCur->aOverflow[iIdx+1] ){
4840           nextPage = pCur->aOverflow[iIdx+1];
4841         }else{
4842           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
4843         }
4844         offset -= ovflSize;
4845       }else{
4846         /* Need to read this page properly. It contains some of the
4847         ** range of data that is being read (eOp==0) or written (eOp!=0).
4848         */
4849         int a = amt;
4850         if( a + offset > ovflSize ){
4851           a = ovflSize - offset;
4852         }
4853 
4854 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4855         /* If all the following are true:
4856         **
4857         **   1) this is a read operation, and
4858         **   2) data is required from the start of this overflow page, and
4859         **   3) there are no dirty pages in the page-cache
4860         **   4) the database is file-backed, and
4861         **   5) the page is not in the WAL file
4862         **   6) at least 4 bytes have already been read into the output buffer
4863         **
4864         ** then data can be read directly from the database file into the
4865         ** output buffer, bypassing the page-cache altogether. This speeds
4866         ** up loading large records that span many overflow pages.
4867         */
4868         if( eOp==0                                             /* (1) */
4869          && offset==0                                          /* (2) */
4870          && sqlite3PagerDirectReadOk(pBt->pPager, nextPage)    /* (3,4,5) */
4871          && &pBuf[-4]>=pBufStart                               /* (6) */
4872         ){
4873           sqlite3_file *fd = sqlite3PagerFile(pBt->pPager);
4874           u8 aSave[4];
4875           u8 *aWrite = &pBuf[-4];
4876           assert( aWrite>=pBufStart );                         /* due to (6) */
4877           memcpy(aSave, aWrite, 4);
4878           rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
4879           nextPage = get4byte(aWrite);
4880           memcpy(aWrite, aSave, 4);
4881         }else
4882 #endif
4883 
4884         {
4885           DbPage *pDbPage;
4886           rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
4887               (eOp==0 ? PAGER_GET_READONLY : 0)
4888           );
4889           if( rc==SQLITE_OK ){
4890             aPayload = sqlite3PagerGetData(pDbPage);
4891             nextPage = get4byte(aPayload);
4892             rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
4893             sqlite3PagerUnref(pDbPage);
4894             offset = 0;
4895           }
4896         }
4897         amt -= a;
4898         if( amt==0 ) return rc;
4899         pBuf += a;
4900       }
4901       if( rc ) break;
4902       iIdx++;
4903     }
4904   }
4905 
4906   if( rc==SQLITE_OK && amt>0 ){
4907     /* Overflow chain ends prematurely */
4908     return SQLITE_CORRUPT_PAGE(pPage);
4909   }
4910   return rc;
4911 }
4912 
4913 /*
4914 ** Read part of the payload for the row at which that cursor pCur is currently
4915 ** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
4916 ** begins at "offset".
4917 **
4918 ** pCur can be pointing to either a table or an index b-tree.
4919 ** If pointing to a table btree, then the content section is read.  If
4920 ** pCur is pointing to an index b-tree then the key section is read.
4921 **
4922 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
4923 ** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
4924 ** cursor might be invalid or might need to be restored before being read.
4925 **
4926 ** Return SQLITE_OK on success or an error code if anything goes
4927 ** wrong.  An error is returned if "offset+amt" is larger than
4928 ** the available payload.
4929 */
4930 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4931   assert( cursorHoldsMutex(pCur) );
4932   assert( pCur->eState==CURSOR_VALID );
4933   assert( pCur->iPage>=0 && pCur->pPage );
4934   assert( pCur->ix<pCur->pPage->nCell );
4935   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
4936 }
4937 
4938 /*
4939 ** This variant of sqlite3BtreePayload() works even if the cursor has not
4940 ** in the CURSOR_VALID state.  It is only used by the sqlite3_blob_read()
4941 ** interface.
4942 */
4943 #ifndef SQLITE_OMIT_INCRBLOB
4944 static SQLITE_NOINLINE int accessPayloadChecked(
4945   BtCursor *pCur,
4946   u32 offset,
4947   u32 amt,
4948   void *pBuf
4949 ){
4950   int rc;
4951   if ( pCur->eState==CURSOR_INVALID ){
4952     return SQLITE_ABORT;
4953   }
4954   assert( cursorOwnsBtShared(pCur) );
4955   rc = btreeRestoreCursorPosition(pCur);
4956   return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);
4957 }
4958 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4959   if( pCur->eState==CURSOR_VALID ){
4960     assert( cursorOwnsBtShared(pCur) );
4961     return accessPayload(pCur, offset, amt, pBuf, 0);
4962   }else{
4963     return accessPayloadChecked(pCur, offset, amt, pBuf);
4964   }
4965 }
4966 #endif /* SQLITE_OMIT_INCRBLOB */
4967 
4968 /*
4969 ** Return a pointer to payload information from the entry that the
4970 ** pCur cursor is pointing to.  The pointer is to the beginning of
4971 ** the key if index btrees (pPage->intKey==0) and is the data for
4972 ** table btrees (pPage->intKey==1). The number of bytes of available
4973 ** key/data is written into *pAmt.  If *pAmt==0, then the value
4974 ** returned will not be a valid pointer.
4975 **
4976 ** This routine is an optimization.  It is common for the entire key
4977 ** and data to fit on the local page and for there to be no overflow
4978 ** pages.  When that is so, this routine can be used to access the
4979 ** key and data without making a copy.  If the key and/or data spills
4980 ** onto overflow pages, then accessPayload() must be used to reassemble
4981 ** the key/data and copy it into a preallocated buffer.
4982 **
4983 ** The pointer returned by this routine looks directly into the cached
4984 ** page of the database.  The data might change or move the next time
4985 ** any btree routine is called.
4986 */
4987 static const void *fetchPayload(
4988   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4989   u32 *pAmt            /* Write the number of available bytes here */
4990 ){
4991   int amt;
4992   assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage);
4993   assert( pCur->eState==CURSOR_VALID );
4994   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4995   assert( cursorOwnsBtShared(pCur) );
4996   assert( pCur->ix<pCur->pPage->nCell );
4997   assert( pCur->info.nSize>0 );
4998   assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB );
4999   assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB);
5000   amt = pCur->info.nLocal;
5001   if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){
5002     /* There is too little space on the page for the expected amount
5003     ** of local content. Database must be corrupt. */
5004     assert( CORRUPT_DB );
5005     amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload));
5006   }
5007   *pAmt = (u32)amt;
5008   return (void*)pCur->info.pPayload;
5009 }
5010 
5011 
5012 /*
5013 ** For the entry that cursor pCur is point to, return as
5014 ** many bytes of the key or data as are available on the local
5015 ** b-tree page.  Write the number of available bytes into *pAmt.
5016 **
5017 ** The pointer returned is ephemeral.  The key/data may move
5018 ** or be destroyed on the next call to any Btree routine,
5019 ** including calls from other threads against the same cache.
5020 ** Hence, a mutex on the BtShared should be held prior to calling
5021 ** this routine.
5022 **
5023 ** These routines is used to get quick access to key and data
5024 ** in the common case where no overflow pages are used.
5025 */
5026 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
5027   return fetchPayload(pCur, pAmt);
5028 }
5029 
5030 
5031 /*
5032 ** Move the cursor down to a new child page.  The newPgno argument is the
5033 ** page number of the child page to move to.
5034 **
5035 ** This function returns SQLITE_CORRUPT if the page-header flags field of
5036 ** the new child page does not match the flags field of the parent (i.e.
5037 ** if an intkey page appears to be the parent of a non-intkey page, or
5038 ** vice-versa).
5039 */
5040 static int moveToChild(BtCursor *pCur, u32 newPgno){
5041   BtShared *pBt = pCur->pBt;
5042 
5043   assert( cursorOwnsBtShared(pCur) );
5044   assert( pCur->eState==CURSOR_VALID );
5045   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
5046   assert( pCur->iPage>=0 );
5047   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
5048     return SQLITE_CORRUPT_BKPT;
5049   }
5050   pCur->info.nSize = 0;
5051   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5052   pCur->aiIdx[pCur->iPage] = pCur->ix;
5053   pCur->apPage[pCur->iPage] = pCur->pPage;
5054   pCur->ix = 0;
5055   pCur->iPage++;
5056   return getAndInitPage(pBt, newPgno, &pCur->pPage, pCur, pCur->curPagerFlags);
5057 }
5058 
#ifdef SQLITE_DEBUG
/*
** Debugging aid.  Page pParent is an internal (non-leaf) tree page.
** Assert that page number iChild is the left-child of the iIdx'th cell
** in page pParent.  Or, if iIdx is equal to the total number of cells
** in pParent, assert that iChild is the right-child of the page.
**
** Nothing is checked when CORRUPT_DB is true, because the conditions
** need not hold in a corrupt database.  In non-debug builds this is a
** no-op macro.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  if( CORRUPT_DB ) return;
  assert( iIdx<=pParent->nCell );
  if( iIdx!=pParent->nCell ){
    /* Ordinary cell: the child pointer is the first 4 bytes of the cell */
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    /* One past the last cell: the right-child pointer in the page header */
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z)
#endif
5080 
5081 /*
5082 ** Move the cursor up to the parent page.
5083 **
5084 ** pCur->idx is set to the cell index that contains the pointer
5085 ** to the page we are coming from.  If we are coming from the
5086 ** right-most child page then pCur->idx is set to one more than
5087 ** the largest cell index.
5088 */
5089 static void moveToParent(BtCursor *pCur){
5090   MemPage *pLeaf;
5091   assert( cursorOwnsBtShared(pCur) );
5092   assert( pCur->eState==CURSOR_VALID );
5093   assert( pCur->iPage>0 );
5094   assert( pCur->pPage );
5095   assertParentIndex(
5096     pCur->apPage[pCur->iPage-1],
5097     pCur->aiIdx[pCur->iPage-1],
5098     pCur->pPage->pgno
5099   );
5100   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
5101   pCur->info.nSize = 0;
5102   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5103   pCur->ix = pCur->aiIdx[pCur->iPage-1];
5104   pLeaf = pCur->pPage;
5105   pCur->pPage = pCur->apPage[--pCur->iPage];
5106   releasePageNotNull(pLeaf);
5107 }
5108 
5109 /*
5110 ** Move the cursor to point to the root page of its b-tree structure.
5111 **
5112 ** If the table has a virtual root page, then the cursor is moved to point
5113 ** to the virtual root page instead of the actual root page. A table has a
5114 ** virtual root page when the actual root page contains no cells and a
5115 ** single child page. This can only happen with the table rooted at page 1.
5116 **
5117 ** If the b-tree structure is empty, the cursor state is set to
5118 ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise,
5119 ** the cursor is set to point to the first cell located on the root
5120 ** (or virtual root) page and the cursor state is set to CURSOR_VALID.
5121 **
5122 ** If this function returns successfully, it may be assumed that the
5123 ** page-header flags indicate that the [virtual] root-page is the expected
5124 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
5125 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
5126 ** indicating a table b-tree, or if the caller did specify a KeyInfo
5127 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
5128 ** b-tree).
5129 */
5130 static int moveToRoot(BtCursor *pCur){
5131   MemPage *pRoot;
5132   int rc = SQLITE_OK;
5133 
5134   assert( cursorOwnsBtShared(pCur) );
5135   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
5136   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
5137   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
5138   assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 );
5139   assert( pCur->pgnoRoot>0 || pCur->iPage<0 );
5140 
5141   if( pCur->iPage>=0 ){
5142     if( pCur->iPage ){
5143       releasePageNotNull(pCur->pPage);
5144       while( --pCur->iPage ){
5145         releasePageNotNull(pCur->apPage[pCur->iPage]);
5146       }
5147       pCur->pPage = pCur->apPage[0];
5148       goto skip_init;
5149     }
5150   }else if( pCur->pgnoRoot==0 ){
5151     pCur->eState = CURSOR_INVALID;
5152     return SQLITE_EMPTY;
5153   }else{
5154     assert( pCur->iPage==(-1) );
5155     if( pCur->eState>=CURSOR_REQUIRESEEK ){
5156       if( pCur->eState==CURSOR_FAULT ){
5157         assert( pCur->skipNext!=SQLITE_OK );
5158         return pCur->skipNext;
5159       }
5160       sqlite3BtreeClearCursor(pCur);
5161     }
5162     rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->pPage,
5163                         0, pCur->curPagerFlags);
5164     if( rc!=SQLITE_OK ){
5165       pCur->eState = CURSOR_INVALID;
5166       return rc;
5167     }
5168     pCur->iPage = 0;
5169     pCur->curIntKey = pCur->pPage->intKey;
5170   }
5171   pRoot = pCur->pPage;
5172   assert( pRoot->pgno==pCur->pgnoRoot );
5173 
5174   /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
5175   ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
5176   ** NULL, the caller expects a table b-tree. If this is not the case,
5177   ** return an SQLITE_CORRUPT error.
5178   **
5179   ** Earlier versions of SQLite assumed that this test could not fail
5180   ** if the root page was already loaded when this function was called (i.e.
5181   ** if pCur->iPage>=0). But this is not so if the database is corrupted
5182   ** in such a way that page pRoot is linked into a second b-tree table
5183   ** (or the freelist).  */
5184   assert( pRoot->intKey==1 || pRoot->intKey==0 );
5185   if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
5186     return SQLITE_CORRUPT_PAGE(pCur->pPage);
5187   }
5188 
5189 skip_init:
5190   pCur->ix = 0;
5191   pCur->info.nSize = 0;
5192   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
5193 
5194   pRoot = pCur->pPage;
5195   if( pRoot->nCell>0 ){
5196     pCur->eState = CURSOR_VALID;
5197   }else if( !pRoot->leaf ){
5198     Pgno subpage;
5199     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
5200     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
5201     pCur->eState = CURSOR_VALID;
5202     rc = moveToChild(pCur, subpage);
5203   }else{
5204     pCur->eState = CURSOR_INVALID;
5205     rc = SQLITE_EMPTY;
5206   }
5207   return rc;
5208 }
5209 
5210 /*
5211 ** Move the cursor down to the left-most leaf entry beneath the
5212 ** entry to which it is currently pointing.
5213 **
5214 ** The left-most leaf is the one with the smallest key - the first
5215 ** in ascending order.
5216 */
5217 static int moveToLeftmost(BtCursor *pCur){
5218   Pgno pgno;
5219   int rc = SQLITE_OK;
5220   MemPage *pPage;
5221 
5222   assert( cursorOwnsBtShared(pCur) );
5223   assert( pCur->eState==CURSOR_VALID );
5224   while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
5225     assert( pCur->ix<pPage->nCell );
5226     pgno = get4byte(findCell(pPage, pCur->ix));
5227     rc = moveToChild(pCur, pgno);
5228   }
5229   return rc;
5230 }
5231 
5232 /*
5233 ** Move the cursor down to the right-most leaf entry beneath the
5234 ** page to which it is currently pointing.  Notice the difference
5235 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
5236 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
5237 ** finds the right-most entry beneath the *page*.
5238 **
5239 ** The right-most entry is the one with the largest key - the last
5240 ** key in ascending order.
5241 */
5242 static int moveToRightmost(BtCursor *pCur){
5243   Pgno pgno;
5244   int rc = SQLITE_OK;
5245   MemPage *pPage = 0;
5246 
5247   assert( cursorOwnsBtShared(pCur) );
5248   assert( pCur->eState==CURSOR_VALID );
5249   while( !(pPage = pCur->pPage)->leaf ){
5250     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5251     pCur->ix = pPage->nCell;
5252     rc = moveToChild(pCur, pgno);
5253     if( rc ) return rc;
5254   }
5255   pCur->ix = pPage->nCell-1;
5256   assert( pCur->info.nSize==0 );
5257   assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
5258   return SQLITE_OK;
5259 }
5260 
5261 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
5262 ** on success.  Set *pRes to 0 if the cursor actually points to something
5263 ** or set *pRes to 1 if the table is empty.
5264 */
5265 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
5266   int rc;
5267 
5268   assert( cursorOwnsBtShared(pCur) );
5269   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5270   rc = moveToRoot(pCur);
5271   if( rc==SQLITE_OK ){
5272     assert( pCur->pPage->nCell>0 );
5273     *pRes = 0;
5274     rc = moveToLeftmost(pCur);
5275   }else if( rc==SQLITE_EMPTY ){
5276     assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5277     *pRes = 1;
5278     rc = SQLITE_OK;
5279   }
5280   return rc;
5281 }
5282 
5283 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
5284 ** on success.  Set *pRes to 0 if the cursor actually points to something
5285 ** or set *pRes to 1 if the table is empty.
5286 */
5287 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
5288   int rc;
5289 
5290   assert( cursorOwnsBtShared(pCur) );
5291   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5292 
5293   /* If the cursor already points to the last entry, this is a no-op. */
5294   if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
5295 #ifdef SQLITE_DEBUG
5296     /* This block serves to assert() that the cursor really does point
5297     ** to the last entry in the b-tree. */
5298     int ii;
5299     for(ii=0; ii<pCur->iPage; ii++){
5300       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
5301     }
5302     assert( pCur->ix==pCur->pPage->nCell-1 );
5303     assert( pCur->pPage->leaf );
5304 #endif
5305     *pRes = 0;
5306     return SQLITE_OK;
5307   }
5308 
5309   rc = moveToRoot(pCur);
5310   if( rc==SQLITE_OK ){
5311     assert( pCur->eState==CURSOR_VALID );
5312     *pRes = 0;
5313     rc = moveToRightmost(pCur);
5314     if( rc==SQLITE_OK ){
5315       pCur->curFlags |= BTCF_AtLast;
5316     }else{
5317       pCur->curFlags &= ~BTCF_AtLast;
5318     }
5319   }else if( rc==SQLITE_EMPTY ){
5320     assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5321     *pRes = 1;
5322     rc = SQLITE_OK;
5323   }
5324   return rc;
5325 }
5326 
/* Move the cursor so that it points to an entry near the key
** specified by pIdxKey or intKey.   Return a success code.
**
** For INTKEY tables, the intKey parameter is used.  pIdxKey
** must be NULL.  For index tables, pIdxKey is used and intKey
** is ignored.
**
** If an exact match is not found, then the cursor is always
** left pointing at a leaf page which would hold the entry if it
** were present.  The cursor might point to an entry that comes
** before or after the key.
**
** An integer is written into *pRes which is the result of
** comparing the key with the entry to which the cursor is
** pointing.  The meaning of the integer written into
** *pRes is as follows:
**
**     *pRes<0      The cursor is left pointing at an entry that
**                  is smaller than intKey/pIdxKey, or the table is empty
**                  and the cursor is therefore left pointing to nothing.
**
**     *pRes==0     The cursor is left pointing at an entry that
**                  exactly matches intKey/pIdxKey.
**
**     *pRes>0      The cursor is left pointing at an entry that
**                  is larger than intKey/pIdxKey.
**
** For index tables, the pIdxKey->eqSeen field is set to 1 if there
** exists an entry in the table that exactly matches pIdxKey.
**
** Overview of the implementation:
**
**   1. For integer keys, try to answer from the cursor's cached key
**      (BTCF_ValidNKey) without searching at all, or with a single
**      sqlite3BtreeNext() step when intKey is the cached key plus one.
**   2. Otherwise move to the root and binary-search each page in turn,
**      descending through the chosen child pointer until a leaf is
**      reached or (for integer keys on a leaf, or for index keys on any
**      page) an exact match is found.
*/
int sqlite3BtreeMovetoUnpacked(
  BtCursor *pCur,          /* The cursor to be moved */
  UnpackedRecord *pIdxKey, /* Unpacked index key */
  i64 intKey,              /* The table key */
  int biasRight,           /* If true, bias the search to the high end */
  int *pRes                /* Write search results here */
){
  int rc;
  RecordCompare xRecordCompare;

  assert( cursorOwnsBtShared(pCur) );
  assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  assert( pRes );
  assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
  assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) );

  /* If the cursor is already positioned at the point we are trying
  ** to move to, then just return without doing any work */
  if( pIdxKey==0
   && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
  ){
    if( pCur->info.nKey==intKey ){
      *pRes = 0;
      return SQLITE_OK;
    }
    if( pCur->info.nKey<intKey ){
      /* The cursor is on the last entry and that entry is smaller than
      ** the requested key, so nothing larger can exist. */
      if( (pCur->curFlags & BTCF_AtLast)!=0 ){
        *pRes = -1;
        return SQLITE_OK;
      }
      /* If the requested key is one more than the previous key, then
      ** try to get there using sqlite3BtreeNext() rather than a full
      ** binary search.  This is an optimization only.  The correct answer
      ** is still obtained without this case, only a little more slowly */
      if( pCur->info.nKey+1==intKey ){
        *pRes = 0;
        rc = sqlite3BtreeNext(pCur, 0);
        if( rc==SQLITE_OK ){
          getCellInfo(pCur);
          if( pCur->info.nKey==intKey ){
            return SQLITE_OK;
          }
          /* The next entry is not the requested key: fall through to the
          ** full search below. */
        }else if( rc==SQLITE_DONE ){
          /* Stepped off the end of the table: fall through to the full
          ** search below. */
          rc = SQLITE_OK;
        }else{
          return rc;
        }
      }
    }
  }

  /* Choose the comparison routine.  A NULL xRecordCompare selects the
  ** integer-key search loop below. */
  if( pIdxKey ){
    xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
    pIdxKey->errCode = 0;
    assert( pIdxKey->default_rc==1
         || pIdxKey->default_rc==0
         || pIdxKey->default_rc==-1
    );
  }else{
    xRecordCompare = 0; /* All keys are integers */
  }

  rc = moveToRoot(pCur);
  if( rc ){
    /* An empty tree is reported as success with *pRes==-1 */
    if( rc==SQLITE_EMPTY ){
      assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
      *pRes = -1;
      return SQLITE_OK;
    }
    return rc;
  }
  assert( pCur->pPage );
  assert( pCur->pPage->isInit );
  assert( pCur->eState==CURSOR_VALID );
  assert( pCur->pPage->nCell > 0 );
  assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey );
  assert( pCur->curIntKey || pIdxKey );
  /* Each iteration of this loop searches one page and then either
  ** finishes or descends one level via moveToChild(). */
  for(;;){
    int lwr, upr, idx, c;
    Pgno chldPg;
    MemPage *pPage = pCur->pPage;
    u8 *pCell;                          /* Pointer to current cell in pPage */

    /* pPage->nCell must be greater than zero. If this is the root-page
    ** the cursor would have been INVALID above and this for(;;) loop
    ** not run. If this is not the root-page, then the moveToChild() routine
    ** would have already detected db corruption. Similarly, pPage must
    ** be the right kind (index or table) of b-tree page. Otherwise
    ** a moveToChild() or moveToRoot() call would have detected corruption.  */
    assert( pPage->nCell>0 );
    assert( pPage->intKey==(pIdxKey==0) );
    /* Binary search over cells lwr..upr (inclusive).  The first probe is
    ** the last cell when biasRight is set, otherwise the midpoint. */
    lwr = 0;
    upr = pPage->nCell-1;
    assert( biasRight==0 || biasRight==1 );
    idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
    pCur->ix = (u16)idx;
    if( xRecordCompare==0 ){
      /* Integer-key search */
      for(;;){
        i64 nCellKey;
        pCell = findCellPastPtr(pPage, idx);
        if( pPage->intKeyLeaf ){
          /* On intkey leaf pages, skip over the one varint that precedes
          ** the key (presumably the payload-size field).  Running off the
          ** end of the page while doing so indicates corruption. */
          while( 0x80 <= *(pCell++) ){
            if( pCell>=pPage->aDataEnd ){
              return SQLITE_CORRUPT_PAGE(pPage);
            }
          }
        }
        getVarint(pCell, (u64*)&nCellKey);
        if( nCellKey<intKey ){
          lwr = idx+1;
          if( lwr>upr ){ c = -1; break; }
        }else if( nCellKey>intKey ){
          upr = idx-1;
          if( lwr>upr ){ c = +1; break; }
        }else{
          assert( nCellKey==intKey );
          pCur->ix = (u16)idx;
          if( !pPage->leaf ){
            /* Exact match on an interior page: keep descending through
            ** the left-child of the matching cell. */
            lwr = idx;
            goto moveto_next_layer;
          }else{
            /* Exact match on a leaf: cache the key and finish */
            pCur->curFlags |= BTCF_ValidNKey;
            pCur->info.nKey = nCellKey;
            pCur->info.nSize = 0;
            *pRes = 0;
            return SQLITE_OK;
          }
        }
        assert( lwr+upr>=0 );
        idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
      }
    }else{
      /* Index-key search using xRecordCompare */
      for(;;){
        int nCell;  /* Size of the pCell cell in bytes */
        pCell = findCellPastPtr(pPage, idx);

        /* The maximum supported page-size is 65536 bytes. This means that
        ** the maximum number of record bytes stored on an index B-Tree
        ** page is less than 16384 bytes and may be stored as a 2-byte
        ** varint. This information is used to attempt to avoid parsing
        ** the entire cell by checking for the cases where the record is
        ** stored entirely within the b-tree page by inspecting the first
        ** 2 bytes of the cell.
        */
        nCell = pCell[0];
        if( nCell<=pPage->max1bytePayload ){
          /* This branch runs if the record-size field of the cell is a
          ** single byte varint and the record fits entirely on the main
          ** b-tree page.  */
          testcase( pCell+nCell+1==pPage->aDataEnd );
          c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
        }else if( !(pCell[1] & 0x80)
          && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
        ){
          /* The record-size field is a 2 byte varint and the record
          ** fits entirely on the main b-tree page.  */
          testcase( pCell+nCell+2==pPage->aDataEnd );
          c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
        }else{
          /* The record flows over onto one or more overflow pages. In
          ** this case the whole cell needs to be parsed, a buffer allocated
          ** and accessPayload() used to retrieve the record into the
          ** buffer before VdbeRecordCompare() can be called.
          **
          ** If the record is corrupt, the xRecordCompare routine may read
          ** up to two varints past the end of the buffer. An extra 18
          ** bytes of padding is allocated at the end of the buffer in
          ** case this happens.  */
          void *pCellKey;
          u8 * const pCellBody = pCell - pPage->childPtrSize;
          const int nOverrun = 18;  /* Size of the overrun padding */
          pPage->xParseCell(pPage, pCellBody, &pCur->info);
          nCell = (int)pCur->info.nKey;
          testcase( nCell<0 );   /* True if key size is 2^32 or more */
          testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
          testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
          testcase( nCell==2 );  /* Minimum legal index key size */
          /* Reject keys that are too small, or too large to fit in the
          ** number of pages the database has. */
          if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){
            rc = SQLITE_CORRUPT_PAGE(pPage);
            goto moveto_finish;
          }
          pCellKey = sqlite3Malloc( nCell+nOverrun );
          if( pCellKey==0 ){
            rc = SQLITE_NOMEM_BKPT;
            goto moveto_finish;
          }
          pCur->ix = (u16)idx;
          rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
          memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */
          pCur->curFlags &= ~BTCF_ValidOvfl;
          if( rc ){
            sqlite3_free(pCellKey);
            goto moveto_finish;
          }
          c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
          sqlite3_free(pCellKey);
        }
        assert(
            (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
         && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
        );
        if( c<0 ){
          lwr = idx+1;
        }else if( c>0 ){
          upr = idx-1;
        }else{
          /* Exact match, or the comparison reported an error through
          ** pIdxKey->errCode (in which case SQLITE_CORRUPT is returned). */
          assert( c==0 );
          *pRes = 0;
          rc = SQLITE_OK;
          pCur->ix = (u16)idx;
          if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT;
          goto moveto_finish;
        }
        if( lwr>upr ) break;
        assert( lwr+upr>=0 );
        idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
      }
    }
    assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
    assert( pPage->isInit );
    if( pPage->leaf ){
      /* No exact match on a leaf: leave the cursor on the last cell
      ** probed and report how it compared against the search key. */
      assert( pCur->ix<pCur->pPage->nCell );
      pCur->ix = (u16)idx;
      *pRes = c;
      rc = SQLITE_OK;
      goto moveto_finish;
    }
moveto_next_layer:
    /* Descend one level.  lwr selects the cell whose left-child is
    ** followed; when lwr is past the last cell the right-child pointer
    ** in the page header is followed instead. */
    if( lwr>=pPage->nCell ){
      chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
    }else{
      chldPg = get4byte(findCell(pPage, lwr));
    }
    pCur->ix = (u16)lwr;
    rc = moveToChild(pCur, chldPg);
    if( rc ) break;
  }
moveto_finish:
  /* Invalidate any cached cell information before returning */
  pCur->info.nSize = 0;
  assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
  return rc;
}
5599 
5600 
5601 /*
5602 ** Return TRUE if the cursor is not pointing at an entry of the table.
5603 **
5604 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
5605 ** past the last entry in the table or sqlite3BtreePrev() moves past
5606 ** the first entry.  TRUE is also returned if the table is empty.
5607 */
5608 int sqlite3BtreeEof(BtCursor *pCur){
5609   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
5610   ** have been deleted? This API will need to change to return an error code
5611   ** as well as the boolean result value.
5612   */
5613   return (CURSOR_VALID!=pCur->eState);
5614 }
5615 
5616 /*
5617 ** Return an estimate for the number of rows in the table that pCur is
5618 ** pointing to.  Return a negative number if no estimate is currently
5619 ** available.
5620 */
5621 i64 sqlite3BtreeRowCountEst(BtCursor *pCur){
5622   i64 n;
5623   u8 i;
5624 
5625   assert( cursorOwnsBtShared(pCur) );
5626   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5627 
5628   /* Currently this interface is only called by the OP_IfSmaller
5629   ** opcode, and it that case the cursor will always be valid and
5630   ** will always point to a leaf node. */
5631   if( NEVER(pCur->eState!=CURSOR_VALID) ) return -1;
5632   if( NEVER(pCur->pPage->leaf==0) ) return -1;
5633 
5634   n = pCur->pPage->nCell;
5635   for(i=0; i<pCur->iPage; i++){
5636     n *= pCur->apPage[i]->nCell;
5637   }
5638   return n;
5639 }
5640 
5641 /*
5642 ** Advance the cursor to the next entry in the database.
5643 ** Return value:
5644 **
5645 **    SQLITE_OK        success
5646 **    SQLITE_DONE      cursor is already pointing at the last element
5647 **    otherwise        some kind of error occurred
5648 **
5649 ** The main entry point is sqlite3BtreeNext().  That routine is optimized
5650 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
5651 ** to the next cell on the current page.  The (slower) btreeNext() helper
5652 ** routine is called when it is necessary to move to a different page or
5653 ** to restore the cursor.
5654 **
5655 ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the
5656 ** cursor corresponds to an SQL index and this routine could have been
5657 ** skipped if the SQL index had been a unique index.  The F argument
5658 ** is a hint to the implement.  SQLite btree implementation does not use
5659 ** this hint, but COMDB2 does.
5660 */
5661 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){
5662   int rc;
5663   int idx;
5664   MemPage *pPage;
5665 
5666   assert( cursorOwnsBtShared(pCur) );
5667   if( pCur->eState!=CURSOR_VALID ){
5668     assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5669     rc = restoreCursorPosition(pCur);
5670     if( rc!=SQLITE_OK ){
5671       return rc;
5672     }
5673     if( CURSOR_INVALID==pCur->eState ){
5674       return SQLITE_DONE;
5675     }
5676     if( pCur->eState==CURSOR_SKIPNEXT ){
5677       pCur->eState = CURSOR_VALID;
5678       if( pCur->skipNext>0 ) return SQLITE_OK;
5679     }
5680   }
5681 
5682   pPage = pCur->pPage;
5683   idx = ++pCur->ix;
5684   if( !pPage->isInit ){
5685     /* The only known way for this to happen is for there to be a
5686     ** recursive SQL function that does a DELETE operation as part of a
5687     ** SELECT which deletes content out from under an active cursor
5688     ** in a corrupt database file where the table being DELETE-ed from
5689     ** has pages in common with the table being queried.  See TH3
5690     ** module cov1/btree78.test testcase 220 (2018-06-08) for an
5691     ** example. */
5692     return SQLITE_CORRUPT_BKPT;
5693   }
5694 
5695   /* If the database file is corrupt, it is possible for the value of idx
5696   ** to be invalid here. This can only occur if a second cursor modifies
5697   ** the page while cursor pCur is holding a reference to it. Which can
5698   ** only happen if the database is corrupt in such a way as to link the
5699   ** page into more than one b-tree structure. */
5700   testcase( idx>pPage->nCell );
5701 
5702   if( idx>=pPage->nCell ){
5703     if( !pPage->leaf ){
5704       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
5705       if( rc ) return rc;
5706       return moveToLeftmost(pCur);
5707     }
5708     do{
5709       if( pCur->iPage==0 ){
5710         pCur->eState = CURSOR_INVALID;
5711         return SQLITE_DONE;
5712       }
5713       moveToParent(pCur);
5714       pPage = pCur->pPage;
5715     }while( pCur->ix>=pPage->nCell );
5716     if( pPage->intKey ){
5717       return sqlite3BtreeNext(pCur, 0);
5718     }else{
5719       return SQLITE_OK;
5720     }
5721   }
5722   if( pPage->leaf ){
5723     return SQLITE_OK;
5724   }else{
5725     return moveToLeftmost(pCur);
5726   }
5727 }
5728 int sqlite3BtreeNext(BtCursor *pCur, int flags){
5729   MemPage *pPage;
5730   UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
5731   assert( cursorOwnsBtShared(pCur) );
5732   assert( flags==0 || flags==1 );
5733   pCur->info.nSize = 0;
5734   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5735   if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur);
5736   pPage = pCur->pPage;
5737   if( (++pCur->ix)>=pPage->nCell ){
5738     pCur->ix--;
5739     return btreeNext(pCur);
5740   }
5741   if( pPage->leaf ){
5742     return SQLITE_OK;
5743   }else{
5744     return moveToLeftmost(pCur);
5745   }
5746 }
5747 
5748 /*
5749 ** Step the cursor to the back to the previous entry in the database.
5750 ** Return values:
5751 **
5752 **     SQLITE_OK     success
5753 **     SQLITE_DONE   the cursor is already on the first element of the table
5754 **     otherwise     some kind of error occurred
5755 **
5756 ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
5757 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
5758 ** to the previous cell on the current page.  The (slower) btreePrevious()
5759 ** helper routine is called when it is necessary to move to a different page
5760 ** or to restore the cursor.
5761 **
5762 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then
5763 ** the cursor corresponds to an SQL index and this routine could have been
5764 ** skipped if the SQL index had been a unique index.  The F argument is a
5765 ** hint to the implement.  The native SQLite btree implementation does not
5766 ** use this hint, but COMDB2 does.
5767 */
5768 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){
5769   int rc;
5770   MemPage *pPage;
5771 
5772   assert( cursorOwnsBtShared(pCur) );
5773   assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
5774   assert( pCur->info.nSize==0 );
5775   if( pCur->eState!=CURSOR_VALID ){
5776     rc = restoreCursorPosition(pCur);
5777     if( rc!=SQLITE_OK ){
5778       return rc;
5779     }
5780     if( CURSOR_INVALID==pCur->eState ){
5781       return SQLITE_DONE;
5782     }
5783     if( CURSOR_SKIPNEXT==pCur->eState ){
5784       pCur->eState = CURSOR_VALID;
5785       if( pCur->skipNext<0 ) return SQLITE_OK;
5786     }
5787   }
5788 
5789   pPage = pCur->pPage;
5790   assert( pPage->isInit );
5791   if( !pPage->leaf ){
5792     int idx = pCur->ix;
5793     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
5794     if( rc ) return rc;
5795     rc = moveToRightmost(pCur);
5796   }else{
5797     while( pCur->ix==0 ){
5798       if( pCur->iPage==0 ){
5799         pCur->eState = CURSOR_INVALID;
5800         return SQLITE_DONE;
5801       }
5802       moveToParent(pCur);
5803     }
5804     assert( pCur->info.nSize==0 );
5805     assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
5806 
5807     pCur->ix--;
5808     pPage = pCur->pPage;
5809     if( pPage->intKey && !pPage->leaf ){
5810       rc = sqlite3BtreePrevious(pCur, 0);
5811     }else{
5812       rc = SQLITE_OK;
5813     }
5814   }
5815   return rc;
5816 }
5817 int sqlite3BtreePrevious(BtCursor *pCur, int flags){
5818   assert( cursorOwnsBtShared(pCur) );
5819   assert( flags==0 || flags==1 );
5820   UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
5821   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
5822   pCur->info.nSize = 0;
5823   if( pCur->eState!=CURSOR_VALID
5824    || pCur->ix==0
5825    || pCur->pPage->leaf==0
5826   ){
5827     return btreePrevious(pCur);
5828   }
5829   pCur->ix--;
5830   return SQLITE_OK;
5831 }
5832 
5833 /*
5834 ** Allocate a new page from the database file.
5835 **
5836 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
5837 ** has already been called on the new page.)  The new page has also
5838 ** been referenced and the calling routine is responsible for calling
5839 ** sqlite3PagerUnref() on the new page when it is done.
5840 **
5841 ** SQLITE_OK is returned on success.  Any other return value indicates
5842 ** an error.  *ppPage is set to NULL on some error paths.
**
** NOTE(review): the early "return" error paths (free-list count out of
** range, ptrmapGet() failure, sqlite3PagerWrite() failure on page 1) leave
** *ppPage untouched, so callers should test the return code rather than
** rely on *ppPage being NULL - confirm against callers.
5843 **
5844 ** If the "nearby" parameter is not 0, then an effort is made to
5845 ** locate a page close to the page number "nearby".  This can be used in an
5846 ** attempt to keep related pages close to each other in the database file,
5847 ** which in turn can make database access faster.
5848 **
5849 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
5850 ** anywhere on the free-list, then it is guaranteed to be returned.  If
5851 ** eMode is BTALLOC_LE then the page returned will be less than or equal
5852 ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
5853 ** are no restrictions on which page is returned.
**
** The page comes from the free-list when the free-list count stored at
** offset 36 of page 1 is non-zero.  Otherwise the database image is
** extended by one page (or by two, when the first new page has to be a
** pointer-map page).
5854 */
5855 static int allocateBtreePage(
5856   BtShared *pBt,         /* The btree */
5857   MemPage **ppPage,      /* Store pointer to the allocated page here */
5858   Pgno *pPgno,           /* Store the page number here */
5859   Pgno nearby,           /* Search for a page near this one */
5860   u8 eMode               /* BTALLOC_EXACT, BTALLOC_LE, or BTALLOC_ANY */
5861 ){
5862   MemPage *pPage1;
5863   int rc;
5864   u32 n;     /* Number of pages on the freelist */
5865   u32 k;     /* Number of leaves on the trunk of the freelist */
5866   MemPage *pTrunk = 0;
5867   MemPage *pPrevTrunk = 0;
5868   Pgno mxPage;     /* Total size of the database file */
5869 
5870   assert( sqlite3_mutex_held(pBt->mutex) );
5871   assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
5872   pPage1 = pBt->pPage1;
5873   mxPage = btreePagecount(pBt);
5874   /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
5875   ** stores the total number of pages on the freelist. */
5876   n = get4byte(&pPage1->aData[36]);
5877   testcase( n==mxPage-1 );
  /* A free-list count that is not smaller than the page count is treated
  ** as database corruption. */
5878   if( n>=mxPage ){
5879     return SQLITE_CORRUPT_BKPT;
5880   }
5881   if( n>0 ){
5882     /* There are pages on the freelist.  Reuse one of those pages. */
5883     Pgno iTrunk;
5884     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
5885     u32 nSearch = 0;   /* Count of the number of search attempts */
5886 
5887     /* If eMode==BTALLOC_EXACT and a query of the pointer-map
5888     ** shows that the page 'nearby' is somewhere on the free-list, then
5889     ** the entire-list will be searched for that page.  If
    ** eMode==BTALLOC_LE the list is always searched.
5890     */
5891 #ifndef SQLITE_OMIT_AUTOVACUUM
5892     if( eMode==BTALLOC_EXACT ){
5893       if( nearby<=mxPage ){
5894         u8 eType;
5895         assert( nearby>0 );
5896         assert( pBt->autoVacuum );
5897         rc = ptrmapGet(pBt, nearby, &eType, 0);
5898         if( rc ) return rc;
5899         if( eType==PTRMAP_FREEPAGE ){
5900           searchList = 1;
5901         }
5902       }
5903     }else if( eMode==BTALLOC_LE ){
5904       searchList = 1;
5905     }
5906 #endif
5907 
5908     /* Decrement the free-list count by 1.  iTrunk is loaded inside the
5909     ** loop below, starting from the first free-list trunk page.
5910     */
5911     rc = sqlite3PagerWrite(pPage1->pDbPage);
5912     if( rc ) return rc;
5913     put4byte(&pPage1->aData[36], n-1);
5914 
5915     /* The code within this loop is run only once if the 'searchList' variable
5916     ** is not true. Otherwise, it runs once for each trunk-page on the
5917     ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
5918     ** or until a page less than or equal to 'nearby' is located
    ** (eMode==BTALLOC_LE)
5919     */
5920     do {
5921       pPrevTrunk = pTrunk;
5922       if( pPrevTrunk ){
5923         /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
5924         ** is the page number of the next freelist trunk page in the list or
5925         ** zero if this is the last freelist trunk page. */
5926         iTrunk = get4byte(&pPrevTrunk->aData[0]);
5927       }else{
5928         /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
5929         ** stores the page number of the first page of the freelist, or zero if
5930         ** the freelist is empty. */
5931         iTrunk = get4byte(&pPage1->aData[32]);
5932       }
5933       testcase( iTrunk==mxPage );
      /* A trunk page number past the end of the file is corruption.  The
      ** nSearch counter presumably guards against a cyclic trunk chain:
      ** the loop gives up after more iterations than there are free pages. */
5934       if( iTrunk>mxPage || nSearch++ > n ){
5935         rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? pPrevTrunk->pgno : 1);
5936       }else{
5937         rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
5938       }
5939       if( rc ){
5940         pTrunk = 0;
5941         goto end_allocate_page;
5942       }
5943       assert( pTrunk!=0 );
5944       assert( pTrunk->aData!=0 );
5945       /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
5946       ** is the number of leaf page pointers to follow. */
5947       k = get4byte(&pTrunk->aData[4]);
5948       if( k==0 && !searchList ){
5949         /* The trunk has no leaves and the list is not being searched.
5950         ** So extract the trunk page itself and use it as the newly
5951         ** allocated page */
5952         assert( pPrevTrunk==0 );
5953         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5954         if( rc ){
5955           goto end_allocate_page;
5956         }
5957         *pPgno = iTrunk;
5958         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5959         *ppPage = pTrunk;
5960         pTrunk = 0;
5961         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5962       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
5963         /* Value of k is out of range.  Database corruption */
5964         rc = SQLITE_CORRUPT_PGNO(iTrunk);
5965         goto end_allocate_page;
5966 #ifndef SQLITE_OMIT_AUTOVACUUM
5967       }else if( searchList
5968             && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
5969       ){
5970         /* The list is being searched and this trunk page is the page
5971         ** to allocate, regardless of whether it has leaves.
5972         */
5973         *pPgno = iTrunk;
5974         *ppPage = pTrunk;
5975         searchList = 0;
5976         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5977         if( rc ){
5978           goto end_allocate_page;
5979         }
5980         if( k==0 ){
          /* No leaves: unlink the trunk by copying its next-trunk pointer
          ** into page 1 or into the previous trunk page. */
5981           if( !pPrevTrunk ){
5982             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5983           }else{
5984             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5985             if( rc!=SQLITE_OK ){
5986               goto end_allocate_page;
5987             }
5988             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
5989           }
5990         }else{
5991           /* The trunk page is required by the caller but it contains
5992           ** pointers to free-list leaves. The first leaf becomes a trunk
5993           ** page in this case.
5994           */
5995           MemPage *pNewTrunk;
5996           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
5997           if( iNewTrunk>mxPage ){
5998             rc = SQLITE_CORRUPT_PGNO(iTrunk);
5999             goto end_allocate_page;
6000           }
6001           testcase( iNewTrunk==mxPage );
6002           rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
6003           if( rc!=SQLITE_OK ){
6004             goto end_allocate_page;
6005           }
6006           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
6007           if( rc!=SQLITE_OK ){
6008             releasePage(pNewTrunk);
6009             goto end_allocate_page;
6010           }
          /* The new trunk inherits the next-trunk pointer and the
          ** remaining k-1 leaf entries of the old trunk. */
6011           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
6012           put4byte(&pNewTrunk->aData[4], k-1);
6013           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
6014           releasePage(pNewTrunk);
6015           if( !pPrevTrunk ){
6016             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
6017             put4byte(&pPage1->aData[32], iNewTrunk);
6018           }else{
6019             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
6020             if( rc ){
6021               goto end_allocate_page;
6022             }
6023             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
6024           }
6025         }
6026         pTrunk = 0;
6027         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
6028 #endif
6029       }else if( k>0 ){
6030         /* Extract a leaf from the trunk */
6031         u32 closest;
6032         Pgno iPage;
6033         unsigned char *aData = pTrunk->aData;
6034         if( nearby>0 ){
6035           u32 i;
6036           closest = 0;
6037           if( eMode==BTALLOC_LE ){
            /* Take the first leaf whose page number is <= nearby;
            ** closest stays 0 if there is none. */
6038             for(i=0; i<k; i++){
6039               iPage = get4byte(&aData[8+i*4]);
6040               if( iPage<=nearby ){
6041                 closest = i;
6042                 break;
6043               }
6044             }
6045           }else{
            /* Take the leaf whose page number is numerically closest to
            ** nearby. */
6046             int dist;
6047             dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
6048             for(i=1; i<k; i++){
6049               int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
6050               if( d2<dist ){
6051                 closest = i;
6052                 dist = d2;
6053               }
6054             }
6055           }
6056         }else{
6057           closest = 0;
6058         }
6059 
6060         iPage = get4byte(&aData[8+closest*4]);
6061         testcase( iPage==mxPage );
6062         if( iPage>mxPage ){
6063           rc = SQLITE_CORRUPT_PGNO(iTrunk);
6064           goto end_allocate_page;
6065         }
6066         testcase( iPage==mxPage );
        /* When searching, only accept the leaf if it satisfies the
        ** request; otherwise move on to the next trunk page. */
6067         if( !searchList
6068          || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
6069         ){
6070           int noContent;
6071           *pPgno = iPage;
6072           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
6073                  ": %d more free pages\n",
6074                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
6075           rc = sqlite3PagerWrite(pTrunk->pDbPage);
6076           if( rc ) goto end_allocate_page;
          /* Fill the vacated slot with the last leaf entry (offset
          ** 4+k*4 == 8+(k-1)*4) so that the leaf array stays dense. */
6077           if( closest<k-1 ){
6078             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
6079           }
6080           put4byte(&aData[4], k-1);
          /* Ask for the page without loading its content unless
          ** btreeGetHasContent() reports true for it. */
6081           noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
6082           rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
6083           if( rc==SQLITE_OK ){
6084             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
6085             if( rc!=SQLITE_OK ){
6086               releasePage(*ppPage);
6087               *ppPage = 0;
6088             }
6089           }
6090           searchList = 0;
6091         }
6092       }
6093       releasePage(pPrevTrunk);
6094       pPrevTrunk = 0;
6095     }while( searchList );
6096   }else{
6097     /* There are no pages on the freelist, so append a new page to the
6098     ** database image.
6099     **
6100     ** Normally, new pages allocated by this block can be requested from the
6101     ** pager layer with the 'no-content' flag set. This prevents the pager
6102     ** from trying to read the pages content from disk. However, if the
6103     ** current transaction has already run one or more incremental-vacuum
6104     ** steps, then the page we are about to allocate may contain content
6105     ** that is required in the event of a rollback. In this case, do
6106     ** not set the no-content flag. This causes the pager to load and journal
6107     ** the current page content before overwriting it.
6108     **
6109     ** Note that the pager will not actually attempt to load or journal
6110     ** content for any page that really does lie past the end of the database
6111     ** file on disk. So the effects of disabling the no-content optimization
6112     ** here are confined to those pages that lie between the end of the
6113     ** database image and the end of the database file.
6114     */
6115     int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
6116 
6117     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6118     if( rc ) return rc;
    /* Grow the image by one page, stepping over the pending-byte page */
6119     pBt->nPage++;
6120     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
6121 
6122 #ifndef SQLITE_OMIT_AUTOVACUUM
6123     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
6124       /* If the page about to be appended (pBt->nPage) is a pointer-map
6125       ** page, allocate two new pages at the end of the file instead of
6126       ** one. The first allocated page becomes a new pointer-map page,
      ** the second is used by the caller.
6127       */
6128       MemPage *pPg = 0;
6129       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
6130       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
6131       rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
6132       if( rc==SQLITE_OK ){
6133         rc = sqlite3PagerWrite(pPg->pDbPage);
6134         releasePage(pPg);
6135       }
6136       if( rc ) return rc;
6137       pBt->nPage++;
6138       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
6139     }
6140 #endif
    /* Record the new page count at offset 28 of page 1 */
6141     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
6142     *pPgno = pBt->nPage;
6143 
6144     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
6145     rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
6146     if( rc ) return rc;
6147     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
6148     if( rc!=SQLITE_OK ){
6149       releasePage(*ppPage);
6150       *ppPage = 0;
6151     }
6152     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
6153   }
6154 
6155   assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) );
6156 
  /* Common exit: drop whatever trunk-page references are still held */
6157 end_allocate_page:
6158   releasePage(pTrunk);
6159   releasePage(pPrevTrunk);
6160   assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
6161   assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
6162   return rc;
6163 }
6164 
6165 /*
6166 ** This function is used to add page iPage to the database file free-list.
6167 ** It is assumed that the page is not already a part of the free-list.
6168 **
6169 ** The value passed as the second argument to this function is optional.
6170 ** If the caller happens to have a pointer to the MemPage object
6171 ** corresponding to page iPage handy, it may pass it as the second value.
6172 ** Otherwise, it may pass NULL.
6173 **
6174 ** If a pointer to a MemPage object is passed as the second argument,
6175 ** its reference count is not altered by this function.
6176 */
6177 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
6178   MemPage *pTrunk = 0;                /* Free-list trunk page */
6179   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
6180   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
6181   MemPage *pPage;                     /* Page being freed. May be NULL. */
6182   int rc;                             /* Return Code */
6183   u32 nFree;                          /* Initial number of pages on free-list */
6184 
6185   assert( sqlite3_mutex_held(pBt->mutex) );
6186   assert( CORRUPT_DB || iPage>1 );
6187   assert( !pMemPage || pMemPage->pgno==iPage );
6188 
6189   if( iPage<2 || iPage>pBt->nPage ){
6190     return SQLITE_CORRUPT_BKPT;
6191   }
6192   if( pMemPage ){
6193     pPage = pMemPage;
6194     sqlite3PagerRef(pPage->pDbPage);
6195   }else{
6196     pPage = btreePageLookup(pBt, iPage);
6197   }
6198 
6199   /* Increment the free page count on pPage1 */
6200   rc = sqlite3PagerWrite(pPage1->pDbPage);
6201   if( rc ) goto freepage_out;
6202   nFree = get4byte(&pPage1->aData[36]);
6203   put4byte(&pPage1->aData[36], nFree+1);
6204 
6205   if( pBt->btsFlags & BTS_SECURE_DELETE ){
6206     /* If the secure_delete option is enabled, then
6207     ** always fully overwrite deleted information with zeros.
6208     */
6209     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
6210      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
6211     ){
6212       goto freepage_out;
6213     }
6214     memset(pPage->aData, 0, pPage->pBt->pageSize);
6215   }
6216 
6217   /* If the database supports auto-vacuum, write an entry in the pointer-map
6218   ** to indicate that the page is free.
6219   */
6220   if( ISAUTOVACUUM ){
6221     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
6222     if( rc ) goto freepage_out;
6223   }
6224 
6225   /* Now manipulate the actual database free-list structure. There are two
6226   ** possibilities. If the free-list is currently empty, or if the first
6227   ** trunk page in the free-list is full, then this page will become a
6228   ** new free-list trunk page. Otherwise, it will become a leaf of the
6229   ** first trunk page in the current free-list. This block tests if it
6230   ** is possible to add the page as a new free-list leaf.
6231   */
6232   if( nFree!=0 ){
6233     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
6234 
6235     iTrunk = get4byte(&pPage1->aData[32]);
6236     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
6237     if( rc!=SQLITE_OK ){
6238       goto freepage_out;
6239     }
6240 
6241     nLeaf = get4byte(&pTrunk->aData[4]);
6242     assert( pBt->usableSize>32 );
6243     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
6244       rc = SQLITE_CORRUPT_BKPT;
6245       goto freepage_out;
6246     }
6247     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
6248       /* In this case there is room on the trunk page to insert the page
6249       ** being freed as a new leaf.
6250       **
6251       ** Note that the trunk page is not really full until it contains
6252       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
6253       ** coded.  But due to a coding error in versions of SQLite prior to
6254       ** 3.6.0, databases with freelist trunk pages holding more than
6255       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
6256       ** to maintain backwards compatibility with older versions of SQLite,
6257       ** we will continue to restrict the number of entries to usableSize/4 - 8
6258       ** for now.  At some point in the future (once everyone has upgraded
6259       ** to 3.6.0 or later) we should consider fixing the conditional above
6260       ** to read "usableSize/4-2" instead of "usableSize/4-8".
6261       **
6262       ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
6263       ** avoid using the last six entries in the freelist trunk page array in
6264       ** order that database files created by newer versions of SQLite can be
6265       ** read by older versions of SQLite.
6266       */
6267       rc = sqlite3PagerWrite(pTrunk->pDbPage);
6268       if( rc==SQLITE_OK ){
6269         put4byte(&pTrunk->aData[4], nLeaf+1);
6270         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
6271         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
6272           sqlite3PagerDontWrite(pPage->pDbPage);
6273         }
6274         rc = btreeSetHasContent(pBt, iPage);
6275       }
6276       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
6277       goto freepage_out;
6278     }
6279   }
6280 
6281   /* If control flows to this point, then it was not possible to add the
6282   ** the page being freed as a leaf page of the first trunk in the free-list.
6283   ** Possibly because the free-list is empty, or possibly because the
6284   ** first trunk in the free-list is full. Either way, the page being freed
6285   ** will become the new first trunk page in the free-list.
6286   */
6287   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
6288     goto freepage_out;
6289   }
6290   rc = sqlite3PagerWrite(pPage->pDbPage);
6291   if( rc!=SQLITE_OK ){
6292     goto freepage_out;
6293   }
6294   put4byte(pPage->aData, iTrunk);
6295   put4byte(&pPage->aData[4], 0);
6296   put4byte(&pPage1->aData[32], iPage);
6297   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
6298 
6299 freepage_out:
6300   if( pPage ){
6301     pPage->isInit = 0;
6302   }
6303   releasePage(pPage);
6304   releasePage(pTrunk);
6305   return rc;
6306 }
6307 static void freePage(MemPage *pPage, int *pRC){
6308   if( (*pRC)==SQLITE_OK ){
6309     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
6310   }
6311 }
6312 
6313 /*
6314 ** Free any overflow pages associated with the given Cell.  Store
6315 ** size information about the cell in pInfo.
6316 */
6317 static int clearCell(
6318   MemPage *pPage,          /* The page that contains the Cell */
6319   unsigned char *pCell,    /* First byte of the Cell */
6320   CellInfo *pInfo          /* Size information about the cell */
6321 ){
6322   BtShared *pBt;
6323   Pgno ovflPgno;
6324   int rc;
6325   int nOvfl;
6326   u32 ovflPageSize;
6327 
6328   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6329   pPage->xParseCell(pPage, pCell, pInfo);
6330   if( pInfo->nLocal==pInfo->nPayload ){
6331     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
6332   }
6333   testcase( pCell + pInfo->nSize == pPage->aDataEnd );
6334   testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd );
6335   if( pCell + pInfo->nSize > pPage->aDataEnd ){
6336     /* Cell extends past end of page */
6337     return SQLITE_CORRUPT_PAGE(pPage);
6338   }
6339   ovflPgno = get4byte(pCell + pInfo->nSize - 4);
6340   pBt = pPage->pBt;
6341   assert( pBt->usableSize > 4 );
6342   ovflPageSize = pBt->usableSize - 4;
6343   nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
6344   assert( nOvfl>0 ||
6345     (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
6346   );
6347   while( nOvfl-- ){
6348     Pgno iNext = 0;
6349     MemPage *pOvfl = 0;
6350     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
6351       /* 0 is not a legal page number and page 1 cannot be an
6352       ** overflow page. Therefore if ovflPgno<2 or past the end of the
6353       ** file the database must be corrupt. */
6354       return SQLITE_CORRUPT_BKPT;
6355     }
6356     if( nOvfl ){
6357       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
6358       if( rc ) return rc;
6359     }
6360 
6361     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
6362      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
6363     ){
6364       /* There is no reason any cursor should have an outstanding reference
6365       ** to an overflow page belonging to a cell that is being deleted/updated.
6366       ** So if there exists more than one reference to this page, then it
6367       ** must not really be an overflow page and the database must be corrupt.
6368       ** It is helpful to detect this before calling freePage2(), as
6369       ** freePage2() may zero the page contents if secure-delete mode is
6370       ** enabled. If this 'overflow' page happens to be a page that the
6371       ** caller is iterating through or using in some other way, this
6372       ** can be problematic.
6373       */
6374       rc = SQLITE_CORRUPT_BKPT;
6375     }else{
6376       rc = freePage2(pBt, pOvfl, ovflPgno);
6377     }
6378 
6379     if( pOvfl ){
6380       sqlite3PagerUnref(pOvfl->pDbPage);
6381     }
6382     if( rc ) return rc;
6383     ovflPgno = iNext;
6384   }
6385   return SQLITE_OK;
6386 }
6387 
6388 /*
6389 ** Create the byte sequence used to represent a cell on page pPage
6390 ** and write that byte sequence into pCell[].  Overflow pages are
6391 ** allocated and filled in as necessary.  The calling procedure
6392 ** is responsible for making sure sufficient space has been allocated
6393 ** for pCell[].
6394 **
6395 ** Note that pCell does not necessarily need to point to the pPage->aData
6396 ** area.  pCell might point to some temporary storage.  The cell will
6397 ** be constructed in this temporary area then copied into pPage->aData
6398 ** later.
6399 */
6400 static int fillInCell(
6401   MemPage *pPage,                /* The page that contains the cell */
6402   unsigned char *pCell,          /* Complete text of the cell */
6403   const BtreePayload *pX,        /* Payload with which to construct the cell */
6404   int *pnSize                    /* Write cell size here */
6405 ){
6406   int nPayload;
6407   const u8 *pSrc;
6408   int nSrc, n, rc, mn;
6409   int spaceLeft;
6410   MemPage *pToRelease;
6411   unsigned char *pPrior;
6412   unsigned char *pPayload;
6413   BtShared *pBt;
6414   Pgno pgnoOvfl;
6415   int nHeader;
6416 
6417   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6418 
6419   /* pPage is not necessarily writeable since pCell might be auxiliary
6420   ** buffer space that is separate from the pPage buffer area */
6421   assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize]
6422             || sqlite3PagerIswriteable(pPage->pDbPage) );
6423 
6424   /* Fill in the header. */
6425   nHeader = pPage->childPtrSize;
6426   if( pPage->intKey ){
6427     nPayload = pX->nData + pX->nZero;
6428     pSrc = pX->pData;
6429     nSrc = pX->nData;
6430     assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
6431     nHeader += putVarint32(&pCell[nHeader], nPayload);
6432     nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
6433   }else{
6434     assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
6435     nSrc = nPayload = (int)pX->nKey;
6436     pSrc = pX->pKey;
6437     nHeader += putVarint32(&pCell[nHeader], nPayload);
6438   }
6439 
6440   /* Fill in the payload */
6441   pPayload = &pCell[nHeader];
6442   if( nPayload<=pPage->maxLocal ){
6443     /* This is the common case where everything fits on the btree page
6444     ** and no overflow pages are required. */
6445     n = nHeader + nPayload;
6446     testcase( n==3 );
6447     testcase( n==4 );
6448     if( n<4 ) n = 4;
6449     *pnSize = n;
6450     assert( nSrc<=nPayload );
6451     testcase( nSrc<nPayload );
6452     memcpy(pPayload, pSrc, nSrc);
6453     memset(pPayload+nSrc, 0, nPayload-nSrc);
6454     return SQLITE_OK;
6455   }
6456 
6457   /* If we reach this point, it means that some of the content will need
6458   ** to spill onto overflow pages.
6459   */
6460   mn = pPage->minLocal;
6461   n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
6462   testcase( n==pPage->maxLocal );
6463   testcase( n==pPage->maxLocal+1 );
6464   if( n > pPage->maxLocal ) n = mn;
6465   spaceLeft = n;
6466   *pnSize = n + nHeader + 4;
6467   pPrior = &pCell[nHeader+n];
6468   pToRelease = 0;
6469   pgnoOvfl = 0;
6470   pBt = pPage->pBt;
6471 
6472   /* At this point variables should be set as follows:
6473   **
6474   **   nPayload           Total payload size in bytes
6475   **   pPayload           Begin writing payload here
6476   **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
6477   **                      that means content must spill into overflow pages.
6478   **   *pnSize            Size of the local cell (not counting overflow pages)
6479   **   pPrior             Where to write the pgno of the first overflow page
6480   **
6481   ** Use a call to btreeParseCellPtr() to verify that the values above
6482   ** were computed correctly.
6483   */
6484 #ifdef SQLITE_DEBUG
6485   {
6486     CellInfo info;
6487     pPage->xParseCell(pPage, pCell, &info);
6488     assert( nHeader==(int)(info.pPayload - pCell) );
6489     assert( info.nKey==pX->nKey );
6490     assert( *pnSize == info.nSize );
6491     assert( spaceLeft == info.nLocal );
6492   }
6493 #endif
6494 
6495   /* Write the payload into the local Cell and any extra into overflow pages */
6496   while( 1 ){
6497     n = nPayload;
6498     if( n>spaceLeft ) n = spaceLeft;
6499 
6500     /* If pToRelease is not zero than pPayload points into the data area
6501     ** of pToRelease.  Make sure pToRelease is still writeable. */
6502     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6503 
6504     /* If pPayload is part of the data area of pPage, then make sure pPage
6505     ** is still writeable */
6506     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
6507             || sqlite3PagerIswriteable(pPage->pDbPage) );
6508 
6509     if( nSrc>=n ){
6510       memcpy(pPayload, pSrc, n);
6511     }else if( nSrc>0 ){
6512       n = nSrc;
6513       memcpy(pPayload, pSrc, n);
6514     }else{
6515       memset(pPayload, 0, n);
6516     }
6517     nPayload -= n;
6518     if( nPayload<=0 ) break;
6519     pPayload += n;
6520     pSrc += n;
6521     nSrc -= n;
6522     spaceLeft -= n;
6523     if( spaceLeft==0 ){
6524       MemPage *pOvfl = 0;
6525 #ifndef SQLITE_OMIT_AUTOVACUUM
6526       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
6527       if( pBt->autoVacuum ){
6528         do{
6529           pgnoOvfl++;
6530         } while(
6531           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
6532         );
6533       }
6534 #endif
6535       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
6536 #ifndef SQLITE_OMIT_AUTOVACUUM
6537       /* If the database supports auto-vacuum, and the second or subsequent
6538       ** overflow page is being allocated, add an entry to the pointer-map
6539       ** for that page now.
6540       **
6541       ** If this is the first overflow page, then write a partial entry
6542       ** to the pointer-map. If we write nothing to this pointer-map slot,
6543       ** then the optimistic overflow chain processing in clearCell()
6544       ** may misinterpret the uninitialized values and delete the
6545       ** wrong pages from the database.
6546       */
6547       if( pBt->autoVacuum && rc==SQLITE_OK ){
6548         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
6549         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
6550         if( rc ){
6551           releasePage(pOvfl);
6552         }
6553       }
6554 #endif
6555       if( rc ){
6556         releasePage(pToRelease);
6557         return rc;
6558       }
6559 
6560       /* If pToRelease is not zero than pPrior points into the data area
6561       ** of pToRelease.  Make sure pToRelease is still writeable. */
6562       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6563 
6564       /* If pPrior is part of the data area of pPage, then make sure pPage
6565       ** is still writeable */
6566       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
6567             || sqlite3PagerIswriteable(pPage->pDbPage) );
6568 
6569       put4byte(pPrior, pgnoOvfl);
6570       releasePage(pToRelease);
6571       pToRelease = pOvfl;
6572       pPrior = pOvfl->aData;
6573       put4byte(pPrior, 0);
6574       pPayload = &pOvfl->aData[4];
6575       spaceLeft = pBt->usableSize - 4;
6576     }
6577   }
6578   releasePage(pToRelease);
6579   return SQLITE_OK;
6580 }
6581 
6582 /*
6583 ** Remove the i-th cell from pPage.  This routine effects pPage only.
6584 ** The cell content is not freed or deallocated.  It is assumed that
6585 ** the cell content has been copied someplace else.  This routine just
6586 ** removes the reference to the cell from pPage.
6587 **
6588 ** "sz" must be the number of bytes in the cell.
6589 */
6590 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
6591   u32 pc;         /* Offset to cell content of cell being deleted */
6592   u8 *data;       /* pPage->aData */
6593   u8 *ptr;        /* Used to move bytes around within data[] */
6594   int rc;         /* The return code */
6595   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
6596 
6597   if( *pRC ) return;
6598   assert( idx>=0 && idx<pPage->nCell );
6599   assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
6600   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6601   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6602   assert( pPage->nFree>=0 );
6603   data = pPage->aData;
6604   ptr = &pPage->aCellIdx[2*idx];
6605   pc = get2byte(ptr);
6606   hdr = pPage->hdrOffset;
6607   testcase( pc==get2byte(&data[hdr+5]) );
6608   testcase( pc+sz==pPage->pBt->usableSize );
6609   if( pc+sz > pPage->pBt->usableSize ){
6610     *pRC = SQLITE_CORRUPT_BKPT;
6611     return;
6612   }
6613   rc = freeSpace(pPage, pc, sz);
6614   if( rc ){
6615     *pRC = rc;
6616     return;
6617   }
6618   pPage->nCell--;
6619   if( pPage->nCell==0 ){
6620     memset(&data[hdr+1], 0, 4);
6621     data[hdr+7] = 0;
6622     put2byte(&data[hdr+5], pPage->pBt->usableSize);
6623     pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
6624                        - pPage->childPtrSize - 8;
6625   }else{
6626     memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
6627     put2byte(&data[hdr+3], pPage->nCell);
6628     pPage->nFree += 2;
6629   }
6630 }
6631 
6632 /*
6633 ** Insert a new cell on pPage at cell index "i".  pCell points to the
6634 ** content of the cell.
6635 **
6636 ** If the cell content will fit on the page, then put it there.  If it
6637 ** will not fit, then make a copy of the cell content into pTemp if
6638 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
6639 ** in pPage->apOvfl[] and make it point to the cell content (either
6640 ** in pTemp or the original pCell) and also record its index.
6641 ** Allocating a new entry in pPage->aCell[] implies that
6642 ** pPage->nOverflow is incremented.
6643 **
6644 ** *pRC must be SQLITE_OK when this routine is called.
6645 */
6646 static void insertCell(
6647   MemPage *pPage,   /* Page into which we are copying */
6648   int i,            /* New cell becomes the i-th cell of the page */
6649   u8 *pCell,        /* Content of the new cell */
6650   int sz,           /* Bytes of content in pCell */
6651   u8 *pTemp,        /* Temp storage space for pCell, if needed */
6652   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
6653   int *pRC          /* Read and write return code from here */
6654 ){
6655   int idx = 0;      /* Where to write new cell content in data[] */
6656   int j;            /* Loop counter */
6657   u8 *data;         /* The content of the whole page */
6658   u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
6659 
6660   assert( *pRC==SQLITE_OK );
6661   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
6662   assert( MX_CELL(pPage->pBt)<=10921 );
6663   assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
6664   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
6665   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
6666   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6667   assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
6668   assert( pPage->nFree>=0 );
6669   if( pPage->nOverflow || sz+2>pPage->nFree ){
6670     if( pTemp ){
6671       memcpy(pTemp, pCell, sz);
6672       pCell = pTemp;
6673     }
6674     if( iChild ){
6675       put4byte(pCell, iChild);
6676     }
6677     j = pPage->nOverflow++;
6678     /* Comparison against ArraySize-1 since we hold back one extra slot
6679     ** as a contingency.  In other words, never need more than 3 overflow
6680     ** slots but 4 are allocated, just to be safe. */
6681     assert( j < ArraySize(pPage->apOvfl)-1 );
6682     pPage->apOvfl[j] = pCell;
6683     pPage->aiOvfl[j] = (u16)i;
6684 
6685     /* When multiple overflows occur, they are always sequential and in
6686     ** sorted order.  This invariants arise because multiple overflows can
6687     ** only occur when inserting divider cells into the parent page during
6688     ** balancing, and the dividers are adjacent and sorted.
6689     */
6690     assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
6691     assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
6692   }else{
6693     int rc = sqlite3PagerWrite(pPage->pDbPage);
6694     if( rc!=SQLITE_OK ){
6695       *pRC = rc;
6696       return;
6697     }
6698     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6699     data = pPage->aData;
6700     assert( &data[pPage->cellOffset]==pPage->aCellIdx );
6701     rc = allocateSpace(pPage, sz, &idx);
6702     if( rc ){ *pRC = rc; return; }
6703     /* The allocateSpace() routine guarantees the following properties
6704     ** if it returns successfully */
6705     assert( idx >= 0 );
6706     assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
6707     assert( idx+sz <= (int)pPage->pBt->usableSize );
6708     pPage->nFree -= (u16)(2 + sz);
6709     if( iChild ){
6710       /* In a corrupt database where an entry in the cell index section of
6711       ** a btree page has a value of 3 or less, the pCell value might point
6712       ** as many as 4 bytes in front of the start of the aData buffer for
6713       ** the source page.  Make sure this does not cause problems by not
6714       ** reading the first 4 bytes */
6715       memcpy(&data[idx+4], pCell+4, sz-4);
6716       put4byte(&data[idx], iChild);
6717     }else{
6718       memcpy(&data[idx], pCell, sz);
6719     }
6720     pIns = pPage->aCellIdx + i*2;
6721     memmove(pIns+2, pIns, 2*(pPage->nCell - i));
6722     put2byte(pIns, idx);
6723     pPage->nCell++;
6724     /* increment the cell count */
6725     if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
6726     assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
6727 #ifndef SQLITE_OMIT_AUTOVACUUM
6728     if( pPage->pBt->autoVacuum ){
6729       /* The cell may contain a pointer to an overflow page. If so, write
6730       ** the entry for the overflow page into the pointer map.
6731       */
6732       ptrmapPutOvflPtr(pPage, pPage, pCell, pRC);
6733     }
6734 #endif
6735   }
6736 }
6737 
6738 /*
6739 ** The following parameters determine how many adjacent pages get involved
6740 ** in a balancing operation.  NN is the number of neighbors on either side
6741 ** of the page that participate in the balancing operation.  NB is the
6742 ** total number of pages that participate, including the target page and
6743 ** NN neighbors on either side.
6744 **
6745 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
6746 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
6747 ** in exchange for a larger degradation in INSERT and UPDATE performance.
6748 ** The current value of NN (1) appears to give the best results overall.
6749 **
6750 ** (Later:) The description above makes it seem as if these values are
6751 ** tunable - as if you could change them and recompile and it would all work.
6752 ** But that is unlikely.  NB has been 3 since the inception of SQLite and
6753 ** we have never tested any other value.
6754 */
6755 #define NN 1             /* Number of neighbors on either side of pPage */
6756 #define NB 3             /* (NN*2+1): Total pages involved in the balance */
6757 
6758 /*
6759 ** A CellArray object contains a cache of pointers and sizes for a
6760 ** consecutive sequence of cells that might be held on multiple pages.
6761 **
6762 ** The cells in this array are the divider cell or cells from the pParent
6763 ** page plus up to three child pages.  There are a total of nCell cells.
6764 **
6765 ** pRef is a pointer to one of the pages that contributes cells.  This is
6766 ** used to access information such as MemPage.intKey and MemPage.pBt->pageSize
6767 ** which should be common to all pages that contribute cells to this array.
6768 **
6769 ** apCell[] and szCell[] hold, respectively, pointers to the start of each
6770 ** cell and the size of each cell.  Some of the apCell[] pointers might refer
6771 ** to overflow cells.  In other words, some apCel[] pointers might not point
6772 ** to content area of the pages.
6773 **
6774 ** A szCell[] of zero means the size of that cell has not yet been computed.
6775 **
6776 ** The cells come from as many as four different pages:
6777 **
6778 **             -----------
6779 **             | Parent  |
6780 **             -----------
6781 **            /     |     \
6782 **           /      |      \
6783 **  ---------   ---------   ---------
6784 **  |Child-1|   |Child-2|   |Child-3|
6785 **  ---------   ---------   ---------
6786 **
6787 ** The order of cells is in the array is for an index btree is:
6788 **
6789 **       1.  All cells from Child-1 in order
6790 **       2.  The first divider cell from Parent
6791 **       3.  All cells from Child-2 in order
6792 **       4.  The second divider cell from Parent
6793 **       5.  All cells from Child-3 in order
6794 **
6795 ** For a table-btree (with rowids) the items 2 and 4 are empty because
6796 ** content exists only in leaves and there are no divider cells.
6797 **
6798 ** For an index btree, the apEnd[] array holds pointer to the end of page
6799 ** for Child-1, the Parent, Child-2, the Parent (again), and Child-3,
6800 ** respectively. The ixNx[] array holds the number of cells contained in
6801 ** each of these 5 stages, and all stages to the left.  Hence:
6802 **
6803 **    ixNx[0] = Number of cells in Child-1.
6804 **    ixNx[1] = Number of cells in Child-1 plus 1 for first divider.
6805 **    ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider.
6806 **    ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells
6807 **    ixNx[4] = Total number of cells.
6808 **
6809 ** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2]
6810 ** are used and they point to the leaf pages only, and the ixNx value are:
6811 **
6812 **    ixNx[0] = Number of cells in Child-1.
6813 **    ixNx[1] = Number of cells in Child-1 and Child-2.
6814 **    ixNx[2] = Total number of cells.
6815 **
6816 ** Sometimes when deleting, a child page can have zero cells.  In those
6817 ** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[]
6818 ** entries, shift down.  The end result is that each ixNx[] entry should
6819 ** be larger than the previous
6820 */
6821 typedef struct CellArray CellArray;
6822 struct CellArray {
6823   int nCell;              /* Number of cells in apCell[] */
6824   MemPage *pRef;          /* Reference page */
6825   u8 **apCell;            /* All cells begin balanced */
6826   u16 *szCell;            /* Local size of all cells in apCell[] */
6827   u8 *apEnd[NB*2];        /* MemPage.aDataEnd values */
6828   int ixNx[NB*2];         /* Index of at which we move to the next apEnd[] */
6829 };
6830 
6831 /*
6832 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
6833 ** computed.
6834 */
6835 static void populateCellCache(CellArray *p, int idx, int N){
6836   assert( idx>=0 && idx+N<=p->nCell );
6837   while( N>0 ){
6838     assert( p->apCell[idx]!=0 );
6839     if( p->szCell[idx]==0 ){
6840       p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
6841     }else{
6842       assert( CORRUPT_DB ||
6843               p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
6844     }
6845     idx++;
6846     N--;
6847   }
6848 }
6849 
6850 /*
6851 ** Return the size of the Nth element of the cell array
6852 */
6853 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
6854   assert( N>=0 && N<p->nCell );
6855   assert( p->szCell[N]==0 );
6856   p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
6857   return p->szCell[N];
6858 }
6859 static u16 cachedCellSize(CellArray *p, int N){
6860   assert( N>=0 && N<p->nCell );
6861   if( p->szCell[N] ) return p->szCell[N];
6862   return computeCellSize(p, N);
6863 }
6864 
6865 /*
6866 ** Array apCell[] contains pointers to nCell b-tree page cells. The
6867 ** szCell[] array contains the size in bytes of each cell. This function
6868 ** replaces the current contents of page pPg with the contents of the cell
6869 ** array.
6870 **
6871 ** Some of the cells in apCell[] may currently be stored in pPg. This
6872 ** function works around problems caused by this by making a copy of any
6873 ** such cells before overwriting the page data.
6874 **
6875 ** The MemPage.nFree field is invalidated by this function. It is the
6876 ** responsibility of the caller to set it correctly.
6877 */
6878 static int rebuildPage(
6879   CellArray *pCArray,             /* Content to be added to page pPg */
6880   int iFirst,                     /* First cell in pCArray to use */
6881   int nCell,                      /* Final number of cells on page */
6882   MemPage *pPg                    /* The page to be reconstructed */
6883 ){
6884   const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
6885   u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
6886   const int usableSize = pPg->pBt->usableSize;
6887   u8 * const pEnd = &aData[usableSize];
6888   int i = iFirst;                 /* Which cell to copy from pCArray*/
6889   u32 j;                          /* Start of cell content area */
6890   int iEnd = i+nCell;             /* Loop terminator */
6891   u8 *pCellptr = pPg->aCellIdx;
6892   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6893   u8 *pData;
6894   int k;                          /* Current slot in pCArray->apEnd[] */
6895   u8 *pSrcEnd;                    /* Current pCArray->apEnd[k] value */
6896 
6897   assert( i<iEnd );
6898   j = get2byte(&aData[hdr+5]);
6899   if( NEVER(j>(u32)usableSize) ){ j = 0; }
6900   memcpy(&pTmp[j], &aData[j], usableSize - j);
6901 
6902   for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){}
6903   pSrcEnd = pCArray->apEnd[k];
6904 
6905   pData = pEnd;
6906   while( 1/*exit by break*/ ){
6907     u8 *pCell = pCArray->apCell[i];
6908     u16 sz = pCArray->szCell[i];
6909     assert( sz>0 );
6910     if( SQLITE_WITHIN(pCell,aData,pEnd) ){
6911       if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT;
6912       pCell = &pTmp[pCell - aData];
6913     }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd
6914            && (uptr)(pCell)<(uptr)pSrcEnd
6915     ){
6916       return SQLITE_CORRUPT_BKPT;
6917     }
6918 
6919     pData -= sz;
6920     put2byte(pCellptr, (pData - aData));
6921     pCellptr += 2;
6922     if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
6923     memcpy(pData, pCell, sz);
6924     assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
6925     testcase( sz!=pPg->xCellSize(pPg,pCell) );
6926     i++;
6927     if( i>=iEnd ) break;
6928     if( pCArray->ixNx[k]<=i ){
6929       k++;
6930       pSrcEnd = pCArray->apEnd[k];
6931     }
6932   }
6933 
6934   /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
6935   pPg->nCell = nCell;
6936   pPg->nOverflow = 0;
6937 
6938   put2byte(&aData[hdr+1], 0);
6939   put2byte(&aData[hdr+3], pPg->nCell);
6940   put2byte(&aData[hdr+5], pData - aData);
6941   aData[hdr+7] = 0x00;
6942   return SQLITE_OK;
6943 }
6944 
6945 /*
6946 ** The pCArray objects contains pointers to b-tree cells and the cell sizes.
6947 ** This function attempts to add the cells stored in the array to page pPg.
6948 ** If it cannot (because the page needs to be defragmented before the cells
6949 ** will fit), non-zero is returned. Otherwise, if the cells are added
6950 ** successfully, zero is returned.
6951 **
6952 ** Argument pCellptr points to the first entry in the cell-pointer array
6953 ** (part of page pPg) to populate. After cell apCell[0] is written to the
6954 ** page body, a 16-bit offset is written to pCellptr. And so on, for each
6955 ** cell in the array. It is the responsibility of the caller to ensure
6956 ** that it is safe to overwrite this part of the cell-pointer array.
6957 **
6958 ** When this function is called, *ppData points to the start of the
6959 ** content area on page pPg. If the size of the content area is extended,
6960 ** *ppData is updated to point to the new start of the content area
6961 ** before returning.
6962 **
6963 ** Finally, argument pBegin points to the byte immediately following the
6964 ** end of the space required by this page for the cell-pointer area (for
6965 ** all cells - not just those inserted by the current call). If the content
6966 ** area must be extended to before this point in order to accomodate all
6967 ** cells in apCell[], then the cells do not fit and non-zero is returned.
6968 */
6969 static int pageInsertArray(
6970   MemPage *pPg,                   /* Page to add cells to */
6971   u8 *pBegin,                     /* End of cell-pointer array */
6972   u8 **ppData,                    /* IN/OUT: Page content-area pointer */
6973   u8 *pCellptr,                   /* Pointer to cell-pointer area */
6974   int iFirst,                     /* Index of first cell to add */
6975   int nCell,                      /* Number of cells to add to pPg */
6976   CellArray *pCArray              /* Array of cells */
6977 ){
6978   int i = iFirst;                 /* Loop counter - cell index to insert */
6979   u8 *aData = pPg->aData;         /* Complete page */
6980   u8 *pData = *ppData;            /* Content area.  A subset of aData[] */
6981   int iEnd = iFirst + nCell;      /* End of loop. One past last cell to ins */
6982   int k;                          /* Current slot in pCArray->apEnd[] */
6983   u8 *pEnd;                       /* Maximum extent of cell data */
6984   assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
6985   if( iEnd<=iFirst ) return 0;
6986   for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){}
6987   pEnd = pCArray->apEnd[k];
6988   while( 1 /*Exit by break*/ ){
6989     int sz, rc;
6990     u8 *pSlot;
6991     sz = cachedCellSize(pCArray, i);
6992     if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
6993       if( (pData - pBegin)<sz ) return 1;
6994       pData -= sz;
6995       pSlot = pData;
6996     }
6997     /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
6998     ** database.  But they might for a corrupt database.  Hence use memmove()
6999     ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
7000     assert( (pSlot+sz)<=pCArray->apCell[i]
7001          || pSlot>=(pCArray->apCell[i]+sz)
7002          || CORRUPT_DB );
7003     if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd
7004      && (uptr)(pCArray->apCell[i])<(uptr)pEnd
7005     ){
7006       assert( CORRUPT_DB );
7007       (void)SQLITE_CORRUPT_BKPT;
7008       return 1;
7009     }
7010     memmove(pSlot, pCArray->apCell[i], sz);
7011     put2byte(pCellptr, (pSlot - aData));
7012     pCellptr += 2;
7013     i++;
7014     if( i>=iEnd ) break;
7015     if( pCArray->ixNx[k]<=i ){
7016       k++;
7017       pEnd = pCArray->apEnd[k];
7018     }
7019   }
7020   *ppData = pData;
7021   return 0;
7022 }
7023 
7024 /*
7025 ** The pCArray object contains pointers to b-tree cells and their sizes.
7026 **
7027 ** This function adds the space associated with each cell in the array
7028 ** that is currently stored within the body of pPg to the pPg free-list.
7029 ** The cell-pointers and other fields of the page are not updated.
7030 **
7031 ** This function returns the total number of cells added to the free-list.
7032 */
7033 static int pageFreeArray(
7034   MemPage *pPg,                   /* Page to edit */
7035   int iFirst,                     /* First cell to delete */
7036   int nCell,                      /* Cells to delete */
7037   CellArray *pCArray              /* Array of cells */
7038 ){
7039   u8 * const aData = pPg->aData;
7040   u8 * const pEnd = &aData[pPg->pBt->usableSize];
7041   u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
7042   int nRet = 0;
7043   int i;
7044   int iEnd = iFirst + nCell;
7045   u8 *pFree = 0;
7046   int szFree = 0;
7047 
7048   for(i=iFirst; i<iEnd; i++){
7049     u8 *pCell = pCArray->apCell[i];
7050     if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
7051       int sz;
7052       /* No need to use cachedCellSize() here.  The sizes of all cells that
7053       ** are to be freed have already been computing while deciding which
7054       ** cells need freeing */
7055       sz = pCArray->szCell[i];  assert( sz>0 );
7056       if( pFree!=(pCell + sz) ){
7057         if( pFree ){
7058           assert( pFree>aData && (pFree - aData)<65536 );
7059           freeSpace(pPg, (u16)(pFree - aData), szFree);
7060         }
7061         pFree = pCell;
7062         szFree = sz;
7063         if( pFree+sz>pEnd ) return 0;
7064       }else{
7065         pFree = pCell;
7066         szFree += sz;
7067       }
7068       nRet++;
7069     }
7070   }
7071   if( pFree ){
7072     assert( pFree>aData && (pFree - aData)<65536 );
7073     freeSpace(pPg, (u16)(pFree - aData), szFree);
7074   }
7075   return nRet;
7076 }
7077 
7078 /*
7079 ** pCArray contains pointers to and sizes of all cells in the page being
7080 ** balanced.  The current page, pPg, has pPg->nCell cells starting with
7081 ** pCArray->apCell[iOld].  After balancing, this page should hold nNew cells
7082 ** starting at apCell[iNew].
7083 **
7084 ** This routine makes the necessary adjustments to pPg so that it contains
7085 ** the correct cells after being balanced.
7086 **
7087 ** The pPg->nFree field is invalid when this function returns. It is the
7088 ** responsibility of the caller to set it correctly.
7089 */
7090 static int editPage(
7091   MemPage *pPg,                   /* Edit this page */
7092   int iOld,                       /* Index of first cell currently on page */
7093   int iNew,                       /* Index of new first cell on page */
7094   int nNew,                       /* Final number of cells on page */
7095   CellArray *pCArray              /* Array of cells and sizes */
7096 ){
7097   u8 * const aData = pPg->aData;
7098   const int hdr = pPg->hdrOffset;
7099   u8 *pBegin = &pPg->aCellIdx[nNew * 2];
7100   int nCell = pPg->nCell;       /* Cells stored on pPg */
7101   u8 *pData;
7102   u8 *pCellptr;
7103   int i;
7104   int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
7105   int iNewEnd = iNew + nNew;
7106 
7107 #ifdef SQLITE_DEBUG
7108   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
7109   memcpy(pTmp, aData, pPg->pBt->usableSize);
7110 #endif
7111 
7112   /* Remove cells from the start and end of the page */
7113   assert( nCell>=0 );
7114   if( iOld<iNew ){
7115     int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
7116     if( nShift>nCell ) return SQLITE_CORRUPT_BKPT;
7117     memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
7118     nCell -= nShift;
7119   }
7120   if( iNewEnd < iOldEnd ){
7121     int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
7122     assert( nCell>=nTail );
7123     nCell -= nTail;
7124   }
7125 
7126   pData = &aData[get2byteNotZero(&aData[hdr+5])];
7127   if( pData<pBegin ) goto editpage_fail;
7128 
7129   /* Add cells to the start of the page */
7130   if( iNew<iOld ){
7131     int nAdd = MIN(nNew,iOld-iNew);
7132     assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
7133     assert( nAdd>=0 );
7134     pCellptr = pPg->aCellIdx;
7135     memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
7136     if( pageInsertArray(
7137           pPg, pBegin, &pData, pCellptr,
7138           iNew, nAdd, pCArray
7139     ) ) goto editpage_fail;
7140     nCell += nAdd;
7141   }
7142 
7143   /* Add any overflow cells */
7144   for(i=0; i<pPg->nOverflow; i++){
7145     int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
7146     if( iCell>=0 && iCell<nNew ){
7147       pCellptr = &pPg->aCellIdx[iCell * 2];
7148       if( nCell>iCell ){
7149         memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
7150       }
7151       nCell++;
7152       if( pageInsertArray(
7153             pPg, pBegin, &pData, pCellptr,
7154             iCell+iNew, 1, pCArray
7155       ) ) goto editpage_fail;
7156     }
7157   }
7158 
7159   /* Append cells to the end of the page */
7160   assert( nCell>=0 );
7161   pCellptr = &pPg->aCellIdx[nCell*2];
7162   if( pageInsertArray(
7163         pPg, pBegin, &pData, pCellptr,
7164         iNew+nCell, nNew-nCell, pCArray
7165   ) ) goto editpage_fail;
7166 
7167   pPg->nCell = nNew;
7168   pPg->nOverflow = 0;
7169 
7170   put2byte(&aData[hdr+3], pPg->nCell);
7171   put2byte(&aData[hdr+5], pData - aData);
7172 
7173 #ifdef SQLITE_DEBUG
7174   for(i=0; i<nNew && !CORRUPT_DB; i++){
7175     u8 *pCell = pCArray->apCell[i+iNew];
7176     int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
7177     if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
7178       pCell = &pTmp[pCell - aData];
7179     }
7180     assert( 0==memcmp(pCell, &aData[iOff],
7181             pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
7182   }
7183 #endif
7184 
7185   return SQLITE_OK;
7186  editpage_fail:
7187   /* Unable to edit this page. Rebuild it from scratch instead. */
7188   populateCellCache(pCArray, iNew, nNew);
7189   return rebuildPage(pCArray, iNew, nNew, pPg);
7190 }
7191 
7192 
7193 #ifndef SQLITE_OMIT_QUICKBALANCE
7194 /*
7195 ** This version of balance() handles the common special case where
7196 ** a new entry is being inserted on the extreme right-end of the
7197 ** tree, in other words, when the new entry will become the largest
7198 ** entry in the tree.
7199 **
7200 ** Instead of trying to balance the 3 right-most leaf pages, just add
7201 ** a new page to the right-hand side and put the one new entry in
7202 ** that page.  This leaves the right side of the tree somewhat
7203 ** unbalanced.  But odds are that we will be inserting new entries
7204 ** at the end soon afterwards so the nearly empty page will quickly
7205 ** fill up.  On average.
7206 **
7207 ** pPage is the leaf page which is the right-most page in the tree.
7208 ** pParent is its parent.  pPage must have a single overflow entry
7209 ** which is also the right-most entry on the page.
7210 **
7211 ** The pSpace buffer is used to store a temporary copy of the divider
7212 ** cell that will be inserted into pParent. Such a cell consists of a 4
7213 ** byte page number followed by a variable length integer. In other
7214 ** words, at most 13 bytes. Hence the pSpace buffer must be at
7215 ** least 13 bytes in size.
7216 */
7217 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
7218   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
7219   MemPage *pNew;                       /* Newly allocated page */
7220   int rc;                              /* Return Code */
7221   Pgno pgnoNew;                        /* Page number of pNew */
7222 
7223   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
7224   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7225   assert( pPage->nOverflow==1 );
7226 
7227   if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;  /* dbfuzz001.test */
7228   assert( pPage->nFree>=0 );
7229   assert( pParent->nFree>=0 );
7230 
7231   /* Allocate a new page. This page will become the right-sibling of
7232   ** pPage. Make the parent page writable, so that the new divider cell
7233   ** may be inserted. If both these operations are successful, proceed.
7234   */
7235   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
7236 
7237   if( rc==SQLITE_OK ){
7238 
7239     u8 *pOut = &pSpace[4];
7240     u8 *pCell = pPage->apOvfl[0];
7241     u16 szCell = pPage->xCellSize(pPage, pCell);
7242     u8 *pStop;
7243     CellArray b;
7244 
7245     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
7246     assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
7247     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
7248     b.nCell = 1;
7249     b.pRef = pPage;
7250     b.apCell = &pCell;
7251     b.szCell = &szCell;
7252     b.apEnd[0] = pPage->aDataEnd;
7253     b.ixNx[0] = 2;
7254     rc = rebuildPage(&b, 0, 1, pNew);
7255     if( NEVER(rc) ){
7256       releasePage(pNew);
7257       return rc;
7258     }
7259     pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
7260 
7261     /* If this is an auto-vacuum database, update the pointer map
7262     ** with entries for the new page, and any pointer from the
7263     ** cell on the page to an overflow page. If either of these
7264     ** operations fails, the return code is set, but the contents
7265     ** of the parent page are still manipulated by thh code below.
7266     ** That is Ok, at this point the parent page is guaranteed to
7267     ** be marked as dirty. Returning an error code will cause a
7268     ** rollback, undoing any changes made to the parent page.
7269     */
7270     if( ISAUTOVACUUM ){
7271       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
7272       if( szCell>pNew->minLocal ){
7273         ptrmapPutOvflPtr(pNew, pNew, pCell, &rc);
7274       }
7275     }
7276 
7277     /* Create a divider cell to insert into pParent. The divider cell
7278     ** consists of a 4-byte page number (the page number of pPage) and
7279     ** a variable length key value (which must be the same value as the
7280     ** largest key on pPage).
7281     **
7282     ** To find the largest key value on pPage, first find the right-most
7283     ** cell on pPage. The first two fields of this cell are the
7284     ** record-length (a variable length integer at most 32-bits in size)
7285     ** and the key value (a variable length integer, may have any value).
7286     ** The first of the while(...) loops below skips over the record-length
7287     ** field. The second while(...) loop copies the key value from the
7288     ** cell on pPage into the pSpace buffer.
7289     */
7290     pCell = findCell(pPage, pPage->nCell-1);
7291     pStop = &pCell[9];
7292     while( (*(pCell++)&0x80) && pCell<pStop );
7293     pStop = &pCell[9];
7294     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
7295 
7296     /* Insert the new divider cell into pParent. */
7297     if( rc==SQLITE_OK ){
7298       insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
7299                    0, pPage->pgno, &rc);
7300     }
7301 
7302     /* Set the right-child pointer of pParent to point to the new page. */
7303     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
7304 
7305     /* Release the reference to the new page. */
7306     releasePage(pNew);
7307   }
7308 
7309   return rc;
7310 }
7311 #endif /* SQLITE_OMIT_QUICKBALANCE */
7312 
7313 #if 0
7314 /*
7315 ** This function does not contribute anything to the operation of SQLite.
7316 ** it is sometimes activated temporarily while debugging code responsible
7317 ** for setting pointer-map entries.
7318 */
7319 static int ptrmapCheckPages(MemPage **apPage, int nPage){
7320   int i, j;
7321   for(i=0; i<nPage; i++){
7322     Pgno n;
7323     u8 e;
7324     MemPage *pPage = apPage[i];
7325     BtShared *pBt = pPage->pBt;
7326     assert( pPage->isInit );
7327 
7328     for(j=0; j<pPage->nCell; j++){
7329       CellInfo info;
7330       u8 *z;
7331 
7332       z = findCell(pPage, j);
7333       pPage->xParseCell(pPage, z, &info);
7334       if( info.nLocal<info.nPayload ){
7335         Pgno ovfl = get4byte(&z[info.nSize-4]);
7336         ptrmapGet(pBt, ovfl, &e, &n);
7337         assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
7338       }
7339       if( !pPage->leaf ){
7340         Pgno child = get4byte(z);
7341         ptrmapGet(pBt, child, &e, &n);
7342         assert( n==pPage->pgno && e==PTRMAP_BTREE );
7343       }
7344     }
7345     if( !pPage->leaf ){
7346       Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
7347       ptrmapGet(pBt, child, &e, &n);
7348       assert( n==pPage->pgno && e==PTRMAP_BTREE );
7349     }
7350   }
7351   return 1;
7352 }
7353 #endif
7354 
7355 /*
7356 ** This function is used to copy the contents of the b-tree node stored
7357 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
7358 ** the pointer-map entries for each child page are updated so that the
7359 ** parent page stored in the pointer map is page pTo. If pFrom contained
7360 ** any cells with overflow page pointers, then the corresponding pointer
7361 ** map entries are also updated so that the parent page is page pTo.
7362 **
7363 ** If pFrom is currently carrying any overflow cells (entries in the
7364 ** MemPage.apOvfl[] array), they are not copied to pTo.
7365 **
7366 ** Before returning, page pTo is reinitialized using btreeInitPage().
7367 **
7368 ** The performance of this function is not critical. It is only used by
7369 ** the balance_shallower() and balance_deeper() procedures, neither of
7370 ** which are called often under normal circumstances.
7371 */
7372 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
7373   if( (*pRC)==SQLITE_OK ){
7374     BtShared * const pBt = pFrom->pBt;
7375     u8 * const aFrom = pFrom->aData;
7376     u8 * const aTo = pTo->aData;
7377     int const iFromHdr = pFrom->hdrOffset;
7378     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
7379     int rc;
7380     int iData;
7381 
7382 
7383     assert( pFrom->isInit );
7384     assert( pFrom->nFree>=iToHdr );
7385     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
7386 
7387     /* Copy the b-tree node content from page pFrom to page pTo. */
7388     iData = get2byte(&aFrom[iFromHdr+5]);
7389     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
7390     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
7391 
7392     /* Reinitialize page pTo so that the contents of the MemPage structure
7393     ** match the new data. The initialization of pTo can actually fail under
7394     ** fairly obscure circumstances, even though it is a copy of initialized
7395     ** page pFrom.
7396     */
7397     pTo->isInit = 0;
7398     rc = btreeInitPage(pTo);
7399     if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo);
7400     if( rc!=SQLITE_OK ){
7401       *pRC = rc;
7402       return;
7403     }
7404 
7405     /* If this is an auto-vacuum database, update the pointer-map entries
7406     ** for any b-tree or overflow pages that pTo now contains the pointers to.
7407     */
7408     if( ISAUTOVACUUM ){
7409       *pRC = setChildPtrmaps(pTo);
7410     }
7411   }
7412 }
7413 
7414 /*
7415 ** This routine redistributes cells on the iParentIdx'th child of pParent
7416 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
7417 ** same amount of free space. Usually a single sibling on either side of the
7418 ** page is used in the balancing, though both siblings might come from one
7419 ** side if the page is the first or last child of its parent. If the page
7420 ** has fewer than 2 siblings (something which can only happen if the page
7421 ** is a root page or a child of a root page) then all available siblings
7422 ** participate in the balancing.
7423 **
7424 ** The number of siblings of the page might be increased or decreased by
7425 ** one or two in an effort to keep pages nearly full but not over full.
7426 **
7427 ** Note that when this routine is called, some of the cells on the page
7428 ** might not actually be stored in MemPage.aData[]. This can happen
7429 ** if the page is overfull. This routine ensures that all cells allocated
7430 ** to the page and its siblings fit into MemPage.aData[] before returning.
7431 **
7432 ** In the course of balancing the page and its siblings, cells may be
7433 ** inserted into or removed from the parent page (pParent). Doing so
7434 ** may cause the parent page to become overfull or underfull. If this
7435 ** happens, it is the responsibility of the caller to invoke the correct
7436 ** balancing routine to fix this problem (see the balance() routine).
7437 **
7438 ** If this routine fails for any reason, it might leave the database
7439 ** in a corrupted state. So if this routine fails, the database should
7440 ** be rolled back.
7441 **
7442 ** The third argument to this function, aOvflSpace, is a pointer to a
7443 ** buffer big enough to hold one page. If while inserting cells into the parent
7444 ** page (pParent) the parent page becomes overfull, this buffer is
7445 ** used to store the parent's overflow cells. Because this function inserts
7446 ** a maximum of four divider cells into the parent page, and the maximum
7447 ** size of a cell stored within an internal node is always less than 1/4
7448 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
7449 ** enough for all overflow cells.
7450 **
7451 ** If aOvflSpace is set to a null pointer, this function returns
7452 ** SQLITE_NOMEM.
7453 */
7454 static int balance_nonroot(
7455   MemPage *pParent,               /* Parent page of siblings being balanced */
7456   int iParentIdx,                 /* Index of "the page" in pParent */
7457   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
7458   int isRoot,                     /* True if pParent is a root-page */
7459   int bBulk                       /* True if this call is part of a bulk load */
7460 ){
7461   BtShared *pBt;               /* The whole database */
7462   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
7463   int nNew = 0;                /* Number of pages in apNew[] */
7464   int nOld;                    /* Number of pages in apOld[] */
7465   int i, j, k;                 /* Loop counters */
7466   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
7467   int rc = SQLITE_OK;          /* The return code */
7468   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
7469   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
7470   int usableSpace;             /* Bytes in pPage beyond the header */
7471   int pageFlags;               /* Value of pPage->aData[0] */
7472   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
7473   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
7474   int szScratch;               /* Size of scratch memory requested */
7475   MemPage *apOld[NB];          /* pPage and up to two siblings */
7476   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
7477   u8 *pRight;                  /* Location in parent of right-sibling pointer */
7478   u8 *apDiv[NB-1];             /* Divider cells in pParent */
7479   int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
7480   int cntOld[NB+2];            /* Old index in b.apCell[] */
7481   int szNew[NB+2];             /* Combined size of cells placed on i-th page */
7482   u8 *aSpace1;                 /* Space for copies of dividers cells */
7483   Pgno pgno;                   /* Temp var to store a page number in */
7484   u8 abDone[NB+2];             /* True after i'th new page is populated */
7485   Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
7486   Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
7487   u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
7488   CellArray b;                  /* Parsed information on cells being balanced */
7489 
7490   memset(abDone, 0, sizeof(abDone));
7491   b.nCell = 0;
7492   b.apCell = 0;
7493   pBt = pParent->pBt;
7494   assert( sqlite3_mutex_held(pBt->mutex) );
7495   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7496 
7497   /* At this point pParent may have at most one overflow cell. And if
7498   ** this overflow cell is present, it must be the cell with
7499   ** index iParentIdx. This scenario comes about when this function
7500   ** is called (indirectly) from sqlite3BtreeDelete().
7501   */
7502   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
7503   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
7504 
7505   if( !aOvflSpace ){
7506     return SQLITE_NOMEM_BKPT;
7507   }
7508   assert( pParent->nFree>=0 );
7509 
7510   /* Find the sibling pages to balance. Also locate the cells in pParent
7511   ** that divide the siblings. An attempt is made to find NN siblings on
7512   ** either side of pPage. More siblings are taken from one side, however,
7513   ** if there are fewer than NN siblings on the other side. If pParent
7514   ** has NB or fewer children then all children of pParent are taken.
7515   **
7516   ** This loop also drops the divider cells from the parent page. This
7517   ** way, the remainder of the function does not have to deal with any
7518   ** overflow cells in the parent page, since if any existed they will
7519   ** have already been removed.
7520   */
7521   i = pParent->nOverflow + pParent->nCell;
7522   if( i<2 ){
7523     nxDiv = 0;
7524   }else{
7525     assert( bBulk==0 || bBulk==1 );
7526     if( iParentIdx==0 ){
7527       nxDiv = 0;
7528     }else if( iParentIdx==i ){
7529       nxDiv = i-2+bBulk;
7530     }else{
7531       nxDiv = iParentIdx-1;
7532     }
7533     i = 2-bBulk;
7534   }
7535   nOld = i+1;
7536   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
7537     pRight = &pParent->aData[pParent->hdrOffset+8];
7538   }else{
7539     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
7540   }
7541   pgno = get4byte(pRight);
7542   while( 1 ){
7543     rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
7544     if( rc ){
7545       memset(apOld, 0, (i+1)*sizeof(MemPage*));
7546       goto balance_cleanup;
7547     }
7548     if( apOld[i]->nFree<0 ){
7549       rc = btreeComputeFreeSpace(apOld[i]);
7550       if( rc ){
7551         memset(apOld, 0, (i)*sizeof(MemPage*));
7552         goto balance_cleanup;
7553       }
7554     }
7555     if( (i--)==0 ) break;
7556 
7557     if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
7558       apDiv[i] = pParent->apOvfl[0];
7559       pgno = get4byte(apDiv[i]);
7560       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7561       pParent->nOverflow = 0;
7562     }else{
7563       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
7564       pgno = get4byte(apDiv[i]);
7565       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7566 
7567       /* Drop the cell from the parent page. apDiv[i] still points to
7568       ** the cell within the parent, even though it has been dropped.
7569       ** This is safe because dropping a cell only overwrites the first
7570       ** four bytes of it, and this function does not need the first
7571       ** four bytes of the divider cell. So the pointer is safe to use
7572       ** later on.
7573       **
7574       ** But not if we are in secure-delete mode. In secure-delete mode,
7575       ** the dropCell() routine will overwrite the entire cell with zeroes.
7576       ** In this case, temporarily copy the cell into the aOvflSpace[]
7577       ** buffer. It will be copied out again as soon as the aSpace[] buffer
7578       ** is allocated.  */
7579       if( pBt->btsFlags & BTS_FAST_SECURE ){
7580         int iOff;
7581 
7582         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
7583         if( (iOff+szNew[i])>(int)pBt->usableSize ){
7584           rc = SQLITE_CORRUPT_BKPT;
7585           memset(apOld, 0, (i+1)*sizeof(MemPage*));
7586           goto balance_cleanup;
7587         }else{
7588           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
7589           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
7590         }
7591       }
7592       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
7593     }
7594   }
7595 
7596   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
7597   ** alignment */
7598   nMaxCells = nOld*(MX_CELL(pBt) + ArraySize(pParent->apOvfl));
7599   nMaxCells = (nMaxCells + 3)&~3;
7600 
7601   /*
7602   ** Allocate space for memory structures
7603   */
7604   szScratch =
7605        nMaxCells*sizeof(u8*)                       /* b.apCell */
7606      + nMaxCells*sizeof(u16)                       /* b.szCell */
7607      + pBt->pageSize;                              /* aSpace1 */
7608 
7609   assert( szScratch<=7*(int)pBt->pageSize );
7610   b.apCell = sqlite3StackAllocRaw(0, szScratch );
7611   if( b.apCell==0 ){
7612     rc = SQLITE_NOMEM_BKPT;
7613     goto balance_cleanup;
7614   }
7615   b.szCell = (u16*)&b.apCell[nMaxCells];
7616   aSpace1 = (u8*)&b.szCell[nMaxCells];
7617   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
7618 
7619   /*
7620   ** Load pointers to all cells on sibling pages and the divider cells
7621   ** into the local b.apCell[] array.  Make copies of the divider cells
7622   ** into space obtained from aSpace1[]. The divider cells have already
7623   ** been removed from pParent.
7624   **
7625   ** If the siblings are on leaf pages, then the child pointers of the
7626   ** divider cells are stripped from the cells before they are copied
7627   ** into aSpace1[].  In this way, all cells in b.apCell[] are without
7628   ** child pointers.  If siblings are not leaves, then all cells in
7629   ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
7630   ** are alike.
7631   **
7632   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
7633   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
7634   */
7635   b.pRef = apOld[0];
7636   leafCorrection = b.pRef->leaf*4;
7637   leafData = b.pRef->intKeyLeaf;
7638   for(i=0; i<nOld; i++){
7639     MemPage *pOld = apOld[i];
7640     int limit = pOld->nCell;
7641     u8 *aData = pOld->aData;
7642     u16 maskPage = pOld->maskPage;
7643     u8 *piCell = aData + pOld->cellOffset;
7644     u8 *piEnd;
7645     VVA_ONLY( int nCellAtStart = b.nCell; )
7646 
7647     /* Verify that all sibling pages are of the same "type" (table-leaf,
7648     ** table-interior, index-leaf, or index-interior).
7649     */
7650     if( pOld->aData[0]!=apOld[0]->aData[0] ){
7651       rc = SQLITE_CORRUPT_BKPT;
7652       goto balance_cleanup;
7653     }
7654 
7655     /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
7656     ** contains overflow cells, include them in the b.apCell[] array
7657     ** in the correct spot.
7658     **
7659     ** Note that when there are multiple overflow cells, it is always the
7660     ** case that they are sequential and adjacent.  This invariant arises
7661     ** because multiple overflows can only occur when inserting divider
7662     ** cells into a parent on a prior balance, and divider cells are always
7663     ** adjacent and are inserted in order.  There is an assert() tagged
7664     ** with "NOTE 1" in the overflow cell insertion loop to prove this
7665     ** invariant.
7666     **
7667     ** This must be done in advance.  Once the balance starts, the cell
7668     ** offset section of the btree page will be overwritten and we will no
7669     ** longer be able to find the cells if a pointer to each cell is not saved
7670     ** first.
7671     */
7672     memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
7673     if( pOld->nOverflow>0 ){
7674       if( limit<pOld->aiOvfl[0] ){
7675         rc = SQLITE_CORRUPT_BKPT;
7676         goto balance_cleanup;
7677       }
7678       limit = pOld->aiOvfl[0];
7679       for(j=0; j<limit; j++){
7680         b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7681         piCell += 2;
7682         b.nCell++;
7683       }
7684       for(k=0; k<pOld->nOverflow; k++){
7685         assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
7686         b.apCell[b.nCell] = pOld->apOvfl[k];
7687         b.nCell++;
7688       }
7689     }
7690     piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
7691     while( piCell<piEnd ){
7692       assert( b.nCell<nMaxCells );
7693       b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7694       piCell += 2;
7695       b.nCell++;
7696     }
7697     assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) );
7698 
7699     cntOld[i] = b.nCell;
7700     if( i<nOld-1 && !leafData){
7701       u16 sz = (u16)szNew[i];
7702       u8 *pTemp;
7703       assert( b.nCell<nMaxCells );
7704       b.szCell[b.nCell] = sz;
7705       pTemp = &aSpace1[iSpace1];
7706       iSpace1 += sz;
7707       assert( sz<=pBt->maxLocal+23 );
7708       assert( iSpace1 <= (int)pBt->pageSize );
7709       memcpy(pTemp, apDiv[i], sz);
7710       b.apCell[b.nCell] = pTemp+leafCorrection;
7711       assert( leafCorrection==0 || leafCorrection==4 );
7712       b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
7713       if( !pOld->leaf ){
7714         assert( leafCorrection==0 );
7715         assert( pOld->hdrOffset==0 );
7716         /* The right pointer of the child page pOld becomes the left
7717         ** pointer of the divider cell */
7718         memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
7719       }else{
7720         assert( leafCorrection==4 );
7721         while( b.szCell[b.nCell]<4 ){
7722           /* Do not allow any cells smaller than 4 bytes. If a smaller cell
7723           ** does exist, pad it with 0x00 bytes. */
7724           assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
7725           assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
7726           aSpace1[iSpace1++] = 0x00;
7727           b.szCell[b.nCell]++;
7728         }
7729       }
7730       b.nCell++;
7731     }
7732   }
7733 
7734   /*
7735   ** Figure out the number of pages needed to hold all b.nCell cells.
7736   ** Store this number in "k".  Also compute szNew[] which is the total
7737   ** size of all cells on the i-th page and cntNew[] which is the index
7738   ** in b.apCell[] of the cell that divides page i from page i+1.
7739   ** cntNew[k] should equal b.nCell.
7740   **
7741   ** Values computed by this block:
7742   **
7743   **           k: The total number of sibling pages
7744   **    szNew[i]: Space used on the i-th sibling page.
7745   **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
7746   **              the right of the i-th sibling page.
7747   ** usableSpace: Number of bytes of space available on each sibling.
7748   **
7749   */
7750   usableSpace = pBt->usableSize - 12 + leafCorrection;
7751   for(i=k=0; i<nOld; i++, k++){
7752     MemPage *p = apOld[i];
7753     b.apEnd[k] = p->aDataEnd;
7754     b.ixNx[k] = cntOld[i];
7755     if( k && b.ixNx[k]==b.ixNx[k-1] ){
7756       k--;  /* Omit b.ixNx[] entry for child pages with no cells */
7757     }
7758     if( !leafData ){
7759       k++;
7760       b.apEnd[k] = pParent->aDataEnd;
7761       b.ixNx[k] = cntOld[i]+1;
7762     }
7763     assert( p->nFree>=0 );
7764     szNew[i] = usableSpace - p->nFree;
7765     for(j=0; j<p->nOverflow; j++){
7766       szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
7767     }
7768     cntNew[i] = cntOld[i];
7769   }
7770   k = nOld;
7771   for(i=0; i<k; i++){
7772     int sz;
7773     while( szNew[i]>usableSpace ){
7774       if( i+1>=k ){
7775         k = i+2;
7776         if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
7777         szNew[k-1] = 0;
7778         cntNew[k-1] = b.nCell;
7779       }
7780       sz = 2 + cachedCellSize(&b, cntNew[i]-1);
7781       szNew[i] -= sz;
7782       if( !leafData ){
7783         if( cntNew[i]<b.nCell ){
7784           sz = 2 + cachedCellSize(&b, cntNew[i]);
7785         }else{
7786           sz = 0;
7787         }
7788       }
7789       szNew[i+1] += sz;
7790       cntNew[i]--;
7791     }
7792     while( cntNew[i]<b.nCell ){
7793       sz = 2 + cachedCellSize(&b, cntNew[i]);
7794       if( szNew[i]+sz>usableSpace ) break;
7795       szNew[i] += sz;
7796       cntNew[i]++;
7797       if( !leafData ){
7798         if( cntNew[i]<b.nCell ){
7799           sz = 2 + cachedCellSize(&b, cntNew[i]);
7800         }else{
7801           sz = 0;
7802         }
7803       }
7804       szNew[i+1] -= sz;
7805     }
7806     if( cntNew[i]>=b.nCell ){
7807       k = i+1;
7808     }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
7809       rc = SQLITE_CORRUPT_BKPT;
7810       goto balance_cleanup;
7811     }
7812   }
7813 
7814   /*
7815   ** The packing computed by the previous block is biased toward the siblings
7816   ** on the left side (siblings with smaller keys). The left siblings are
7817   ** always nearly full, while the right-most sibling might be nearly empty.
7818   ** The next block of code attempts to adjust the packing of siblings to
7819   ** get a better balance.
7820   **
7821   ** This adjustment is more than an optimization.  The packing above might
7822   ** be so out of balance as to be illegal.  For example, the right-most
7823   ** sibling might be completely empty.  This adjustment is not optional.
7824   */
7825   for(i=k-1; i>0; i--){
7826     int szRight = szNew[i];  /* Size of sibling on the right */
7827     int szLeft = szNew[i-1]; /* Size of sibling on the left */
7828     int r;              /* Index of right-most cell in left sibling */
7829     int d;              /* Index of first cell to the left of right sibling */
7830 
7831     r = cntNew[i-1] - 1;
7832     d = r + 1 - leafData;
7833     (void)cachedCellSize(&b, d);
7834     do{
7835       assert( d<nMaxCells );
7836       assert( r<nMaxCells );
7837       (void)cachedCellSize(&b, r);
7838       if( szRight!=0
7839        && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
7840         break;
7841       }
7842       szRight += b.szCell[d] + 2;
7843       szLeft -= b.szCell[r] + 2;
7844       cntNew[i-1] = r;
7845       r--;
7846       d--;
7847     }while( r>=0 );
7848     szNew[i] = szRight;
7849     szNew[i-1] = szLeft;
7850     if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
7851       rc = SQLITE_CORRUPT_BKPT;
7852       goto balance_cleanup;
7853     }
7854   }
7855 
7856   /* Sanity check:  For a non-corrupt database file one of the following
7857   ** must be true:
7858   **    (1) We found one or more cells (cntNew[0])>0), or
7859   **    (2) pPage is a virtual root page.  A virtual root page is when
7860   **        the real root page is page 1 and we are the only child of
7861   **        that page.
7862   */
7863   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
7864   TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
7865     apOld[0]->pgno, apOld[0]->nCell,
7866     nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
7867     nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
7868   ));
7869 
7870   /*
7871   ** Allocate k new pages.  Reuse old pages where possible.
7872   */
7873   pageFlags = apOld[0]->aData[0];
7874   for(i=0; i<k; i++){
7875     MemPage *pNew;
7876     if( i<nOld ){
7877       pNew = apNew[i] = apOld[i];
7878       apOld[i] = 0;
7879       rc = sqlite3PagerWrite(pNew->pDbPage);
7880       nNew++;
7881       if( rc ) goto balance_cleanup;
7882     }else{
7883       assert( i>0 );
7884       rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
7885       if( rc ) goto balance_cleanup;
7886       zeroPage(pNew, pageFlags);
7887       apNew[i] = pNew;
7888       nNew++;
7889       cntOld[i] = b.nCell;
7890 
7891       /* Set the pointer-map entry for the new sibling page. */
7892       if( ISAUTOVACUUM ){
7893         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
7894         if( rc!=SQLITE_OK ){
7895           goto balance_cleanup;
7896         }
7897       }
7898     }
7899   }
7900 
7901   /*
7902   ** Reassign page numbers so that the new pages are in ascending order.
7903   ** This helps to keep entries in the disk file in order so that a scan
7904   ** of the table is closer to a linear scan through the file. That in turn
7905   ** helps the operating system to deliver pages from the disk more rapidly.
7906   **
7907   ** An O(n^2) insertion sort algorithm is used, but since n is never more
7908   ** than (NB+2) (a small constant), that should not be a problem.
7909   **
7910   ** When NB==3, this one optimization makes the database about 25% faster
7911   ** for large insertions and deletions.
7912   */
7913   for(i=0; i<nNew; i++){
7914     aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
7915     aPgFlags[i] = apNew[i]->pDbPage->flags;
7916     for(j=0; j<i; j++){
7917       if( aPgno[j]==aPgno[i] ){
7918         /* This branch is taken if the set of sibling pages somehow contains
7919         ** duplicate entries. This can happen if the database is corrupt.
7920         ** It would be simpler to detect this as part of the loop below, but
7921         ** we do the detection here in order to avoid populating the pager
7922         ** cache with two separate objects associated with the same
7923         ** page number.  */
7924         assert( CORRUPT_DB );
7925         rc = SQLITE_CORRUPT_BKPT;
7926         goto balance_cleanup;
7927       }
7928     }
7929   }
7930   for(i=0; i<nNew; i++){
7931     int iBest = 0;                /* aPgno[] index of page number to use */
7932     for(j=1; j<nNew; j++){
7933       if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
7934     }
7935     pgno = aPgOrder[iBest];
7936     aPgOrder[iBest] = 0xffffffff;
7937     if( iBest!=i ){
7938       if( iBest>i ){
7939         sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
7940       }
7941       sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
7942       apNew[i]->pgno = pgno;
7943     }
7944   }
7945 
7946   TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
7947          "%d(%d nc=%d) %d(%d nc=%d)\n",
7948     apNew[0]->pgno, szNew[0], cntNew[0],
7949     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
7950     nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
7951     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
7952     nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
7953     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
7954     nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
7955     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
7956     nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
7957   ));
7958 
7959   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7960   assert( nNew>=1 && nNew<=ArraySize(apNew) );
7961   assert( apNew[nNew-1]!=0 );
7962   put4byte(pRight, apNew[nNew-1]->pgno);
7963 
7964   /* If the sibling pages are not leaves, ensure that the right-child pointer
7965   ** of the right-most new sibling page is set to the value that was
7966   ** originally in the same field of the right-most old sibling page. */
7967   if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
7968     MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
7969     memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
7970   }
7971 
7972   /* Make any required updates to pointer map entries associated with
7973   ** cells stored on sibling pages following the balance operation. Pointer
7974   ** map entries associated with divider cells are set by the insertCell()
7975   ** routine. The associated pointer map entries are:
7976   **
7977   **   a) if the cell contains a reference to an overflow chain, the
7978   **      entry associated with the first page in the overflow chain, and
7979   **
7980   **   b) if the sibling pages are not leaves, the child page associated
7981   **      with the cell.
7982   **
7983   ** If the sibling pages are not leaves, then the pointer map entry
7984   ** associated with the right-child of each sibling may also need to be
7985   ** updated. This happens below, after the sibling pages have been
7986   ** populated, not here.
7987   */
7988   if( ISAUTOVACUUM ){
7989     MemPage *pOld;
7990     MemPage *pNew = pOld = apNew[0];
7991     int cntOldNext = pNew->nCell + pNew->nOverflow;
7992     int iNew = 0;
7993     int iOld = 0;
7994 
7995     for(i=0; i<b.nCell; i++){
7996       u8 *pCell = b.apCell[i];
7997       while( i==cntOldNext ){
7998         iOld++;
7999         assert( iOld<nNew || iOld<nOld );
8000         assert( iOld>=0 && iOld<NB );
8001         pOld = iOld<nNew ? apNew[iOld] : apOld[iOld];
8002         cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
8003       }
8004       if( i==cntNew[iNew] ){
8005         pNew = apNew[++iNew];
8006         if( !leafData ) continue;
8007       }
8008 
8009       /* Cell pCell is destined for new sibling page pNew. Originally, it
8010       ** was either part of sibling page iOld (possibly an overflow cell),
8011       ** or else the divider cell to the left of sibling page iOld. So,
8012       ** if sibling page iOld had the same page number as pNew, and if
8013       ** pCell really was a part of sibling page iOld (not a divider or
8014       ** overflow cell), we can skip updating the pointer map entries.  */
8015       if( iOld>=nNew
8016        || pNew->pgno!=aPgno[iOld]
8017        || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd)
8018       ){
8019         if( !leafCorrection ){
8020           ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
8021         }
8022         if( cachedCellSize(&b,i)>pNew->minLocal ){
8023           ptrmapPutOvflPtr(pNew, pOld, pCell, &rc);
8024         }
8025         if( rc ) goto balance_cleanup;
8026       }
8027     }
8028   }
8029 
8030   /* Insert new divider cells into pParent. */
8031   for(i=0; i<nNew-1; i++){
8032     u8 *pCell;
8033     u8 *pTemp;
8034     int sz;
8035     MemPage *pNew = apNew[i];
8036     j = cntNew[i];
8037 
8038     assert( j<nMaxCells );
8039     assert( b.apCell[j]!=0 );
8040     pCell = b.apCell[j];
8041     sz = b.szCell[j] + leafCorrection;
8042     pTemp = &aOvflSpace[iOvflSpace];
8043     if( !pNew->leaf ){
8044       memcpy(&pNew->aData[8], pCell, 4);
8045     }else if( leafData ){
8046       /* If the tree is a leaf-data tree, and the siblings are leaves,
8047       ** then there is no divider cell in b.apCell[]. Instead, the divider
8048       ** cell consists of the integer key for the right-most cell of
8049       ** the sibling-page assembled above only.
8050       */
8051       CellInfo info;
8052       j--;
8053       pNew->xParseCell(pNew, b.apCell[j], &info);
8054       pCell = pTemp;
8055       sz = 4 + putVarint(&pCell[4], info.nKey);
8056       pTemp = 0;
8057     }else{
8058       pCell -= 4;
8059       /* Obscure case for non-leaf-data trees: If the cell at pCell was
8060       ** previously stored on a leaf node, and its reported size was 4
8061       ** bytes, then it may actually be smaller than this
8062       ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
8063       ** any cell). But it is important to pass the correct size to
8064       ** insertCell(), so reparse the cell now.
8065       **
8066       ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
8067       ** and WITHOUT ROWID tables with exactly one column which is the
8068       ** primary key.
8069       */
8070       if( b.szCell[j]==4 ){
8071         assert(leafCorrection==4);
8072         sz = pParent->xCellSize(pParent, pCell);
8073       }
8074     }
8075     iOvflSpace += sz;
8076     assert( sz<=pBt->maxLocal+23 );
8077     assert( iOvflSpace <= (int)pBt->pageSize );
8078     insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
8079     if( rc!=SQLITE_OK ) goto balance_cleanup;
8080     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
8081   }
8082 
8083   /* Now update the actual sibling pages. The order in which they are updated
8084   ** is important, as this code needs to avoid disrupting any page from which
8085   ** cells may still to be read. In practice, this means:
8086   **
8087   **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
8088   **      then it is not safe to update page apNew[iPg] until after
8089   **      the left-hand sibling apNew[iPg-1] has been updated.
8090   **
8091   **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
8092   **      then it is not safe to update page apNew[iPg] until after
8093   **      the right-hand sibling apNew[iPg+1] has been updated.
8094   **
8095   ** If neither of the above apply, the page is safe to update.
8096   **
8097   ** The iPg value in the following loop starts at nNew-1 goes down
8098   ** to 0, then back up to nNew-1 again, thus making two passes over
8099   ** the pages.  On the initial downward pass, only condition (1) above
8100   ** needs to be tested because (2) will always be true from the previous
8101   ** step.  On the upward pass, both conditions are always true, so the
8102   ** upwards pass simply processes pages that were missed on the downward
8103   ** pass.
8104   */
8105   for(i=1-nNew; i<nNew; i++){
8106     int iPg = i<0 ? -i : i;
8107     assert( iPg>=0 && iPg<nNew );
8108     if( abDone[iPg] ) continue;         /* Skip pages already processed */
8109     if( i>=0                            /* On the upwards pass, or... */
8110      || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
8111     ){
8112       int iNew;
8113       int iOld;
8114       int nNewCell;
8115 
8116       /* Verify condition (1):  If cells are moving left, update iPg
8117       ** only after iPg-1 has already been updated. */
8118       assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
8119 
8120       /* Verify condition (2):  If cells are moving right, update iPg
8121       ** only after iPg+1 has already been updated. */
8122       assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
8123 
8124       if( iPg==0 ){
8125         iNew = iOld = 0;
8126         nNewCell = cntNew[0];
8127       }else{
8128         iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
8129         iNew = cntNew[iPg-1] + !leafData;
8130         nNewCell = cntNew[iPg] - iNew;
8131       }
8132 
8133       rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
8134       if( rc ) goto balance_cleanup;
8135       abDone[iPg]++;
8136       apNew[iPg]->nFree = usableSpace-szNew[iPg];
8137       assert( apNew[iPg]->nOverflow==0 );
8138       assert( apNew[iPg]->nCell==nNewCell );
8139     }
8140   }
8141 
8142   /* All pages have been processed exactly once */
8143   assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
8144 
8145   assert( nOld>0 );
8146   assert( nNew>0 );
8147 
8148   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
8149     /* The root page of the b-tree now contains no cells. The only sibling
8150     ** page is the right-child of the parent. Copy the contents of the
8151     ** child page into the parent, decreasing the overall height of the
8152     ** b-tree structure by one. This is described as the "balance-shallower"
8153     ** sub-algorithm in some documentation.
8154     **
8155     ** If this is an auto-vacuum database, the call to copyNodeContent()
8156     ** sets all pointer-map entries corresponding to database image pages
8157     ** for which the pointer is stored within the content being copied.
8158     **
8159     ** It is critical that the child page be defragmented before being
8160     ** copied into the parent, because if the parent is page 1 then it will
8161     ** by smaller than the child due to the database header, and so all the
8162     ** free space needs to be up front.
8163     */
8164     assert( nNew==1 || CORRUPT_DB );
8165     rc = defragmentPage(apNew[0], -1);
8166     testcase( rc!=SQLITE_OK );
8167     assert( apNew[0]->nFree ==
8168         (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset
8169           - apNew[0]->nCell*2)
8170       || rc!=SQLITE_OK
8171     );
8172     copyNodeContent(apNew[0], pParent, &rc);
8173     freePage(apNew[0], &rc);
8174   }else if( ISAUTOVACUUM && !leafCorrection ){
8175     /* Fix the pointer map entries associated with the right-child of each
8176     ** sibling page. All other pointer map entries have already been taken
8177     ** care of.  */
8178     for(i=0; i<nNew; i++){
8179       u32 key = get4byte(&apNew[i]->aData[8]);
8180       ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
8181     }
8182   }
8183 
8184   assert( pParent->isInit );
8185   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
8186           nOld, nNew, b.nCell));
8187 
8188   /* Free any old pages that were not reused as new pages.
8189   */
8190   for(i=nNew; i<nOld; i++){
8191     freePage(apOld[i], &rc);
8192   }
8193 
8194 #if 0
8195   if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
8196     /* The ptrmapCheckPages() contains assert() statements that verify that
8197     ** all pointer map pages are set correctly. This is helpful while
8198     ** debugging. This is usually disabled because a corrupt database may
8199     ** cause an assert() statement to fail.  */
8200     ptrmapCheckPages(apNew, nNew);
8201     ptrmapCheckPages(&pParent, 1);
8202   }
8203 #endif
8204 
8205   /*
8206   ** Cleanup before returning.
8207   */
8208 balance_cleanup:
8209   sqlite3StackFree(0, b.apCell);
8210   for(i=0; i<nOld; i++){
8211     releasePage(apOld[i]);
8212   }
8213   for(i=0; i<nNew; i++){
8214     releasePage(apNew[i]);
8215   }
8216 
8217   return rc;
8218 }
8219 
8220 
8221 /*
8222 ** This function is called when the root page of a b-tree structure is
8223 ** overfull (has one or more overflow pages).
8224 **
8225 ** A new child page is allocated and the contents of the current root
8226 ** page, including overflow cells, are copied into the child. The root
8227 ** page is then overwritten to make it an empty page with the right-child
8228 ** pointer pointing to the new page.
8229 **
8230 ** Before returning, all pointer-map entries corresponding to pages
8231 ** that the new child-page now contains pointers to are updated. The
8232 ** entry corresponding to the new right-child pointer of the root
8233 ** page is also updated.
8234 **
8235 ** If successful, *ppChild is set to contain a reference to the child
8236 ** page and SQLITE_OK is returned. In this case the caller is required
8237 ** to call releasePage() on *ppChild exactly once. If an error occurs,
8238 ** an error code is returned and *ppChild is set to 0.
8239 */
8240 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
8241   int rc;                        /* Return value from subprocedures */
8242   MemPage *pChild = 0;           /* Pointer to a new child page */
8243   Pgno pgnoChild = 0;            /* Page number of the new child page */
8244   BtShared *pBt = pRoot->pBt;    /* The BTree */
8245 
8246   assert( pRoot->nOverflow>0 );
8247   assert( sqlite3_mutex_held(pBt->mutex) );
8248 
8249   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
8250   ** page that will become the new right-child of pPage. Copy the contents
8251   ** of the node stored on pRoot into the new child page.
8252   */
8253   rc = sqlite3PagerWrite(pRoot->pDbPage);
8254   if( rc==SQLITE_OK ){
8255     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
8256     copyNodeContent(pRoot, pChild, &rc);
8257     if( ISAUTOVACUUM ){
8258       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
8259     }
8260   }
8261   if( rc ){
8262     *ppChild = 0;
8263     releasePage(pChild);
8264     return rc;
8265   }
8266   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
8267   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
8268   assert( pChild->nCell==pRoot->nCell || CORRUPT_DB );
8269 
8270   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
8271 
8272   /* Copy the overflow cells from pRoot to pChild */
8273   memcpy(pChild->aiOvfl, pRoot->aiOvfl,
8274          pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
8275   memcpy(pChild->apOvfl, pRoot->apOvfl,
8276          pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
8277   pChild->nOverflow = pRoot->nOverflow;
8278 
8279   /* Zero the contents of pRoot. Then install pChild as the right-child. */
8280   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
8281   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
8282 
8283   *ppChild = pChild;
8284   return SQLITE_OK;
8285 }
8286 
8287 /*
8288 ** The page that pCur currently points to has just been modified in
8289 ** some way. This function figures out if this modification means the
8290 ** tree needs to be balanced, and if so calls the appropriate balancing
8291 ** routine. Balancing routines are:
8292 **
8293 **   balance_quick()
8294 **   balance_deeper()
8295 **   balance_nonroot()
8296 */
8297 static int balance(BtCursor *pCur){
8298   int rc = SQLITE_OK;
8299   const int nMin = pCur->pBt->usableSize * 2 / 3;
8300   u8 aBalanceQuickSpace[13];
8301   u8 *pFree = 0;
8302 
8303   VVA_ONLY( int balance_quick_called = 0 );
8304   VVA_ONLY( int balance_deeper_called = 0 );
8305 
8306   do {
8307     int iPage;
8308     MemPage *pPage = pCur->pPage;
8309 
8310     if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break;
8311     if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
8312       break;
8313     }else if( (iPage = pCur->iPage)==0 ){
8314       if( pPage->nOverflow ){
8315         /* The root page of the b-tree is overfull. In this case call the
8316         ** balance_deeper() function to create a new child for the root-page
8317         ** and copy the current contents of the root-page to it. The
8318         ** next iteration of the do-loop will balance the child page.
8319         */
8320         assert( balance_deeper_called==0 );
8321         VVA_ONLY( balance_deeper_called++ );
8322         rc = balance_deeper(pPage, &pCur->apPage[1]);
8323         if( rc==SQLITE_OK ){
8324           pCur->iPage = 1;
8325           pCur->ix = 0;
8326           pCur->aiIdx[0] = 0;
8327           pCur->apPage[0] = pPage;
8328           pCur->pPage = pCur->apPage[1];
8329           assert( pCur->pPage->nOverflow );
8330         }
8331       }else{
8332         break;
8333       }
8334     }else{
8335       MemPage * const pParent = pCur->apPage[iPage-1];
8336       int const iIdx = pCur->aiIdx[iPage-1];
8337 
8338       rc = sqlite3PagerWrite(pParent->pDbPage);
8339       if( rc==SQLITE_OK && pParent->nFree<0 ){
8340         rc = btreeComputeFreeSpace(pParent);
8341       }
8342       if( rc==SQLITE_OK ){
8343 #ifndef SQLITE_OMIT_QUICKBALANCE
8344         if( pPage->intKeyLeaf
8345          && pPage->nOverflow==1
8346          && pPage->aiOvfl[0]==pPage->nCell
8347          && pParent->pgno!=1
8348          && pParent->nCell==iIdx
8349         ){
8350           /* Call balance_quick() to create a new sibling of pPage on which
8351           ** to store the overflow cell. balance_quick() inserts a new cell
8352           ** into pParent, which may cause pParent overflow. If this
8353           ** happens, the next iteration of the do-loop will balance pParent
8354           ** use either balance_nonroot() or balance_deeper(). Until this
8355           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
8356           ** buffer.
8357           **
8358           ** The purpose of the following assert() is to check that only a
8359           ** single call to balance_quick() is made for each call to this
8360           ** function. If this were not verified, a subtle bug involving reuse
8361           ** of the aBalanceQuickSpace[] might sneak in.
8362           */
8363           assert( balance_quick_called==0 );
8364           VVA_ONLY( balance_quick_called++ );
8365           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
8366         }else
8367 #endif
8368         {
8369           /* In this case, call balance_nonroot() to redistribute cells
8370           ** between pPage and up to 2 of its sibling pages. This involves
8371           ** modifying the contents of pParent, which may cause pParent to
8372           ** become overfull or underfull. The next iteration of the do-loop
8373           ** will balance the parent page to correct this.
8374           **
8375           ** If the parent page becomes overfull, the overflow cell or cells
8376           ** are stored in the pSpace buffer allocated immediately below.
8377           ** A subsequent iteration of the do-loop will deal with this by
8378           ** calling balance_nonroot() (balance_deeper() may be called first,
8379           ** but it doesn't deal with overflow cells - just moves them to a
8380           ** different page). Once this subsequent call to balance_nonroot()
8381           ** has completed, it is safe to release the pSpace buffer used by
8382           ** the previous call, as the overflow cell data will have been
8383           ** copied either into the body of a database page or into the new
8384           ** pSpace buffer passed to the latter call to balance_nonroot().
8385           */
8386           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
8387           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
8388                                pCur->hints&BTREE_BULKLOAD);
8389           if( pFree ){
8390             /* If pFree is not NULL, it points to the pSpace buffer used
8391             ** by a previous call to balance_nonroot(). Its contents are
8392             ** now stored either on real database pages or within the
8393             ** new pSpace buffer, so it may be safely freed here. */
8394             sqlite3PageFree(pFree);
8395           }
8396 
8397           /* The pSpace buffer will be freed after the next call to
8398           ** balance_nonroot(), or just before this function returns, whichever
8399           ** comes first. */
8400           pFree = pSpace;
8401         }
8402       }
8403 
8404       pPage->nOverflow = 0;
8405 
8406       /* The next iteration of the do-loop balances the parent page. */
8407       releasePage(pPage);
8408       pCur->iPage--;
8409       assert( pCur->iPage>=0 );
8410       pCur->pPage = pCur->apPage[pCur->iPage];
8411     }
8412   }while( rc==SQLITE_OK );
8413 
8414   if( pFree ){
8415     sqlite3PageFree(pFree);
8416   }
8417   return rc;
8418 }
8419 
8420 /* Overwrite content from pX into pDest.  Only do the write if the
8421 ** content is different from what is already there.
8422 */
8423 static int btreeOverwriteContent(
8424   MemPage *pPage,           /* MemPage on which writing will occur */
8425   u8 *pDest,                /* Pointer to the place to start writing */
8426   const BtreePayload *pX,   /* Source of data to write */
8427   int iOffset,              /* Offset of first byte to write */
8428   int iAmt                  /* Number of bytes to be written */
8429 ){
8430   int nData = pX->nData - iOffset;
8431   if( nData<=0 ){
8432     /* Overwritting with zeros */
8433     int i;
8434     for(i=0; i<iAmt && pDest[i]==0; i++){}
8435     if( i<iAmt ){
8436       int rc = sqlite3PagerWrite(pPage->pDbPage);
8437       if( rc ) return rc;
8438       memset(pDest + i, 0, iAmt - i);
8439     }
8440   }else{
8441     if( nData<iAmt ){
8442       /* Mixed read data and zeros at the end.  Make a recursive call
8443       ** to write the zeros then fall through to write the real data */
8444       int rc = btreeOverwriteContent(pPage, pDest+nData, pX, iOffset+nData,
8445                                  iAmt-nData);
8446       if( rc ) return rc;
8447       iAmt = nData;
8448     }
8449     if( memcmp(pDest, ((u8*)pX->pData) + iOffset, iAmt)!=0 ){
8450       int rc = sqlite3PagerWrite(pPage->pDbPage);
8451       if( rc ) return rc;
8452       /* In a corrupt database, it is possible for the source and destination
8453       ** buffers to overlap.  This is harmless since the database is already
8454       ** corrupt but it does cause valgrind and ASAN warnings.  So use
8455       ** memmove(). */
8456       memmove(pDest, ((u8*)pX->pData) + iOffset, iAmt);
8457     }
8458   }
8459   return SQLITE_OK;
8460 }
8461 
8462 /*
8463 ** Overwrite the cell that cursor pCur is pointing to with fresh content
8464 ** contained in pX.
8465 */
8466 static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){
8467   int iOffset;                        /* Next byte of pX->pData to write */
8468   int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */
8469   int rc;                             /* Return code */
8470   MemPage *pPage = pCur->pPage;       /* Page being written */
8471   BtShared *pBt;                      /* Btree */
8472   Pgno ovflPgno;                      /* Next overflow page to write */
8473   u32 ovflPageSize;                   /* Size to write on overflow page */
8474 
8475   if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd ){
8476     return SQLITE_CORRUPT_BKPT;
8477   }
8478   /* Overwrite the local portion first */
8479   rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX,
8480                              0, pCur->info.nLocal);
8481   if( rc ) return rc;
8482   if( pCur->info.nLocal==nTotal ) return SQLITE_OK;
8483 
8484   /* Now overwrite the overflow pages */
8485   iOffset = pCur->info.nLocal;
8486   assert( nTotal>=0 );
8487   assert( iOffset>=0 );
8488   ovflPgno = get4byte(pCur->info.pPayload + iOffset);
8489   pBt = pPage->pBt;
8490   ovflPageSize = pBt->usableSize - 4;
8491   do{
8492     rc = btreeGetPage(pBt, ovflPgno, &pPage, 0);
8493     if( rc ) return rc;
8494     if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 ){
8495       rc = SQLITE_CORRUPT_BKPT;
8496     }else{
8497       if( iOffset+ovflPageSize<(u32)nTotal ){
8498         ovflPgno = get4byte(pPage->aData);
8499       }else{
8500         ovflPageSize = nTotal - iOffset;
8501       }
8502       rc = btreeOverwriteContent(pPage, pPage->aData+4, pX,
8503                                  iOffset, ovflPageSize);
8504     }
8505     sqlite3PagerUnref(pPage->pDbPage);
8506     if( rc ) return rc;
8507     iOffset += ovflPageSize;
8508   }while( iOffset<nTotal );
8509   return SQLITE_OK;
8510 }
8511 
8512 
8513 /*
8514 ** Insert a new record into the BTree.  The content of the new record
8515 ** is described by the pX object.  The pCur cursor is used only to
8516 ** define what table the record should be inserted into, and is left
8517 ** pointing at a random location.
8518 **
8519 ** For a table btree (used for rowid tables), only the pX.nKey value of
8520 ** the key is used. The pX.pKey value must be NULL.  The pX.nKey is the
8521 ** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
8522 ** hold the content of the row.
8523 **
8524 ** For an index btree (used for indexes and WITHOUT ROWID tables), the
8525 ** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The
8526 ** pX.pData,nData,nZero fields must be zero.
8527 **
8528 ** If the seekResult parameter is non-zero, then a successful call to
8529 ** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
8530 ** been performed.  In other words, if seekResult!=0 then the cursor
8531 ** is currently pointing to a cell that will be adjacent to the cell
8532 ** to be inserted.  If seekResult<0 then pCur points to a cell that is
8533 ** smaller then (pKey,nKey).  If seekResult>0 then pCur points to a cell
8534 ** that is larger than (pKey,nKey).
8535 **
8536 ** If seekResult==0, that means pCur is pointing at some unknown location.
8537 ** In that case, this routine must seek the cursor to the correct insertion
8538 ** point for (pKey,nKey) before doing the insertion.  For index btrees,
8539 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
8540 ** key values and pX->aMem can be used instead of pX->pKey to avoid having
8541 ** to decode the key.
8542 */
8543 int sqlite3BtreeInsert(
8544   BtCursor *pCur,                /* Insert data into the table of this cursor */
8545   const BtreePayload *pX,        /* Content of the row to be inserted */
8546   int flags,                     /* True if this is likely an append */
8547   int seekResult                 /* Result of prior MovetoUnpacked() call */
8548 ){
8549   int rc;
8550   int loc = seekResult;          /* -1: before desired location  +1: after */
8551   int szNew = 0;
8552   int idx;
8553   MemPage *pPage;
8554   Btree *p = pCur->pBtree;
8555   BtShared *pBt = p->pBt;
8556   unsigned char *oldCell;
8557   unsigned char *newCell = 0;
8558 
8559   assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND))==flags );
8560 
8561   if( pCur->eState==CURSOR_FAULT ){
8562     assert( pCur->skipNext!=SQLITE_OK );
8563     return pCur->skipNext;
8564   }
8565 
8566   assert( cursorOwnsBtShared(pCur) );
8567   assert( (pCur->curFlags & BTCF_WriteFlag)!=0
8568               && pBt->inTransaction==TRANS_WRITE
8569               && (pBt->btsFlags & BTS_READ_ONLY)==0 );
8570   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
8571 
8572   /* Assert that the caller has been consistent. If this cursor was opened
8573   ** expecting an index b-tree, then the caller should be inserting blob
8574   ** keys with no associated data. If the cursor was opened expecting an
8575   ** intkey table, the caller should be inserting integer keys with a
8576   ** blob of associated data.  */
8577   assert( (pX->pKey==0)==(pCur->pKeyInfo==0) );
8578 
8579   /* Save the positions of any other cursors open on this table.
8580   **
8581   ** In some cases, the call to btreeMoveto() below is a no-op. For
8582   ** example, when inserting data into a table with auto-generated integer
8583   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
8584   ** integer key to use. It then calls this function to actually insert the
8585   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
8586   ** that the cursor is already where it needs to be and returns without
8587   ** doing any work. To avoid thwarting these optimizations, it is important
8588   ** not to clear the cursor here.
8589   */
8590   if( pCur->curFlags & BTCF_Multiple ){
8591     rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
8592     if( rc ) return rc;
8593   }
8594 
8595   if( pCur->pKeyInfo==0 ){
8596     assert( pX->pKey==0 );
8597     /* If this is an insert into a table b-tree, invalidate any incrblob
8598     ** cursors open on the row being replaced */
8599     invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0);
8600 
8601     /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
8602     ** to a row with the same key as the new entry being inserted.
8603     */
8604 #ifdef SQLITE_DEBUG
8605     if( flags & BTREE_SAVEPOSITION ){
8606       assert( pCur->curFlags & BTCF_ValidNKey );
8607       assert( pX->nKey==pCur->info.nKey );
8608       assert( pCur->info.nSize!=0 );
8609       assert( loc==0 );
8610     }
8611 #endif
8612 
8613     /* On the other hand, BTREE_SAVEPOSITION==0 does not imply
8614     ** that the cursor is not pointing to a row to be overwritten.
8615     ** So do a complete check.
8616     */
8617     if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
8618       /* The cursor is pointing to the entry that is to be
8619       ** overwritten */
8620       assert( pX->nData>=0 && pX->nZero>=0 );
8621       if( pCur->info.nSize!=0
8622        && pCur->info.nPayload==(u32)pX->nData+pX->nZero
8623       ){
8624         /* New entry is the same size as the old.  Do an overwrite */
8625         return btreeOverwriteCell(pCur, pX);
8626       }
8627       assert( loc==0 );
8628     }else if( loc==0 ){
8629       /* The cursor is *not* pointing to the cell to be overwritten, nor
8630       ** to an adjacent cell.  Move the cursor so that it is pointing either
8631       ** to the cell to be overwritten or an adjacent cell.
8632       */
8633       rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc);
8634       if( rc ) return rc;
8635     }
8636   }else{
8637     /* This is an index or a WITHOUT ROWID table */
8638 
8639     /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
8640     ** to a row with the same key as the new entry being inserted.
8641     */
8642     assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 );
8643 
8644     /* If the cursor is not already pointing either to the cell to be
8645     ** overwritten, or if a new cell is being inserted, if the cursor is
8646     ** not pointing to an immediately adjacent cell, then move the cursor
8647     ** so that it does.
8648     */
8649     if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
8650       if( pX->nMem ){
8651         UnpackedRecord r;
8652         r.pKeyInfo = pCur->pKeyInfo;
8653         r.aMem = pX->aMem;
8654         r.nField = pX->nMem;
8655         r.default_rc = 0;
8656         r.errCode = 0;
8657         r.r1 = 0;
8658         r.r2 = 0;
8659         r.eqSeen = 0;
8660         rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc);
8661       }else{
8662         rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc);
8663       }
8664       if( rc ) return rc;
8665     }
8666 
8667     /* If the cursor is currently pointing to an entry to be overwritten
8668     ** and the new content is the same as as the old, then use the
8669     ** overwrite optimization.
8670     */
8671     if( loc==0 ){
8672       getCellInfo(pCur);
8673       if( pCur->info.nKey==pX->nKey ){
8674         BtreePayload x2;
8675         x2.pData = pX->pKey;
8676         x2.nData = pX->nKey;
8677         x2.nZero = 0;
8678         return btreeOverwriteCell(pCur, &x2);
8679       }
8680     }
8681 
8682   }
8683   assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
8684 
8685   pPage = pCur->pPage;
8686   assert( pPage->intKey || pX->nKey>=0 );
8687   assert( pPage->leaf || !pPage->intKey );
8688   if( pPage->nFree<0 ){
8689     rc = btreeComputeFreeSpace(pPage);
8690     if( rc ) return rc;
8691   }
8692 
8693   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
8694           pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
8695           loc==0 ? "overwrite" : "new entry"));
8696   assert( pPage->isInit );
8697   newCell = pBt->pTmpSpace;
8698   assert( newCell!=0 );
8699   rc = fillInCell(pPage, newCell, pX, &szNew);
8700   if( rc ) goto end_insert;
8701   assert( szNew==pPage->xCellSize(pPage, newCell) );
8702   assert( szNew <= MX_CELL_SIZE(pBt) );
8703   idx = pCur->ix;
8704   if( loc==0 ){
8705     CellInfo info;
8706     assert( idx<pPage->nCell );
8707     rc = sqlite3PagerWrite(pPage->pDbPage);
8708     if( rc ){
8709       goto end_insert;
8710     }
8711     oldCell = findCell(pPage, idx);
8712     if( !pPage->leaf ){
8713       memcpy(newCell, oldCell, 4);
8714     }
8715     rc = clearCell(pPage, oldCell, &info);
8716     if( info.nSize==szNew && info.nLocal==info.nPayload
8717      && (!ISAUTOVACUUM || szNew<pPage->minLocal)
8718     ){
8719       /* Overwrite the old cell with the new if they are the same size.
8720       ** We could also try to do this if the old cell is smaller, then add
8721       ** the leftover space to the free list.  But experiments show that
8722       ** doing that is no faster then skipping this optimization and just
8723       ** calling dropCell() and insertCell().
8724       **
8725       ** This optimization cannot be used on an autovacuum database if the
8726       ** new entry uses overflow pages, as the insertCell() call below is
8727       ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry.  */
8728       assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
8729       if( oldCell < pPage->aData+pPage->hdrOffset+10 ){
8730         return SQLITE_CORRUPT_BKPT;
8731       }
8732       if( oldCell+szNew > pPage->aDataEnd ){
8733         return SQLITE_CORRUPT_BKPT;
8734       }
8735       memcpy(oldCell, newCell, szNew);
8736       return SQLITE_OK;
8737     }
8738     dropCell(pPage, idx, info.nSize, &rc);
8739     if( rc ) goto end_insert;
8740   }else if( loc<0 && pPage->nCell>0 ){
8741     assert( pPage->leaf );
8742     idx = ++pCur->ix;
8743     pCur->curFlags &= ~BTCF_ValidNKey;
8744   }else{
8745     assert( pPage->leaf );
8746   }
8747   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
8748   assert( pPage->nOverflow==0 || rc==SQLITE_OK );
8749   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
8750 
8751   /* If no error has occurred and pPage has an overflow cell, call balance()
8752   ** to redistribute the cells within the tree. Since balance() may move
8753   ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
8754   ** variables.
8755   **
8756   ** Previous versions of SQLite called moveToRoot() to move the cursor
8757   ** back to the root page as balance() used to invalidate the contents
8758   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
8759   ** set the cursor state to "invalid". This makes common insert operations
8760   ** slightly faster.
8761   **
8762   ** There is a subtle but important optimization here too. When inserting
8763   ** multiple records into an intkey b-tree using a single cursor (as can
8764   ** happen while processing an "INSERT INTO ... SELECT" statement), it
8765   ** is advantageous to leave the cursor pointing to the last entry in
8766   ** the b-tree if possible. If the cursor is left pointing to the last
8767   ** entry in the table, and the next row inserted has an integer key
8768   ** larger than the largest existing key, it is possible to insert the
8769   ** row without seeking the cursor. This can be a big performance boost.
8770   */
8771   pCur->info.nSize = 0;
8772   if( pPage->nOverflow ){
8773     assert( rc==SQLITE_OK );
8774     pCur->curFlags &= ~(BTCF_ValidNKey);
8775     rc = balance(pCur);
8776 
8777     /* Must make sure nOverflow is reset to zero even if the balance()
8778     ** fails. Internal data structure corruption will result otherwise.
8779     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
8780     ** from trying to save the current position of the cursor.  */
8781     pCur->pPage->nOverflow = 0;
8782     pCur->eState = CURSOR_INVALID;
8783     if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
8784       btreeReleaseAllCursorPages(pCur);
8785       if( pCur->pKeyInfo ){
8786         assert( pCur->pKey==0 );
8787         pCur->pKey = sqlite3Malloc( pX->nKey );
8788         if( pCur->pKey==0 ){
8789           rc = SQLITE_NOMEM;
8790         }else{
8791           memcpy(pCur->pKey, pX->pKey, pX->nKey);
8792         }
8793       }
8794       pCur->eState = CURSOR_REQUIRESEEK;
8795       pCur->nKey = pX->nKey;
8796     }
8797   }
8798   assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 );
8799 
8800 end_insert:
8801   return rc;
8802 }
8803 
8804 /*
8805 ** Delete the entry that the cursor is pointing to.
8806 **
8807 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
8808 ** the cursor is left pointing at an arbitrary location after the delete.
8809 ** But if that bit is set, then the cursor is left in a state such that
8810 ** the next call to BtreeNext() or BtreePrev() moves it to the same row
8811 ** as it would have been on if the call to BtreeDelete() had been omitted.
8812 **
8813 ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes
8814 ** associated with a single table entry and its indexes.  Only one of those
8815 ** deletes is considered the "primary" delete.  The primary delete occurs
8816 ** on a cursor that is not a BTREE_FORDELETE cursor.  All but one delete
8817 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag.
8818 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation,
8819 ** but which might be used by alternative storage engines.
8820 */
8821 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
8822   Btree *p = pCur->pBtree;
8823   BtShared *pBt = p->pBt;
8824   int rc;                              /* Return code */
8825   MemPage *pPage;                      /* Page to delete cell from */
8826   unsigned char *pCell;                /* Pointer to cell to delete */
8827   int iCellIdx;                        /* Index of cell to delete */
8828   int iCellDepth;                      /* Depth of node containing pCell */
8829   CellInfo info;                       /* Size of the cell being deleted */
8830   int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
8831   u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */
8832 
8833   assert( cursorOwnsBtShared(pCur) );
8834   assert( pBt->inTransaction==TRANS_WRITE );
8835   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
8836   assert( pCur->curFlags & BTCF_WriteFlag );
8837   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
8838   assert( !hasReadConflicts(p, pCur->pgnoRoot) );
8839   assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );
8840   if( pCur->eState==CURSOR_REQUIRESEEK ){
8841     rc = btreeRestoreCursorPosition(pCur);
8842     if( rc ) return rc;
8843   }
8844   assert( pCur->eState==CURSOR_VALID );
8845 
8846   iCellDepth = pCur->iPage;
8847   iCellIdx = pCur->ix;
8848   pPage = pCur->pPage;
8849   pCell = findCell(pPage, iCellIdx);
8850   if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ) return SQLITE_CORRUPT;
8851 
8852   /* If the bPreserve flag is set to true, then the cursor position must
8853   ** be preserved following this delete operation. If the current delete
8854   ** will cause a b-tree rebalance, then this is done by saving the cursor
8855   ** key and leaving the cursor in CURSOR_REQUIRESEEK state before
8856   ** returning.
8857   **
8858   ** Or, if the current delete will not cause a rebalance, then the cursor
8859   ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
8860   ** before or after the deleted entry. In this case set bSkipnext to true.  */
8861   if( bPreserve ){
8862     if( !pPage->leaf
8863      || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
8864      || pPage->nCell==1  /* See dbfuzz001.test for a test case */
8865     ){
8866       /* A b-tree rebalance will be required after deleting this entry.
8867       ** Save the cursor key.  */
8868       rc = saveCursorKey(pCur);
8869       if( rc ) return rc;
8870     }else{
8871       bSkipnext = 1;
8872     }
8873   }
8874 
8875   /* If the page containing the entry to delete is not a leaf page, move
8876   ** the cursor to the largest entry in the tree that is smaller than
8877   ** the entry being deleted. This cell will replace the cell being deleted
8878   ** from the internal node. The 'previous' entry is used for this instead
8879   ** of the 'next' entry, as the previous entry is always a part of the
8880   ** sub-tree headed by the child page of the cell being deleted. This makes
8881   ** balancing the tree following the delete operation easier.  */
8882   if( !pPage->leaf ){
8883     rc = sqlite3BtreePrevious(pCur, 0);
8884     assert( rc!=SQLITE_DONE );
8885     if( rc ) return rc;
8886   }
8887 
8888   /* Save the positions of any other cursors open on this table before
8889   ** making any modifications.  */
8890   if( pCur->curFlags & BTCF_Multiple ){
8891     rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
8892     if( rc ) return rc;
8893   }
8894 
8895   /* If this is a delete operation to remove a row from a table b-tree,
8896   ** invalidate any incrblob cursors open on the row being deleted.  */
8897   if( pCur->pKeyInfo==0 ){
8898     invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0);
8899   }
8900 
8901   /* Make the page containing the entry to be deleted writable. Then free any
8902   ** overflow pages associated with the entry and finally remove the cell
8903   ** itself from within the page.  */
8904   rc = sqlite3PagerWrite(pPage->pDbPage);
8905   if( rc ) return rc;
8906   rc = clearCell(pPage, pCell, &info);
8907   dropCell(pPage, iCellIdx, info.nSize, &rc);
8908   if( rc ) return rc;
8909 
8910   /* If the cell deleted was not located on a leaf page, then the cursor
8911   ** is currently pointing to the largest entry in the sub-tree headed
8912   ** by the child-page of the cell that was just deleted from an internal
8913   ** node. The cell from the leaf node needs to be moved to the internal
8914   ** node to replace the deleted cell.  */
8915   if( !pPage->leaf ){
8916     MemPage *pLeaf = pCur->pPage;
8917     int nCell;
8918     Pgno n;
8919     unsigned char *pTmp;
8920 
8921     if( pLeaf->nFree<0 ){
8922       rc = btreeComputeFreeSpace(pLeaf);
8923       if( rc ) return rc;
8924     }
8925     if( iCellDepth<pCur->iPage-1 ){
8926       n = pCur->apPage[iCellDepth+1]->pgno;
8927     }else{
8928       n = pCur->pPage->pgno;
8929     }
8930     pCell = findCell(pLeaf, pLeaf->nCell-1);
8931     if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
8932     nCell = pLeaf->xCellSize(pLeaf, pCell);
8933     assert( MX_CELL_SIZE(pBt) >= nCell );
8934     pTmp = pBt->pTmpSpace;
8935     assert( pTmp!=0 );
8936     rc = sqlite3PagerWrite(pLeaf->pDbPage);
8937     if( rc==SQLITE_OK ){
8938       insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
8939     }
8940     dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
8941     if( rc ) return rc;
8942   }
8943 
8944   /* Balance the tree. If the entry deleted was located on a leaf page,
8945   ** then the cursor still points to that page. In this case the first
8946   ** call to balance() repairs the tree, and the if(...) condition is
8947   ** never true.
8948   **
8949   ** Otherwise, if the entry deleted was on an internal node page, then
8950   ** pCur is pointing to the leaf page from which a cell was removed to
8951   ** replace the cell deleted from the internal node. This is slightly
8952   ** tricky as the leaf node may be underfull, and the internal node may
8953   ** be either under or overfull. In this case run the balancing algorithm
8954   ** on the leaf node first. If the balance proceeds far enough up the
8955   ** tree that we can be sure that any problem in the internal node has
8956   ** been corrected, so be it. Otherwise, after balancing the leaf node,
8957   ** walk the cursor up the tree to the internal node and balance it as
8958   ** well.  */
8959   rc = balance(pCur);
8960   if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
8961     releasePageNotNull(pCur->pPage);
8962     pCur->iPage--;
8963     while( pCur->iPage>iCellDepth ){
8964       releasePage(pCur->apPage[pCur->iPage--]);
8965     }
8966     pCur->pPage = pCur->apPage[pCur->iPage];
8967     rc = balance(pCur);
8968   }
8969 
8970   if( rc==SQLITE_OK ){
8971     if( bSkipnext ){
8972       assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
8973       assert( pPage==pCur->pPage || CORRUPT_DB );
8974       assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
8975       pCur->eState = CURSOR_SKIPNEXT;
8976       if( iCellIdx>=pPage->nCell ){
8977         pCur->skipNext = -1;
8978         pCur->ix = pPage->nCell-1;
8979       }else{
8980         pCur->skipNext = 1;
8981       }
8982     }else{
8983       rc = moveToRoot(pCur);
8984       if( bPreserve ){
8985         btreeReleaseAllCursorPages(pCur);
8986         pCur->eState = CURSOR_REQUIRESEEK;
8987       }
8988       if( rc==SQLITE_EMPTY ) rc = SQLITE_OK;
8989     }
8990   }
8991   return rc;
8992 }
8993 
8994 /*
8995 ** Create a new BTree table.  Write into *piTable the page
8996 ** number for the root page of the new table.
8997 **
8998 ** The type of type is determined by the flags parameter.  Only the
8999 ** following values of flags are currently in use.  Other values for
9000 ** flags might not work:
9001 **
9002 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
9003 **     BTREE_ZERODATA                  Used for SQL indices
9004 */
9005 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
9006   BtShared *pBt = p->pBt;
9007   MemPage *pRoot;
9008   Pgno pgnoRoot;
9009   int rc;
9010   int ptfFlags;          /* Page-type flage for the root page of new table */
9011 
9012   assert( sqlite3BtreeHoldsMutex(p) );
9013   assert( pBt->inTransaction==TRANS_WRITE );
9014   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
9015 
9016 #ifdef SQLITE_OMIT_AUTOVACUUM
9017   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
9018   if( rc ){
9019     return rc;
9020   }
9021 #else
9022   if( pBt->autoVacuum ){
9023     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
9024     MemPage *pPageMove; /* The page to move to. */
9025 
9026     /* Creating a new table may probably require moving an existing database
9027     ** to make room for the new tables root page. In case this page turns
9028     ** out to be an overflow page, delete all overflow page-map caches
9029     ** held by open cursors.
9030     */
9031     invalidateAllOverflowCache(pBt);
9032 
9033     /* Read the value of meta[3] from the database to determine where the
9034     ** root page of the new table should go. meta[3] is the largest root-page
9035     ** created so far, so the new root-page is (meta[3]+1).
9036     */
9037     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
9038     pgnoRoot++;
9039 
9040     /* The new root-page may not be allocated on a pointer-map page, or the
9041     ** PENDING_BYTE page.
9042     */
9043     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
9044         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
9045       pgnoRoot++;
9046     }
9047     assert( pgnoRoot>=3 || CORRUPT_DB );
9048     testcase( pgnoRoot<3 );
9049 
9050     /* Allocate a page. The page that currently resides at pgnoRoot will
9051     ** be moved to the allocated page (unless the allocated page happens
9052     ** to reside at pgnoRoot).
9053     */
9054     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
9055     if( rc!=SQLITE_OK ){
9056       return rc;
9057     }
9058 
9059     if( pgnoMove!=pgnoRoot ){
9060       /* pgnoRoot is the page that will be used for the root-page of
9061       ** the new table (assuming an error did not occur). But we were
9062       ** allocated pgnoMove. If required (i.e. if it was not allocated
9063       ** by extending the file), the current page at position pgnoMove
9064       ** is already journaled.
9065       */
9066       u8 eType = 0;
9067       Pgno iPtrPage = 0;
9068 
9069       /* Save the positions of any open cursors. This is required in
9070       ** case they are holding a reference to an xFetch reference
9071       ** corresponding to page pgnoRoot.  */
9072       rc = saveAllCursors(pBt, 0, 0);
9073       releasePage(pPageMove);
9074       if( rc!=SQLITE_OK ){
9075         return rc;
9076       }
9077 
9078       /* Move the page currently at pgnoRoot to pgnoMove. */
9079       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
9080       if( rc!=SQLITE_OK ){
9081         return rc;
9082       }
9083       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
9084       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
9085         rc = SQLITE_CORRUPT_BKPT;
9086       }
9087       if( rc!=SQLITE_OK ){
9088         releasePage(pRoot);
9089         return rc;
9090       }
9091       assert( eType!=PTRMAP_ROOTPAGE );
9092       assert( eType!=PTRMAP_FREEPAGE );
9093       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
9094       releasePage(pRoot);
9095 
9096       /* Obtain the page at pgnoRoot */
9097       if( rc!=SQLITE_OK ){
9098         return rc;
9099       }
9100       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
9101       if( rc!=SQLITE_OK ){
9102         return rc;
9103       }
9104       rc = sqlite3PagerWrite(pRoot->pDbPage);
9105       if( rc!=SQLITE_OK ){
9106         releasePage(pRoot);
9107         return rc;
9108       }
9109     }else{
9110       pRoot = pPageMove;
9111     }
9112 
9113     /* Update the pointer-map and meta-data with the new root-page number. */
9114     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
9115     if( rc ){
9116       releasePage(pRoot);
9117       return rc;
9118     }
9119 
9120     /* When the new root page was allocated, page 1 was made writable in
9121     ** order either to increase the database filesize, or to decrement the
9122     ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
9123     */
9124     assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
9125     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
9126     if( NEVER(rc) ){
9127       releasePage(pRoot);
9128       return rc;
9129     }
9130 
9131   }else{
9132     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
9133     if( rc ) return rc;
9134   }
9135 #endif
9136   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
9137   if( createTabFlags & BTREE_INTKEY ){
9138     ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
9139   }else{
9140     ptfFlags = PTF_ZERODATA | PTF_LEAF;
9141   }
9142   zeroPage(pRoot, ptfFlags);
9143   sqlite3PagerUnref(pRoot->pDbPage);
9144   assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
9145   *piTable = (int)pgnoRoot;
9146   return SQLITE_OK;
9147 }
9148 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
9149   int rc;
9150   sqlite3BtreeEnter(p);
9151   rc = btreeCreateTable(p, piTable, flags);
9152   sqlite3BtreeLeave(p);
9153   return rc;
9154 }
9155 
9156 /*
9157 ** Erase the given database page and all its children.  Return
9158 ** the page to the freelist.
9159 */
9160 static int clearDatabasePage(
9161   BtShared *pBt,           /* The BTree that contains the table */
9162   Pgno pgno,               /* Page number to clear */
9163   int freePageFlag,        /* Deallocate page if true */
9164   int *pnChange            /* Add number of Cells freed to this counter */
9165 ){
9166   MemPage *pPage;
9167   int rc;
9168   unsigned char *pCell;
9169   int i;
9170   int hdr;
9171   CellInfo info;
9172 
9173   assert( sqlite3_mutex_held(pBt->mutex) );
9174   if( pgno>btreePagecount(pBt) ){
9175     return SQLITE_CORRUPT_BKPT;
9176   }
9177   rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
9178   if( rc ) return rc;
9179   if( pPage->bBusy ){
9180     rc = SQLITE_CORRUPT_BKPT;
9181     goto cleardatabasepage_out;
9182   }
9183   pPage->bBusy = 1;
9184   hdr = pPage->hdrOffset;
9185   for(i=0; i<pPage->nCell; i++){
9186     pCell = findCell(pPage, i);
9187     if( !pPage->leaf ){
9188       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
9189       if( rc ) goto cleardatabasepage_out;
9190     }
9191     rc = clearCell(pPage, pCell, &info);
9192     if( rc ) goto cleardatabasepage_out;
9193   }
9194   if( !pPage->leaf ){
9195     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
9196     if( rc ) goto cleardatabasepage_out;
9197   }else if( pnChange ){
9198     assert( pPage->intKey || CORRUPT_DB );
9199     testcase( !pPage->intKey );
9200     *pnChange += pPage->nCell;
9201   }
9202   if( freePageFlag ){
9203     freePage(pPage, &rc);
9204   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
9205     zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
9206   }
9207 
9208 cleardatabasepage_out:
9209   pPage->bBusy = 0;
9210   releasePage(pPage);
9211   return rc;
9212 }
9213 
9214 /*
9215 ** Delete all information from a single table in the database.  iTable is
9216 ** the page number of the root of the table.  After this routine returns,
9217 ** the root page is empty, but still exists.
9218 **
9219 ** This routine will fail with SQLITE_LOCKED if there are any open
9220 ** read cursors on the table.  Open write cursors are moved to the
9221 ** root of the table.
9222 **
9223 ** If pnChange is not NULL, then table iTable must be an intkey table. The
9224 ** integer value pointed to by pnChange is incremented by the number of
9225 ** entries in the table.
9226 */
9227 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
9228   int rc;
9229   BtShared *pBt = p->pBt;
9230   sqlite3BtreeEnter(p);
9231   assert( p->inTrans==TRANS_WRITE );
9232 
9233   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
9234 
9235   if( SQLITE_OK==rc ){
9236     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
9237     ** is the root of a table b-tree - if it is not, the following call is
9238     ** a no-op).  */
9239     invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1);
9240     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
9241   }
9242   sqlite3BtreeLeave(p);
9243   return rc;
9244 }
9245 
9246 /*
9247 ** Delete all information from the single table that pCur is open on.
9248 **
9249 ** This routine only work for pCur on an ephemeral table.
9250 */
9251 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
9252   return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
9253 }
9254 
9255 /*
9256 ** Erase all information in a table and add the root of the table to
9257 ** the freelist.  Except, the root of the principle table (the one on
9258 ** page 1) is never added to the freelist.
9259 **
9260 ** This routine will fail with SQLITE_LOCKED if there are any open
9261 ** cursors on the table.
9262 **
9263 ** If AUTOVACUUM is enabled and the page at iTable is not the last
9264 ** root page in the database file, then the last root page
9265 ** in the database file is moved into the slot formerly occupied by
9266 ** iTable and that last slot formerly occupied by the last root page
9267 ** is added to the freelist instead of iTable.  In this say, all
9268 ** root pages are kept at the beginning of the database file, which
9269 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
9270 ** page number that used to be the last root page in the file before
9271 ** the move.  If no page gets moved, *piMoved is set to 0.
9272 ** The last root page is recorded in meta[3] and the value of
9273 ** meta[3] is updated by this procedure.
9274 */
9275 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
9276   int rc;
9277   MemPage *pPage = 0;
9278   BtShared *pBt = p->pBt;
9279 
9280   assert( sqlite3BtreeHoldsMutex(p) );
9281   assert( p->inTrans==TRANS_WRITE );
9282   assert( iTable>=2 );
9283   if( iTable>btreePagecount(pBt) ){
9284     return SQLITE_CORRUPT_BKPT;
9285   }
9286 
9287   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
9288   if( rc ) return rc;
9289   rc = sqlite3BtreeClearTable(p, iTable, 0);
9290   if( rc ){
9291     releasePage(pPage);
9292     return rc;
9293   }
9294 
9295   *piMoved = 0;
9296 
9297 #ifdef SQLITE_OMIT_AUTOVACUUM
9298   freePage(pPage, &rc);
9299   releasePage(pPage);
9300 #else
9301   if( pBt->autoVacuum ){
9302     Pgno maxRootPgno;
9303     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
9304 
9305     if( iTable==maxRootPgno ){
9306       /* If the table being dropped is the table with the largest root-page
9307       ** number in the database, put the root page on the free list.
9308       */
9309       freePage(pPage, &rc);
9310       releasePage(pPage);
9311       if( rc!=SQLITE_OK ){
9312         return rc;
9313       }
9314     }else{
9315       /* The table being dropped does not have the largest root-page
9316       ** number in the database. So move the page that does into the
9317       ** gap left by the deleted root-page.
9318       */
9319       MemPage *pMove;
9320       releasePage(pPage);
9321       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
9322       if( rc!=SQLITE_OK ){
9323         return rc;
9324       }
9325       rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
9326       releasePage(pMove);
9327       if( rc!=SQLITE_OK ){
9328         return rc;
9329       }
9330       pMove = 0;
9331       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
9332       freePage(pMove, &rc);
9333       releasePage(pMove);
9334       if( rc!=SQLITE_OK ){
9335         return rc;
9336       }
9337       *piMoved = maxRootPgno;
9338     }
9339 
9340     /* Set the new 'max-root-page' value in the database header. This
9341     ** is the old value less one, less one more if that happens to
9342     ** be a root-page number, less one again if that is the
9343     ** PENDING_BYTE_PAGE.
9344     */
9345     maxRootPgno--;
9346     while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
9347            || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
9348       maxRootPgno--;
9349     }
9350     assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
9351 
9352     rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
9353   }else{
9354     freePage(pPage, &rc);
9355     releasePage(pPage);
9356   }
9357 #endif
9358   return rc;
9359 }
9360 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
9361   int rc;
9362   sqlite3BtreeEnter(p);
9363   rc = btreeDropTable(p, iTable, piMoved);
9364   sqlite3BtreeLeave(p);
9365   return rc;
9366 }
9367 
9368 
9369 /*
9370 ** This function may only be called if the b-tree connection already
9371 ** has a read or write transaction open on the database.
9372 **
9373 ** Read the meta-information out of a database file.  Meta[0]
9374 ** is the number of free pages currently in the database.  Meta[1]
9375 ** through meta[15] are available for use by higher layers.  Meta[0]
9376 ** is read-only, the others are read/write.
9377 **
9378 ** The schema layer numbers meta values differently.  At the schema
9379 ** layer (and the SetCookie and ReadCookie opcodes) the number of
9380 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
9381 **
9382 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
9383 ** of reading the value out of the header, it instead loads the "DataVersion"
9384 ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
9385 ** database file.  It is a number computed by the pager.  But its access
9386 ** pattern is the same as header meta values, and so it is convenient to
9387 ** read it from this routine.
9388 */
9389 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
9390   BtShared *pBt = p->pBt;
9391 
9392   sqlite3BtreeEnter(p);
9393   assert( p->inTrans>TRANS_NONE );
9394   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
9395   assert( pBt->pPage1 );
9396   assert( idx>=0 && idx<=15 );
9397 
9398   if( idx==BTREE_DATA_VERSION ){
9399     *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
9400   }else{
9401     *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
9402   }
9403 
9404   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
9405   ** database, mark the database as read-only.  */
9406 #ifdef SQLITE_OMIT_AUTOVACUUM
9407   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
9408     pBt->btsFlags |= BTS_READ_ONLY;
9409   }
9410 #endif
9411 
9412   sqlite3BtreeLeave(p);
9413 }
9414 
9415 /*
9416 ** Write meta-information back into the database.  Meta[0] is
9417 ** read-only and may not be written.
9418 */
9419 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
9420   BtShared *pBt = p->pBt;
9421   unsigned char *pP1;
9422   int rc;
9423   assert( idx>=1 && idx<=15 );
9424   sqlite3BtreeEnter(p);
9425   assert( p->inTrans==TRANS_WRITE );
9426   assert( pBt->pPage1!=0 );
9427   pP1 = pBt->pPage1->aData;
9428   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
9429   if( rc==SQLITE_OK ){
9430     put4byte(&pP1[36 + idx*4], iMeta);
9431 #ifndef SQLITE_OMIT_AUTOVACUUM
9432     if( idx==BTREE_INCR_VACUUM ){
9433       assert( pBt->autoVacuum || iMeta==0 );
9434       assert( iMeta==0 || iMeta==1 );
9435       pBt->incrVacuum = (u8)iMeta;
9436     }
9437 #endif
9438   }
9439   sqlite3BtreeLeave(p);
9440   return rc;
9441 }
9442 
9443 #ifndef SQLITE_OMIT_BTREECOUNT
9444 /*
9445 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
9446 ** number of entries in the b-tree and write the result to *pnEntry.
9447 **
9448 ** SQLITE_OK is returned if the operation is successfully executed.
9449 ** Otherwise, if an error is encountered (i.e. an IO error or database
9450 ** corruption) an SQLite error code is returned.
9451 */
9452 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
9453   i64 nEntry = 0;                      /* Value to return in *pnEntry */
9454   int rc;                              /* Return code */
9455 
9456   rc = moveToRoot(pCur);
9457   if( rc==SQLITE_EMPTY ){
9458     *pnEntry = 0;
9459     return SQLITE_OK;
9460   }
9461 
9462   /* Unless an error occurs, the following loop runs one iteration for each
9463   ** page in the B-Tree structure (not including overflow pages).
9464   */
9465   while( rc==SQLITE_OK ){
9466     int iIdx;                          /* Index of child node in parent */
9467     MemPage *pPage;                    /* Current page of the b-tree */
9468 
9469     /* If this is a leaf page or the tree is not an int-key tree, then
9470     ** this page contains countable entries. Increment the entry counter
9471     ** accordingly.
9472     */
9473     pPage = pCur->pPage;
9474     if( pPage->leaf || !pPage->intKey ){
9475       nEntry += pPage->nCell;
9476     }
9477 
9478     /* pPage is a leaf node. This loop navigates the cursor so that it
9479     ** points to the first interior cell that it points to the parent of
9480     ** the next page in the tree that has not yet been visited. The
9481     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
9482     ** of the page, or to the number of cells in the page if the next page
9483     ** to visit is the right-child of its parent.
9484     **
9485     ** If all pages in the tree have been visited, return SQLITE_OK to the
9486     ** caller.
9487     */
9488     if( pPage->leaf ){
9489       do {
9490         if( pCur->iPage==0 ){
9491           /* All pages of the b-tree have been visited. Return successfully. */
9492           *pnEntry = nEntry;
9493           return moveToRoot(pCur);
9494         }
9495         moveToParent(pCur);
9496       }while ( pCur->ix>=pCur->pPage->nCell );
9497 
9498       pCur->ix++;
9499       pPage = pCur->pPage;
9500     }
9501 
9502     /* Descend to the child node of the cell that the cursor currently
9503     ** points at. This is the right-child if (iIdx==pPage->nCell).
9504     */
9505     iIdx = pCur->ix;
9506     if( iIdx==pPage->nCell ){
9507       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
9508     }else{
9509       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
9510     }
9511   }
9512 
9513   /* An error has occurred. Return an error code. */
9514   return rc;
9515 }
9516 #endif
9517 
9518 /*
9519 ** Return the pager associated with a BTree.  This routine is used for
9520 ** testing and debugging only.
9521 */
9522 Pager *sqlite3BtreePager(Btree *p){
9523   return p->pBt->pPager;
9524 }
9525 
9526 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9527 /*
9528 ** Append a message to the error message string.
9529 */
9530 static void checkAppendMsg(
9531   IntegrityCk *pCheck,
9532   const char *zFormat,
9533   ...
9534 ){
9535   va_list ap;
9536   if( !pCheck->mxErr ) return;
9537   pCheck->mxErr--;
9538   pCheck->nErr++;
9539   va_start(ap, zFormat);
9540   if( pCheck->errMsg.nChar ){
9541     sqlite3_str_append(&pCheck->errMsg, "\n", 1);
9542   }
9543   if( pCheck->zPfx ){
9544     sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
9545   }
9546   sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap);
9547   va_end(ap);
9548   if( pCheck->errMsg.accError==SQLITE_NOMEM ){
9549     pCheck->mallocFailed = 1;
9550   }
9551 }
9552 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9553 
9554 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9555 
9556 /*
9557 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
9558 ** corresponds to page iPg is already set.
9559 */
9560 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
9561   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
9562   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
9563 }
9564 
9565 /*
9566 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
9567 */
9568 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
9569   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
9570   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
9571 }
9572 
9573 
9574 /*
9575 ** Add 1 to the reference count for page iPage.  If this is the second
9576 ** reference to the page, add an error message to pCheck->zErrMsg.
9577 ** Return 1 if there are 2 or more references to the page and 0 if
9578 ** if this is the first reference to the page.
9579 **
9580 ** Also check that the page number is in bounds.
9581 */
9582 static int checkRef(IntegrityCk *pCheck, Pgno iPage){
9583   if( iPage>pCheck->nPage || iPage==0 ){
9584     checkAppendMsg(pCheck, "invalid page number %d", iPage);
9585     return 1;
9586   }
9587   if( getPageReferenced(pCheck, iPage) ){
9588     checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
9589     return 1;
9590   }
9591   setPageReferenced(pCheck, iPage);
9592   return 0;
9593 }
9594 
9595 #ifndef SQLITE_OMIT_AUTOVACUUM
9596 /*
9597 ** Check that the entry in the pointer-map for page iChild maps to
9598 ** page iParent, pointer type ptrType. If not, append an error message
9599 ** to pCheck.
9600 */
9601 static void checkPtrmap(
9602   IntegrityCk *pCheck,   /* Integrity check context */
9603   Pgno iChild,           /* Child page number */
9604   u8 eType,              /* Expected pointer map type */
9605   Pgno iParent           /* Expected pointer map parent page number */
9606 ){
9607   int rc;
9608   u8 ePtrmapType;
9609   Pgno iPtrmapParent;
9610 
9611   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
9612   if( rc!=SQLITE_OK ){
9613     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
9614     checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
9615     return;
9616   }
9617 
9618   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
9619     checkAppendMsg(pCheck,
9620       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
9621       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
9622   }
9623 }
9624 #endif
9625 
9626 /*
9627 ** Check the integrity of the freelist or of an overflow page list.
9628 ** Verify that the number of pages on the list is N.
9629 */
9630 static void checkList(
9631   IntegrityCk *pCheck,  /* Integrity checking context */
9632   int isFreeList,       /* True for a freelist.  False for overflow page list */
9633   int iPage,            /* Page number for first page in the list */
9634   u32 N                 /* Expected number of pages in the list */
9635 ){
9636   int i;
9637   u32 expected = N;
9638   int nErrAtStart = pCheck->nErr;
9639   while( iPage!=0 && pCheck->mxErr ){
9640     DbPage *pOvflPage;
9641     unsigned char *pOvflData;
9642     if( checkRef(pCheck, iPage) ) break;
9643     N--;
9644     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
9645       checkAppendMsg(pCheck, "failed to get page %d", iPage);
9646       break;
9647     }
9648     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
9649     if( isFreeList ){
9650       u32 n = (u32)get4byte(&pOvflData[4]);
9651 #ifndef SQLITE_OMIT_AUTOVACUUM
9652       if( pCheck->pBt->autoVacuum ){
9653         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
9654       }
9655 #endif
9656       if( n>pCheck->pBt->usableSize/4-2 ){
9657         checkAppendMsg(pCheck,
9658            "freelist leaf count too big on page %d", iPage);
9659         N--;
9660       }else{
9661         for(i=0; i<(int)n; i++){
9662           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
9663 #ifndef SQLITE_OMIT_AUTOVACUUM
9664           if( pCheck->pBt->autoVacuum ){
9665             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
9666           }
9667 #endif
9668           checkRef(pCheck, iFreePage);
9669         }
9670         N -= n;
9671       }
9672     }
9673 #ifndef SQLITE_OMIT_AUTOVACUUM
9674     else{
9675       /* If this database supports auto-vacuum and iPage is not the last
9676       ** page in this overflow list, check that the pointer-map entry for
9677       ** the following page matches iPage.
9678       */
9679       if( pCheck->pBt->autoVacuum && N>0 ){
9680         i = get4byte(pOvflData);
9681         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
9682       }
9683     }
9684 #endif
9685     iPage = get4byte(pOvflData);
9686     sqlite3PagerUnref(pOvflPage);
9687   }
9688   if( N && nErrAtStart==pCheck->nErr ){
9689     checkAppendMsg(pCheck,
9690       "%s is %d but should be %d",
9691       isFreeList ? "size" : "overflow list length",
9692       expected-N, expected);
9693   }
9694 }
9695 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9696 
/*
** A small binary min-heap of u32 values stored in a plain array.
**
** aHeap[0] holds the number of elements currently on the heap and
** aHeap[1] is the root.  The two daughters of aHeap[N] are aHeap[N*2]
** and aHeap[N*2+1].
**
** Heap property:  every node is less than or equal to both of its
** daughters.  It follows that the root aHeap[1] is always the smallest
** value on the heap.
**
** btreeHeapInsert() adds an unsigned 32-bit value to the heap and
** btreeHeapPull() removes the root (the minimum); both restore the heap
** property before returning.
**
** The heap is used for cell overlap and coverage testing.  Each u32
** entry describes the span of one cell or freeblock on a btree page:
** the upper 16 bits are the index of the first byte of the span and the
** lower 16 bits are the index of its last byte.
**
** btreeHeapInsert(): the new value x notionally starts in the slot just
** past the old end of the heap.  Parents larger than x are moved down
** into the vacant slot until the right position for x is found.
*/
static void btreeHeapInsert(u32 *aHeap, u32 x){
  u32 iSlot = ++aHeap[0];    /* Vacant slot that will eventually hold x */
  u32 iUp;                   /* Parent of iSlot */
  while( (iUp = iSlot>>1)>0 && aHeap[iUp]>x ){
    aHeap[iSlot] = aHeap[iUp];
    iSlot = iUp;
  }
  aHeap[iSlot] = x;
}
/*
** Remove the smallest value from the min-heap aHeap[] (layout: aHeap[0]
** is the element count, aHeap[1] is the root) and store it in *pOut.
** Return 1 if a value was removed, or 0 (leaving *pOut untouched) if the
** heap was empty.
**
** The last element is moved to the root and then sifted down.  The slot
** it vacated is overwritten with 0xffffffff so that the daughter
** comparison below may safely look one slot past the end of the heap.
*/
static int btreeHeapPull(u32 *aHeap, u32 *pOut){
  u32 nOld = aHeap[0];       /* Element count on entry */
  u32 iHole;                 /* Slot currently being filled */
  u32 iKid;                  /* Smaller daughter of iHole */
  u32 v;                     /* Value being sifted down */

  if( nOld==0 ) return 0;
  *pOut = aHeap[1];
  aHeap[1] = aHeap[nOld];
  aHeap[nOld] = 0xffffffff;
  aHeap[0] = nOld - 1;

  v = aHeap[1];
  for(iHole=1; (iKid = iHole*2)<=aHeap[0]; iHole=iKid){
    if( aHeap[iKid]>aHeap[iKid+1] ) iKid++;
    if( v<aHeap[iKid] ) break;
    aHeap[iHole] = aHeap[iKid];
  }
  aHeap[iHole] = v;
  return 1;
}
9747 
9748 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** Do various sanity checks on a single page of a tree.
**
** The return value is the depth of the subtree rooted at iPage:  leaf
** pages return 0, parents of leaf pages return 1, and so forth.  0 is
** also returned when iPage is 0, when checkRef() rejects the page, or
** when the page cannot be fetched or initialized.
**
** These checks are done:
**
**      1.  Make sure that cells and freeblocks do not overlap
**          but combine to completely cover the page.
**      2.  Make sure integer cell keys are in order.
**      3.  Check the integrity of overflow pages.
**      4.  Recursively call checkTreePage on all children.
**      5.  Verify that the depth of all children is the same.
**
** Children and cells are visited from right to left: first the
** right-child pointer, then the cells from the last to the first.  maxKey
** is lowered to each integer key as it is seen, so on intKey pages every
** key must be smaller than the one visited before it.
**
** *piMinKey is written only when the page was successfully loaded and
** initialized; on the early-exit paths it is left untouched.
**
** pCheck->zPfx, pCheck->v1 and pCheck->v2 are changed while this routine
** runs (they supply the message prefix used by checkAppendMsg()) and are
** restored to their entry values before returning.
*/
static int checkTreePage(
  IntegrityCk *pCheck,  /* Context for the sanity check */
  int iPage,            /* Page number of the page to check */
  i64 *piMinKey,        /* Write minimum integer primary key here */
  i64 maxKey            /* Error if integer primary key greater than this */
){
  MemPage *pPage = 0;      /* The page being analyzed */
  int i;                   /* Loop counter */
  int rc;                  /* Result code from subroutine call */
  int depth = -1, d2;      /* Depth of a subtree */
  int pgno;                /* Page number */
  int nFrag;               /* Number of fragmented bytes on the page */
  int hdr;                 /* Offset to the page header */
  int cellStart;           /* Offset to the start of the cell pointer array */
  int nCell;               /* Number of cells */
  int doCoverageCheck = 1; /* True if cell coverage checking should be done */
  int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
                           ** False if IPK must be strictly less than maxKey */
  u8 *data;                /* Page content */
  u8 *pCell;               /* Cell content */
  u8 *pCellIdx;            /* Next element of the cell pointer array */
  BtShared *pBt;           /* The BtShared object that owns pPage */
  u32 pc;                  /* Address of a cell */
  u32 usableSize;          /* Usable size of the page */
  u32 contentOffset;       /* Offset to the start of the cell content area */
  u32 *heap = 0;           /* Min-heap used for checking cell coverage */
  u32 x, prev = 0;         /* Next and previous entry on the min-heap */
  const char *saved_zPfx = pCheck->zPfx;  /* Prefix state to restore on exit */
  int saved_v1 = pCheck->v1;
  int saved_v2 = pCheck->v2;
  u8 savedIsInit = 0;      /* Value of pPage->isInit on entry */

  /* Check that the page exists
  */
  pBt = pCheck->pBt;
  usableSize = pBt->usableSize;
  if( iPage==0 ) return 0;
  if( checkRef(pCheck, iPage) ) return 0;
  pCheck->zPfx = "Page %d: ";
  pCheck->v1 = iPage;
  if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
    checkAppendMsg(pCheck,
       "unable to get the page. error code=%d", rc);
    goto end_of_check;
  }

  /* Clear MemPage.isInit to make sure the corruption detection code in
  ** btreeInitPage() is executed.  */
  savedIsInit = pPage->isInit;
  pPage->isInit = 0;
  if( (rc = btreeInitPage(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
    checkAppendMsg(pCheck,
                   "btreeInitPage() returns error code %d", rc);
    goto end_of_check;
  }
  if( (rc = btreeComputeFreeSpace(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );
    /* NOTE(review): rc is passed here but the format string contains no
    ** conversion for it, so the argument is unused. */
    checkAppendMsg(pCheck, "free space corruption", rc);
    goto end_of_check;
  }
  data = pPage->aData;
  hdr = pPage->hdrOffset;

  /* Set up for cell analysis */
  pCheck->zPfx = "On tree page %d cell %d: ";
  contentOffset = get2byteNotZero(&data[hdr+5]);
  assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */

  /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
  ** number of cells on the page. */
  nCell = get2byte(&data[hdr+3]);
  assert( pPage->nCell==nCell );

  /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
  ** immediately follows the b-tree page header. */
  cellStart = hdr + 12 - 4*pPage->leaf;
  assert( pPage->aCellIdx==&data[cellStart] );
  /* Start at the last entry of the cell pointer array; the loop below
  ** walks it backwards. */
  pCellIdx = &data[cellStart + 2*(nCell-1)];

  if( !pPage->leaf ){
    /* Analyze the right-child page of internal pages */
    pgno = get4byte(&data[hdr+8]);
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum ){
      pCheck->zPfx = "On page %d at right child: ";
      checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
    }
#endif
    depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
    keyCanBeEqual = 0;
  }else{
    /* For leaf pages, the coverage check will occur in the same loop
    ** as the other cell checks, so initialize the heap.  */
    heap = pCheck->heap;
    heap[0] = 0;
  }

  /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
  ** integer offsets to the cell contents. */
  for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
    CellInfo info;

    /* Check cell size */
    pCheck->v2 = i;
    assert( pCellIdx==&data[cellStart + i*2] );
    pc = get2byteAligned(pCellIdx);
    pCellIdx -= 2;
    if( pc<contentOffset || pc>usableSize-4 ){
      checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
                             pc, contentOffset, usableSize-4);
      doCoverageCheck = 0;
      continue;
    }
    pCell = &data[pc];
    pPage->xParseCell(pPage, pCell, &info);
    if( pc+info.nSize>usableSize ){
      checkAppendMsg(pCheck, "Extends off end of page");
      doCoverageCheck = 0;
      continue;
    }

    /* Check for integer primary key out of range */
    if( pPage->intKey ){
      if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
        checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
      }
      maxKey = info.nKey;
      keyCanBeEqual = 0;     /* Only the first key on the page may ==maxKey */
    }

    /* Check the content overflow list */
    if( info.nPayload>info.nLocal ){
      u32 nPage;       /* Number of pages on the overflow chain */
      Pgno pgnoOvfl;   /* First page of the overflow chain */
      assert( pc + info.nSize - 4 <= usableSize );
      /* Each overflow page carries usableSize-4 bytes of payload; round
      ** the non-local byte count up to a whole number of pages. */
      nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
      pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
      }
#endif
      checkList(pCheck, 0, pgnoOvfl, nPage);
    }

    if( !pPage->leaf ){
      /* Check sanity of left child page for internal pages */
      pgno = get4byte(pCell);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
      }
#endif
      d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
      keyCanBeEqual = 0;
      if( d2!=depth ){
        checkAppendMsg(pCheck, "Child page depth differs");
        depth = d2;
      }
    }else{
      /* Populate the coverage-checking heap for leaf pages */
      btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
    }
  }
  *piMinKey = maxKey;

  /* Check for complete coverage of the page
  */
  pCheck->zPfx = 0;
  if( doCoverageCheck && pCheck->mxErr>0 ){
    /* For leaf pages, the min-heap has already been initialized and the
    ** cells have already been inserted.  But for internal pages, that has
    ** not yet been done, so do it now */
    if( !pPage->leaf ){
      heap = pCheck->heap;
      heap[0] = 0;
      for(i=nCell-1; i>=0; i--){
        u32 size;
        pc = get2byteAligned(&data[cellStart+i*2]);
        size = pPage->xCellSize(pPage, &data[pc]);
        btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
      }
    }
    /* Add the freeblocks to the min-heap
    **
    ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
    ** is the offset of the first freeblock, or zero if there are no
    ** freeblocks on the page.
    */
    i = get2byte(&data[hdr+1]);
    while( i>0 ){
      int size, j;
      assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
      size = get2byte(&data[i+2]);
      assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */
      btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
      /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
      ** big-endian integer which is the offset in the b-tree page of the next
      ** freeblock in the chain, or zero if the freeblock is the last on the
      ** chain. */
      j = get2byte(&data[i]);
      /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
      ** increasing offset. */
      assert( j==0 || j>i+size );     /* Enforced by btreeComputeFreeSpace() */
      assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
      i = j;
    }
    /* Analyze the min-heap looking for overlap between cells and/or
    ** freeblocks, and counting the number of untracked bytes in nFrag.
    **
    ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
    ** There is an implied first entry that covers the page header, the cell
    ** pointer index, and the gap between the cell pointer index and the start
    ** of cell content.
    **
    ** The loop below pulls entries from the min-heap in order and compares
    ** the start_address against the previous end_address.  If there is an
    ** overlap, that means bytes are used multiple times.  If there is a gap,
    ** that gap is added to the fragmentation count.
    */
    nFrag = 0;
    prev = contentOffset - 1;   /* Implied first min-heap entry */
    while( btreeHeapPull(heap,&x) ){
      if( (prev&0xffff)>=(x>>16) ){
        checkAppendMsg(pCheck,
          "Multiple uses for byte %u of page %d", x>>16, iPage);
        break;
      }else{
        nFrag += (x>>16) - (prev&0xffff) - 1;
        prev = x;
      }
    }
    /* Bytes between the end of the last span and the end of the usable
    ** area also count as fragmentation. */
    nFrag += usableSize - (prev&0xffff) - 1;
    /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
    ** is stored in the fifth field of the b-tree page header.
    ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
    ** number of fragmented free bytes within the cell content area.
    **
    ** heap[0]==0 means the loop above drained the heap, i.e. it did not
    ** stop early because of an overlap.
    */
    if( heap[0]==0 && nFrag!=data[hdr+7] ){
      checkAppendMsg(pCheck,
          "Fragmentation of %d bytes reported as %d on page %d",
          nFrag, data[hdr+7], iPage);
    }
  }

end_of_check:
  /* doCoverageCheck is cleared only when a cell failed its bounds checks
  ** in the loop above (so pPage is non-NULL here).  In that case put back
  ** the isInit value the page had on entry. */
  if( !doCoverageCheck ) pPage->isInit = savedIsInit;
  releasePage(pPage);
  pCheck->zPfx = saved_zPfx;
  pCheck->v1 = saved_v1;
  pCheck->v2 = saved_v2;
  return depth+1;
}
10017 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
10018 
10019 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** This routine does a complete check of the given BTree file.  aRoot[] is
** an array of page numbers where each page number is the root page of
** a table.  nRoot is the number of entries in aRoot.  Entries of aRoot[]
** that are zero are skipped.
**
** A read-only or read-write transaction must be opened before calling
** this function.
**
** Write the number of errors seen in *pnErr.  Except for some memory
** allocation errors, an error message held in memory obtained from
** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
** returned.  If a memory allocation error occurs, NULL is returned.
**
** At most mxErr error messages are recorded; checking stops early once
** that budget is used up.
*/
char *sqlite3BtreeIntegrityCheck(
  Btree *p,     /* The btree to be checked */
  int *aRoot,   /* An array of root pages numbers for individual trees */
  int nRoot,    /* Number of entries in aRoot[] */
  int mxErr,    /* Stop reporting errors after this many */
  int *pnErr    /* Write number of errors seen to this variable */
){
  Pgno i;
  IntegrityCk sCheck;
  BtShared *pBt = p->pBt;
  u64 savedDbFlags = pBt->db->flags;   /* Restored after the tree walk */
  char zErr[100];                      /* Initial space for the error text */
  VVA_ONLY( int nRef );

  sqlite3BtreeEnter(p);
  assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
  VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
  assert( nRef>=0 );
  sCheck.pBt = pBt;
  sCheck.pPager = pBt->pPager;
  sCheck.nPage = btreePagecount(sCheck.pBt);
  sCheck.mxErr = mxErr;
  sCheck.nErr = 0;
  sCheck.mallocFailed = 0;
  sCheck.zPfx = 0;
  sCheck.v1 = 0;
  sCheck.v2 = 0;
  sCheck.aPgRef = 0;
  sCheck.heap = 0;
  sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
  sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
  /* Nothing to check in a database with no pages */
  if( sCheck.nPage==0 ){
    goto integrity_ck_cleanup;
  }

  /* aPgRef[] is a bitmap with one bit per page number 0..nPage, see
  ** getPageReferenced() and setPageReferenced(). */
  sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
  if( !sCheck.aPgRef ){
    sCheck.mallocFailed = 1;
    goto integrity_ck_cleanup;
  }
  /* Scratch space for the min-heap used by checkTreePage() */
  sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
  if( sCheck.heap==0 ){
    sCheck.mallocFailed = 1;
    goto integrity_ck_cleanup;
  }

  /* Mark the pending-byte page as referenced up front so that the
  ** "never used" scan at the end does not report it. */
  i = PENDING_BYTE_PAGE(pBt);
  if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);

  /* Check the integrity of the freelist
  */
  sCheck.zPfx = "Main freelist: ";
  checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
            get4byte(&pBt->pPage1->aData[36]));
  sCheck.zPfx = 0;

  /* Check all the tables.
  */
#ifndef SQLITE_OMIT_AUTOVACUUM
  if( pBt->autoVacuum ){
    /* The largest root page in aRoot[] must agree with the 4-byte value
    ** stored at offset 52 of page 1. */
    int mx = 0;
    int mxInHdr;
    for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i];
    mxInHdr = get4byte(&pBt->pPage1->aData[52]);
    if( mx!=mxInHdr ){
      checkAppendMsg(&sCheck,
        "max rootpage (%d) disagrees with header (%d)",
        mx, mxInHdr
      );
    }
  }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){
    checkAppendMsg(&sCheck,
      "incremental_vacuum enabled with a max rootpage of zero"
    );
  }
#endif
  /* The SQLITE_CellSizeCk flag is turned off for the duration of the tree
  ** walk and restored from savedDbFlags afterwards. */
  testcase( pBt->db->flags & SQLITE_CellSizeCk );
  pBt->db->flags &= ~(u64)SQLITE_CellSizeCk;
  for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
    i64 notUsed;
    if( aRoot[i]==0 ) continue;
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum && aRoot[i]>1 ){
      checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
    }
#endif
    checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
  }
  pBt->db->flags = savedDbFlags;

  /* Make sure every page in the file is referenced
  */
  for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
#ifdef SQLITE_OMIT_AUTOVACUUM
    if( getPageReferenced(&sCheck, i)==0 ){
      checkAppendMsg(&sCheck, "Page %d is never used", i);
    }
#else
    /* If the database supports auto-vacuum, make sure no tables contain
    ** references to pointer-map pages.
    */
    if( getPageReferenced(&sCheck, i)==0 &&
       (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
      checkAppendMsg(&sCheck, "Page %d is never used", i);
    }
    if( getPageReferenced(&sCheck, i)!=0 &&
       (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
      checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
    }
#endif
  }

  /* Clean up and report errors.
  */
integrity_ck_cleanup:
  sqlite3PageFree(sCheck.heap);
  sqlite3_free(sCheck.aPgRef);
  /* An allocation failure discards any accumulated text and is itself
  ** counted as one error. */
  if( sCheck.mallocFailed ){
    sqlite3_str_reset(&sCheck.errMsg);
    sCheck.nErr++;
  }
  *pnErr = sCheck.nErr;
  if( sCheck.nErr==0 ) sqlite3_str_reset(&sCheck.errMsg);
  /* Make sure this analysis did not leave any unref() pages. */
  assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
  sqlite3BtreeLeave(p);
  return sqlite3StrAccumFinish(&sCheck.errMsg);
}
10161 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
10162 
10163 /*
10164 ** Return the full pathname of the underlying database file.  Return
10165 ** an empty string if the database is in-memory or a TEMP database.
10166 **
10167 ** The pager filename is invariant as long as the pager is
10168 ** open so it is safe to access without the BtShared mutex.
10169 */
10170 const char *sqlite3BtreeGetFilename(Btree *p){
10171   assert( p->pBt->pPager!=0 );
10172   return sqlite3PagerFilename(p->pBt->pPager, 1);
10173 }
10174 
10175 /*
10176 ** Return the pathname of the journal file for this database. The return
10177 ** value of this routine is the same regardless of whether the journal file
10178 ** has been created or not.
10179 **
10180 ** The pager journal filename is invariant as long as the pager is
10181 ** open so it is safe to access without the BtShared mutex.
10182 */
10183 const char *sqlite3BtreeGetJournalname(Btree *p){
10184   assert( p->pBt->pPager!=0 );
10185   return sqlite3PagerJournalname(p->pBt->pPager);
10186 }
10187 
10188 /*
10189 ** Return non-zero if a transaction is active.
10190 */
10191 int sqlite3BtreeIsInTrans(Btree *p){
10192   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
10193   return (p && (p->inTrans==TRANS_WRITE));
10194 }
10195 
10196 #ifndef SQLITE_OMIT_WAL
10197 /*
10198 ** Run a checkpoint on the Btree passed as the first argument.
10199 **
10200 ** Return SQLITE_LOCKED if this or any other connection has an open
10201 ** transaction on the shared-cache the argument Btree is connected to.
10202 **
10203 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
10204 */
10205 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
10206   int rc = SQLITE_OK;
10207   if( p ){
10208     BtShared *pBt = p->pBt;
10209     sqlite3BtreeEnter(p);
10210     if( pBt->inTransaction!=TRANS_NONE ){
10211       rc = SQLITE_LOCKED;
10212     }else{
10213       rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
10214     }
10215     sqlite3BtreeLeave(p);
10216   }
10217   return rc;
10218 }
10219 #endif
10220 
10221 /*
10222 ** Return non-zero if a read (or write) transaction is active.
10223 */
10224 int sqlite3BtreeIsInReadTrans(Btree *p){
10225   assert( p );
10226   assert( sqlite3_mutex_held(p->db->mutex) );
10227   return p->inTrans!=TRANS_NONE;
10228 }
10229 
10230 int sqlite3BtreeIsInBackup(Btree *p){
10231   assert( p );
10232   assert( sqlite3_mutex_held(p->db->mutex) );
10233   return p->nBackup!=0;
10234 }
10235 
10236 /*
10237 ** This function returns a pointer to a blob of memory associated with
10238 ** a single shared-btree. The memory is used by client code for its own
10239 ** purposes (for example, to store a high-level schema associated with
10240 ** the shared-btree). The btree layer manages reference counting issues.
10241 **
10242 ** The first time this is called on a shared-btree, nBytes bytes of memory
10243 ** are allocated, zeroed, and returned to the caller. For each subsequent
10244 ** call the nBytes parameter is ignored and a pointer to the same blob
10245 ** of memory returned.
10246 **
10247 ** If the nBytes parameter is 0 and the blob of memory has not yet been
10248 ** allocated, a null pointer is returned. If the blob has already been
10249 ** allocated, it is returned as normal.
10250 **
10251 ** Just before the shared-btree is closed, the function passed as the
10252 ** xFree argument when the memory allocation was made is invoked on the
10253 ** blob of allocated memory. The xFree function should not call sqlite3_free()
10254 ** on the memory, the btree layer does that.
10255 */
10256 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
10257   BtShared *pBt = p->pBt;
10258   sqlite3BtreeEnter(p);
10259   if( !pBt->pSchema && nBytes ){
10260     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
10261     pBt->xFreeSchema = xFree;
10262   }
10263   sqlite3BtreeLeave(p);
10264   return pBt->pSchema;
10265 }
10266 
10267 /*
10268 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
10269 ** btree as the argument handle holds an exclusive lock on the
10270 ** sqlite_master table. Otherwise SQLITE_OK.
10271 */
10272 int sqlite3BtreeSchemaLocked(Btree *p){
10273   int rc;
10274   assert( sqlite3_mutex_held(p->db->mutex) );
10275   sqlite3BtreeEnter(p);
10276   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
10277   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
10278   sqlite3BtreeLeave(p);
10279   return rc;
10280 }
10281 
10282 
10283 #ifndef SQLITE_OMIT_SHARED_CACHE
10284 /*
10285 ** Obtain a lock on the table whose root page is iTab.  The
10286 ** lock is a write lock if isWritelock is true or a read lock
10287 ** if it is false.
10288 */
10289 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
10290   int rc = SQLITE_OK;
10291   assert( p->inTrans!=TRANS_NONE );
10292   if( p->sharable ){
10293     u8 lockType = READ_LOCK + isWriteLock;
10294     assert( READ_LOCK+1==WRITE_LOCK );
10295     assert( isWriteLock==0 || isWriteLock==1 );
10296 
10297     sqlite3BtreeEnter(p);
10298     rc = querySharedCacheTableLock(p, iTab, lockType);
10299     if( rc==SQLITE_OK ){
10300       rc = setSharedCacheTableLock(p, iTab, lockType);
10301     }
10302     sqlite3BtreeLeave(p);
10303   }
10304   return rc;
10305 }
10306 #endif
10307 
10308 #ifndef SQLITE_OMIT_INCRBLOB
10309 /*
10310 ** Argument pCsr must be a cursor opened for writing on an
10311 ** INTKEY table currently pointing at a valid table entry.
10312 ** This function modifies the data stored as part of that entry.
10313 **
10314 ** Only the data content may only be modified, it is not possible to
10315 ** change the length of the data stored. If this function is called with
10316 ** parameters that attempt to write past the end of the existing data,
10317 ** no modifications are made and SQLITE_CORRUPT is returned.
10318 */
10319 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
10320   int rc;
10321   assert( cursorOwnsBtShared(pCsr) );
10322   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
10323   assert( pCsr->curFlags & BTCF_Incrblob );
10324 
10325   rc = restoreCursorPosition(pCsr);
10326   if( rc!=SQLITE_OK ){
10327     return rc;
10328   }
10329   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
10330   if( pCsr->eState!=CURSOR_VALID ){
10331     return SQLITE_ABORT;
10332   }
10333 
10334   /* Save the positions of all other cursors open on this table. This is
10335   ** required in case any of them are holding references to an xFetch
10336   ** version of the b-tree page modified by the accessPayload call below.
10337   **
10338   ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
10339   ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
10340   ** saveAllCursors can only return SQLITE_OK.
10341   */
10342   VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
10343   assert( rc==SQLITE_OK );
10344 
10345   /* Check some assumptions:
10346   **   (a) the cursor is open for writing,
10347   **   (b) there is a read/write transaction open,
10348   **   (c) the connection holds a write-lock on the table (if required),
10349   **   (d) there are no conflicting read-locks, and
10350   **   (e) the cursor points at a valid row of an intKey table.
10351   */
10352   if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
10353     return SQLITE_READONLY;
10354   }
10355   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
10356               && pCsr->pBt->inTransaction==TRANS_WRITE );
10357   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
10358   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
10359   assert( pCsr->pPage->intKey );
10360 
10361   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
10362 }
10363 
10364 /*
10365 ** Mark this cursor as an incremental blob cursor.
10366 */
10367 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
10368   pCur->curFlags |= BTCF_Incrblob;
10369   pCur->pBtree->hasIncrblobCur = 1;
10370 }
10371 #endif
10372 
/*
** Set both the "write version" (single byte at byte offset 18) and
** "read version" (single byte at byte offset 19) fields in the database
** header to iVersion.
*/
10378 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
10379   BtShared *pBt = pBtree->pBt;
10380   int rc;                         /* Return code */
10381 
10382   assert( iVersion==1 || iVersion==2 );
10383 
10384   /* If setting the version fields to 1, do not automatically open the
10385   ** WAL connection, even if the version fields are currently set to 2.
10386   */
10387   pBt->btsFlags &= ~BTS_NO_WAL;
10388   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
10389 
10390   rc = sqlite3BtreeBeginTrans(pBtree, 0, 0);
10391   if( rc==SQLITE_OK ){
10392     u8 *aData = pBt->pPage1->aData;
10393     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
10394       rc = sqlite3BtreeBeginTrans(pBtree, 2, 0);
10395       if( rc==SQLITE_OK ){
10396         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
10397         if( rc==SQLITE_OK ){
10398           aData[18] = (u8)iVersion;
10399           aData[19] = (u8)iVersion;
10400         }
10401       }
10402     }
10403   }
10404 
10405   pBt->btsFlags &= ~BTS_NO_WAL;
10406   return rc;
10407 }
10408 
10409 /*
10410 ** Return true if the cursor has a hint specified.  This routine is
10411 ** only used from within assert() statements
10412 */
10413 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
10414   return (pCsr->hints & mask)!=0;
10415 }
10416 
10417 /*
10418 ** Return true if the given Btree is read-only.
10419 */
10420 int sqlite3BtreeIsReadonly(Btree *p){
10421   return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
10422 }
10423 
10424 /*
10425 ** Return the size of the header added to each page by this module.
10426 */
10427 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
10428 
10429 #if !defined(SQLITE_OMIT_SHARED_CACHE)
10430 /*
10431 ** Return true if the Btree passed as the only argument is sharable.
10432 */
10433 int sqlite3BtreeSharable(Btree *p){
10434   return p->sharable;
10435 }
10436 
10437 /*
10438 ** Return the number of connections to the BtShared object accessed by
10439 ** the Btree handle passed as the only argument. For private caches
10440 ** this is always 1. For shared caches it may be 1 or greater.
10441 */
10442 int sqlite3BtreeConnectionCount(Btree *p){
10443   testcase( p->sharable );
10444   return p->pBt->nRef;
10445 }
10446 #endif
10447