xref: /sqlite-3.40.0/src/btree.c (revision 87f500ce)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements an external (disk-based) database using BTrees.
13 ** See the header comment on "btreeInt.h" for additional information.
14 ** Including a description of file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
/*
** The header string that appears at the beginning of every
** SQLite database.  The text itself comes from the SQLITE_FILE_HEADER
** macro; sizeof(zMagicHeader) therefore includes the terminating NUL
** of that string literal.
*/
static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
/*
** Set this global variable to 1 to enable tracing using the TRACE
** macro.
**
** Tracing is compiled out by default.  Change the "#if 0" below to
** "#if 1" to obtain a TRACE(X) that passes its parenthesized argument
** list X to printf() and then flushes stdout whenever sqlite3BtreeTrace
** is non-zero.  In the default configuration TRACE(X) expands to nothing.
*/
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
#endif
34 
/*
** Read the 2-byte big-endian integer found at X, but report a stored
** value of zero as 65536.  Every non-zero value is returned unchanged.
**
** The "offset to cell content area" field in the header of a btree page
** needs this treatment.  When the page size is 65536 and the page is
** empty, the true offset is 65536, which cannot be represented in two
** bytes and is therefore stored as zero.
**
** Implementation: adding 0xffff and masking to 16 bits computes
** (value-1) modulo 65536, which maps 0 to 65535 and N to N-1; the
** final +1 then yields 65536 and N respectively.
*/
#define get2byteNotZero(X)  (((((int)get2byte(X))+0xffff)&0xffff)+1)
45 
/*
** Values passed as the 5th argument to allocateBtreePage().
**
** Each value selects how strictly the allocation is tied to the page
** number supplied alongside it; see the per-value comments below.
*/
#define BTALLOC_ANY   0           /* Allocate any page */
#define BTALLOC_EXACT 1           /* Allocate exact page if possible */
#define BTALLOC_LE    2           /* Allocate any page <= the parameter */
52 
/*
** IfNotOmitAV(x) evaluates to (x) in builds that include auto-vacuum
** support, and to the constant 0 in builds compiled with
** SQLITE_OMIT_AUTOVACUUM (in which case x is not evaluated at all).
** Typical use:
**
**   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
*/
#ifdef SQLITE_OMIT_AUTOVACUUM
# define IfNotOmitAV(expr) 0
#else
# define IfNotOmitAV(expr) (expr)
#endif
64 
#ifndef SQLITE_OMIT_SHARED_CACHE
/*
** A list of BtShared objects that are eligible for participation
** in shared cache.  This variable has file scope during normal builds,
** but the test harness needs to access it so we make it global for
** test builds (those compiled with SQLITE_TEST).
**
** The list starts out empty (NULL).  The variable does not exist at
** all when SQLITE_OMIT_SHARED_CACHE is defined.
**
** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
*/
#ifdef SQLITE_TEST
BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#else
static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#endif
#endif /* SQLITE_OMIT_SHARED_CACHE */
80 
81 #ifndef SQLITE_OMIT_SHARED_CACHE
/*
** Enable or disable the shared pager and schema features.
**
** The value of "enable" is stored verbatim in
** sqlite3GlobalConfig.sharedCacheEnabled.  This routine always
** returns SQLITE_OK.
**
** This routine has no effect on existing database connections.
** The shared cache setting affects only future calls to
** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
*/
int sqlite3_enable_shared_cache(int enable){
  sqlite3GlobalConfig.sharedCacheEnabled = enable;
  return SQLITE_OK;
}
93 #endif
94 
95 
96 
#ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
  ** clearAllSharedCacheTableLocks() and downgradeAllSharedCacheTableLocks()
  ** manipulate entries in the BtShared.pLock linked list used to store
  ** shared-cache table level locks. If the library is compiled with the
  ** shared-cache feature disabled, then there is only ever one user
  ** of each BtShared structure and so this locking is not necessary.
  ** So define the lock related functions as no-ops.
  **
  ** The two query/set macros always report success (SQLITE_OK), the
  ** clear/downgrade macros expand to nothing, and the two helpers that
  ** are meant for use inside assert() report "lock is held"
  ** (hasSharedCacheTableLock -> 1) and "no conflict"
  ** (hasReadConflicts -> 0) respectively.
  */
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
#endif
114 
115 #ifndef SQLITE_OMIT_SHARED_CACHE
116 
117 #ifdef SQLITE_DEBUG
118 /*
119 **** This function is only used as part of an assert() statement. ***
120 **
121 ** Check to see if pBtree holds the required locks to read or write to the
122 ** table with root page iRoot.   Return 1 if it does and 0 if not.
123 **
124 ** For example, when writing to a table with root-page iRoot via
125 ** Btree connection pBtree:
126 **
127 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
128 **
129 ** When writing to an index that resides in a sharable database, the
130 ** caller should have first obtained a lock specifying the root page of
131 ** the corresponding table. This makes things a bit more complicated,
132 ** as this module treats each table as a separate structure. To determine
133 ** the table corresponding to the index being written, this
134 ** function has to search through the database schema.
135 **
136 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
137 ** hold a write-lock on the schema table (root page 1). This is also
138 ** acceptable.
139 */
140 static int hasSharedCacheTableLock(
141   Btree *pBtree,         /* Handle that must hold lock */
142   Pgno iRoot,            /* Root page of b-tree */
143   int isIndex,           /* True if iRoot is the root of an index b-tree */
144   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
145 ){
146   Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
147   Pgno iTab = 0;
148   BtLock *pLock;
149 
150   /* If this database is not shareable, or if the client is reading
151   ** and has the read-uncommitted flag set, then no lock is required.
152   ** Return true immediately.
153   */
154   if( (pBtree->sharable==0)
155    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
156   ){
157     return 1;
158   }
159 
160   /* If the client is reading  or writing an index and the schema is
161   ** not loaded, then it is too difficult to actually check to see if
162   ** the correct locks are held.  So do not bother - just return true.
163   ** This case does not come up very often anyhow.
164   */
165   if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
166     return 1;
167   }
168 
169   /* Figure out the root-page that the lock should be held on. For table
170   ** b-trees, this is just the root page of the b-tree being read or
171   ** written. For index b-trees, it is the root page of the associated
172   ** table.  */
173   if( isIndex ){
174     HashElem *p;
175     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
176       Index *pIdx = (Index *)sqliteHashData(p);
177       if( pIdx->tnum==(int)iRoot ){
178         if( iTab ){
179           /* Two or more indexes share the same root page.  There must
180           ** be imposter tables.  So just return true.  The assert is not
181           ** useful in that case. */
182           return 1;
183         }
184         iTab = pIdx->pTable->tnum;
185       }
186     }
187   }else{
188     iTab = iRoot;
189   }
190 
191   /* Search for the required lock. Either a write-lock on root-page iTab, a
192   ** write-lock on the schema table, or (if the client is reading) a
193   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
194   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
195     if( pLock->pBtree==pBtree
196      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
197      && pLock->eLock>=eLockType
198     ){
199       return 1;
200     }
201   }
202 
203   /* Failed to find the required lock. */
204   return 0;
205 }
206 #endif /* SQLITE_DEBUG */
207 
208 #ifdef SQLITE_DEBUG
209 /*
210 **** This function may be used as part of assert() statements only. ****
211 **
212 ** Return true if it would be illegal for pBtree to write into the
213 ** table or index rooted at iRoot because other shared connections are
214 ** simultaneously reading that same table or index.
215 **
216 ** It is illegal for pBtree to write if some other Btree object that
217 ** shares the same BtShared object is currently reading or writing
218 ** the iRoot table.  Except, if the other Btree object has the
219 ** read-uncommitted flag set, then it is OK for the other object to
220 ** have a read cursor.
221 **
222 ** For example, before writing to any part of the table or index
223 ** rooted at page iRoot, one should call:
224 **
225 **    assert( !hasReadConflicts(pBtree, iRoot) );
226 */
227 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
228   BtCursor *p;
229   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
230     if( p->pgnoRoot==iRoot
231      && p->pBtree!=pBtree
232      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
233     ){
234       return 1;
235     }
236   }
237   return 0;
238 }
239 #endif    /* #ifdef SQLITE_DEBUG */
240 
241 /*
242 ** Query to see if Btree handle p may obtain a lock of type eLock
243 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
244 ** SQLITE_OK if the lock may be obtained (by calling
245 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
246 */
247 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
248   BtShared *pBt = p->pBt;
249   BtLock *pIter;
250 
251   assert( sqlite3BtreeHoldsMutex(p) );
252   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
253   assert( p->db!=0 );
254   assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
255 
256   /* If requesting a write-lock, then the Btree must have an open write
257   ** transaction on this file. And, obviously, for this to be so there
258   ** must be an open write transaction on the file itself.
259   */
260   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
261   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
262 
263   /* This routine is a no-op if the shared-cache is not enabled */
264   if( !p->sharable ){
265     return SQLITE_OK;
266   }
267 
268   /* If some other connection is holding an exclusive lock, the
269   ** requested lock may not be obtained.
270   */
271   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
272     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
273     return SQLITE_LOCKED_SHAREDCACHE;
274   }
275 
276   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
277     /* The condition (pIter->eLock!=eLock) in the following if(...)
278     ** statement is a simplification of:
279     **
280     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
281     **
282     ** since we know that if eLock==WRITE_LOCK, then no other connection
283     ** may hold a WRITE_LOCK on any table in this file (since there can
284     ** only be a single writer).
285     */
286     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
287     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
288     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
289       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
290       if( eLock==WRITE_LOCK ){
291         assert( p==pBt->pWriter );
292         pBt->btsFlags |= BTS_PENDING;
293       }
294       return SQLITE_LOCKED_SHAREDCACHE;
295     }
296   }
297   return SQLITE_OK;
298 }
299 #endif /* !SQLITE_OMIT_SHARED_CACHE */
300 
301 #ifndef SQLITE_OMIT_SHARED_CACHE
302 /*
303 ** Add a lock on the table with root-page iTable to the shared-btree used
304 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
305 ** WRITE_LOCK.
306 **
307 ** This function assumes the following:
308 **
309 **   (a) The specified Btree object p is connected to a sharable
310 **       database (one with the BtShared.sharable flag set), and
311 **
312 **   (b) No other Btree objects hold a lock that conflicts
313 **       with the requested lock (i.e. querySharedCacheTableLock() has
314 **       already been called and returned SQLITE_OK).
315 **
316 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
317 ** is returned if a malloc attempt fails.
318 */
319 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
320   BtShared *pBt = p->pBt;
321   BtLock *pLock = 0;
322   BtLock *pIter;
323 
324   assert( sqlite3BtreeHoldsMutex(p) );
325   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
326   assert( p->db!=0 );
327 
328   /* A connection with the read-uncommitted flag set will never try to
329   ** obtain a read-lock using this function. The only read-lock obtained
330   ** by a connection in read-uncommitted mode is on the sqlite_master
331   ** table, and that lock is obtained in BtreeBeginTrans().  */
332   assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
333 
334   /* This function should only be called on a sharable b-tree after it
335   ** has been determined that no other b-tree holds a conflicting lock.  */
336   assert( p->sharable );
337   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
338 
339   /* First search the list for an existing lock on this table. */
340   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
341     if( pIter->iTable==iTable && pIter->pBtree==p ){
342       pLock = pIter;
343       break;
344     }
345   }
346 
347   /* If the above search did not find a BtLock struct associating Btree p
348   ** with table iTable, allocate one and link it into the list.
349   */
350   if( !pLock ){
351     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
352     if( !pLock ){
353       return SQLITE_NOMEM_BKPT;
354     }
355     pLock->iTable = iTable;
356     pLock->pBtree = p;
357     pLock->pNext = pBt->pLock;
358     pBt->pLock = pLock;
359   }
360 
361   /* Set the BtLock.eLock variable to the maximum of the current lock
362   ** and the requested lock. This means if a write-lock was already held
363   ** and a read-lock requested, we don't incorrectly downgrade the lock.
364   */
365   assert( WRITE_LOCK>READ_LOCK );
366   if( eLock>pLock->eLock ){
367     pLock->eLock = eLock;
368   }
369 
370   return SQLITE_OK;
371 }
372 #endif /* !SQLITE_OMIT_SHARED_CACHE */
373 
374 #ifndef SQLITE_OMIT_SHARED_CACHE
375 /*
376 ** Release all the table locks (locks obtained via calls to
377 ** the setSharedCacheTableLock() procedure) held by Btree object p.
378 **
379 ** This function assumes that Btree p has an open read or write
380 ** transaction. If it does not, then the BTS_PENDING flag
381 ** may be incorrectly cleared.
382 */
383 static void clearAllSharedCacheTableLocks(Btree *p){
384   BtShared *pBt = p->pBt;
385   BtLock **ppIter = &pBt->pLock;
386 
387   assert( sqlite3BtreeHoldsMutex(p) );
388   assert( p->sharable || 0==*ppIter );
389   assert( p->inTrans>0 );
390 
391   while( *ppIter ){
392     BtLock *pLock = *ppIter;
393     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
394     assert( pLock->pBtree->inTrans>=pLock->eLock );
395     if( pLock->pBtree==p ){
396       *ppIter = pLock->pNext;
397       assert( pLock->iTable!=1 || pLock==&p->lock );
398       if( pLock->iTable!=1 ){
399         sqlite3_free(pLock);
400       }
401     }else{
402       ppIter = &pLock->pNext;
403     }
404   }
405 
406   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
407   if( pBt->pWriter==p ){
408     pBt->pWriter = 0;
409     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
410   }else if( pBt->nTransaction==2 ){
411     /* This function is called when Btree p is concluding its
412     ** transaction. If there currently exists a writer, and p is not
413     ** that writer, then the number of locks held by connections other
414     ** than the writer must be about to drop to zero. In this case
415     ** set the BTS_PENDING flag to 0.
416     **
417     ** If there is not currently a writer, then BTS_PENDING must
418     ** be zero already. So this next line is harmless in that case.
419     */
420     pBt->btsFlags &= ~BTS_PENDING;
421   }
422 }
423 
424 /*
425 ** This function changes all write-locks held by Btree p into read-locks.
426 */
427 static void downgradeAllSharedCacheTableLocks(Btree *p){
428   BtShared *pBt = p->pBt;
429   if( pBt->pWriter==p ){
430     BtLock *pLock;
431     pBt->pWriter = 0;
432     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
433     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
434       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
435       pLock->eLock = READ_LOCK;
436     }
437   }
438 }
439 
440 #endif /* SQLITE_OMIT_SHARED_CACHE */
441 
/* releasePage() is called by btreeReleaseAllCursorPages() further down,
** ahead of the point where the function itself is defined. */
static void releasePage(MemPage *pPage);  /* Forward reference */
443 
/*
***** This routine is used inside of assert() only ****
**
** Verify that the cursor holds the mutex on its BtShared.  The return
** value is whatever sqlite3_mutex_held() reports for p->pBt->mutex.
** Only compiled when SQLITE_DEBUG is defined.
*/
#ifdef SQLITE_DEBUG
static int cursorHoldsMutex(BtCursor *p){
  return sqlite3_mutex_held(p->pBt->mutex);
}
453 
/* Verify that the cursor and the BtShared agree about what is the current
** database connection. This is important in shared-cache mode. If the
** database connection pointers get out-of-sync, it is possible for routines
** like btreeInitPage() to reference a stale connection pointer that refers
** to a connection that has already closed.  This routine is used inside
** assert() statements only and for the purpose of double-checking that the
** btree code does keep the database connection pointers up-to-date.
**
** Returns non-zero when p->pBtree->db and p->pBt->db are the same pointer.
** The cursor must hold the BtShared mutex (asserted).
*/
static int cursorOwnsBtShared(BtCursor *p){
  assert( cursorHoldsMutex(p) );
  return (p->pBtree->db==p->pBt->db);
}
466 #endif
467 
/*
** Invalidate the overflow page-list cache of the cursor passed as the
** only argument, by clearing the BTCF_ValidOvfl bit in its curFlags.
**
** pCur may be any expression that yields a cursor pointer; it is
** parenthesized in the expansion so that arguments such as "p+1" or
** "&cur" bind correctly.  The macro is an expression whose value is
** the updated curFlags.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
473 
474 /*
475 ** Invalidate the overflow page-list cache for all cursors opened
476 ** on the shared btree structure pBt.
477 */
478 static void invalidateAllOverflowCache(BtShared *pBt){
479   BtCursor *p;
480   assert( sqlite3_mutex_held(pBt->mutex) );
481   for(p=pBt->pCursor; p; p=p->pNext){
482     invalidateOverflowCache(p);
483   }
484 }
485 
486 #ifndef SQLITE_OMIT_INCRBLOB
487 /*
488 ** This function is called before modifying the contents of a table
489 ** to invalidate any incrblob cursors that are open on the
490 ** row or one of the rows being modified.
491 **
492 ** If argument isClearTable is true, then the entire contents of the
493 ** table is about to be deleted. In this case invalidate all incrblob
494 ** cursors open on any row within the table with root-page pgnoRoot.
495 **
496 ** Otherwise, if argument isClearTable is false, then the row with
497 ** rowid iRow is being replaced or deleted. In this case invalidate
498 ** only those incrblob cursors open on that specific row.
499 */
500 static void invalidateIncrblobCursors(
501   Btree *pBtree,          /* The database file to check */
502   i64 iRow,               /* The rowid that might be changing */
503   int isClearTable        /* True if all rows are being deleted */
504 ){
505   BtCursor *p;
506   if( pBtree->hasIncrblobCur==0 ) return;
507   assert( sqlite3BtreeHoldsMutex(pBtree) );
508   pBtree->hasIncrblobCur = 0;
509   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
510     if( (p->curFlags & BTCF_Incrblob)!=0 ){
511       pBtree->hasIncrblobCur = 1;
512       if( isClearTable || p->info.nKey==iRow ){
513         p->eState = CURSOR_INVALID;
514       }
515     }
516   }
517 }
518 
519 #else
  /* Stub when INCRBLOB is omitted (SQLITE_OMIT_INCRBLOB defined): the
  ** call expands to nothing and its arguments are not evaluated. */
  #define invalidateIncrblobCursors(x,y,z)
522 #endif /* SQLITE_OMIT_INCRBLOB */
523 
524 /*
525 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
526 ** when a page that previously contained data becomes a free-list leaf
527 ** page.
528 **
529 ** The BtShared.pHasContent bitvec exists to work around an obscure
530 ** bug caused by the interaction of two useful IO optimizations surrounding
531 ** free-list leaf pages:
532 **
533 **   1) When all data is deleted from a page and the page becomes
534 **      a free-list leaf page, the page is not written to the database
535 **      (as free-list leaf pages contain no meaningful data). Sometimes
536 **      such a page is not even journalled (as it will not be modified,
537 **      why bother journalling it?).
538 **
539 **   2) When a free-list leaf page is reused, its content is not read
540 **      from the database or written to the journal file (why should it
541 **      be, if it is not at all meaningful?).
542 **
543 ** By themselves, these optimizations work fine and provide a handy
544 ** performance boost to bulk delete or insert operations. However, if
545 ** a page is moved to the free-list and then reused within the same
546 ** transaction, a problem comes up. If the page is not journalled when
547 ** it is moved to the free-list and it is also not journalled when it
548 ** is extracted from the free-list and reused, then the original data
549 ** may be lost. In the event of a rollback, it may not be possible
550 ** to restore the database to its original configuration.
551 **
552 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
553 ** moved to become a free-list leaf page, the corresponding bit is
554 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
555 ** optimization 2 above is omitted if the corresponding bit is already
556 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
557 ** at the end of every transaction.
558 */
559 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
560   int rc = SQLITE_OK;
561   if( !pBt->pHasContent ){
562     assert( pgno<=pBt->nPage );
563     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
564     if( !pBt->pHasContent ){
565       rc = SQLITE_NOMEM_BKPT;
566     }
567   }
568   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
569     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
570   }
571   return rc;
572 }
573 
574 /*
575 ** Query the BtShared.pHasContent vector.
576 **
577 ** This function is called when a free-list leaf page is removed from the
578 ** free-list for reuse. It returns false if it is safe to retrieve the
579 ** page from the pager layer with the 'no-content' flag set. True otherwise.
580 */
581 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
582   Bitvec *p = pBt->pHasContent;
583   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
584 }
585 
/*
** Clear (destroy) the BtShared.pHasContent bitvec. This should be
** invoked at the conclusion of each write-transaction.
**
** The pointer is reset to NULL afterwards so that btreeSetHasContent()
** creates a fresh bitvec the next time one is needed.
*/
static void btreeClearHasContent(BtShared *pBt){
  sqlite3BitvecDestroy(pBt->pHasContent);
  pBt->pHasContent = 0;
}
594 
595 /*
596 ** Release all of the apPage[] pages for a cursor.
597 */
598 static void btreeReleaseAllCursorPages(BtCursor *pCur){
599   int i;
600   for(i=0; i<=pCur->iPage; i++){
601     releasePage(pCur->apPage[i]);
602     pCur->apPage[i] = 0;
603   }
604   pCur->iPage = -1;
605 }
606 
607 /*
608 ** The cursor passed as the only argument must point to a valid entry
609 ** when this function is called (i.e. have eState==CURSOR_VALID). This
610 ** function saves the current cursor key in variables pCur->nKey and
611 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error
612 ** code otherwise.
613 **
614 ** If the cursor is open on an intkey table, then the integer key
615 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
616 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is
617 ** set to point to a malloced buffer pCur->nKey bytes in size containing
618 ** the key.
619 */
620 static int saveCursorKey(BtCursor *pCur){
621   int rc = SQLITE_OK;
622   assert( CURSOR_VALID==pCur->eState );
623   assert( 0==pCur->pKey );
624   assert( cursorHoldsMutex(pCur) );
625 
626   if( pCur->curIntKey ){
627     /* Only the rowid is required for a table btree */
628     pCur->nKey = sqlite3BtreeIntegerKey(pCur);
629   }else{
630     /* For an index btree, save the complete key content */
631     void *pKey;
632     pCur->nKey = sqlite3BtreePayloadSize(pCur);
633     pKey = sqlite3Malloc( pCur->nKey );
634     if( pKey ){
635       rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
636       if( rc==SQLITE_OK ){
637         pCur->pKey = pKey;
638       }else{
639         sqlite3_free(pKey);
640       }
641     }else{
642       rc = SQLITE_NOMEM_BKPT;
643     }
644   }
645   assert( !pCur->curIntKey || !pCur->pKey );
646   return rc;
647 }
648 
649 /*
650 ** Save the current cursor position in the variables BtCursor.nKey
651 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
652 **
653 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
654 ** prior to calling this routine.
655 */
656 static int saveCursorPosition(BtCursor *pCur){
657   int rc;
658 
659   assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
660   assert( 0==pCur->pKey );
661   assert( cursorHoldsMutex(pCur) );
662 
663   if( pCur->eState==CURSOR_SKIPNEXT ){
664     pCur->eState = CURSOR_VALID;
665   }else{
666     pCur->skipNext = 0;
667   }
668 
669   rc = saveCursorKey(pCur);
670   if( rc==SQLITE_OK ){
671     btreeReleaseAllCursorPages(pCur);
672     pCur->eState = CURSOR_REQUIRESEEK;
673   }
674 
675   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
676   return rc;
677 }
678 
/* Forward reference: saveAllCursors() below calls this helper before
** its definition appears. */
static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
681 
682 /*
683 ** Save the positions of all cursors (except pExcept) that are open on
684 ** the table with root-page iRoot.  "Saving the cursor position" means that
685 ** the location in the btree is remembered in such a way that it can be
686 ** moved back to the same spot after the btree has been modified.  This
687 ** routine is called just before cursor pExcept is used to modify the
688 ** table, for example in BtreeDelete() or BtreeInsert().
689 **
690 ** If there are two or more cursors on the same btree, then all such
691 ** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
692 ** routine enforces that rule.  This routine only needs to be called in
693 ** the uncommon case when pExpect has the BTCF_Multiple flag set.
694 **
695 ** If pExpect!=NULL and if no other cursors are found on the same root-page,
696 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another
697 ** pointless call to this routine.
698 **
699 ** Implementation note:  This routine merely checks to see if any cursors
700 ** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
701 ** event that cursors are in need to being saved.
702 */
703 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
704   BtCursor *p;
705   assert( sqlite3_mutex_held(pBt->mutex) );
706   assert( pExcept==0 || pExcept->pBt==pBt );
707   for(p=pBt->pCursor; p; p=p->pNext){
708     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
709   }
710   if( p ) return saveCursorsOnList(p, iRoot, pExcept);
711   if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
712   return SQLITE_OK;
713 }
714 
715 /* This helper routine to saveAllCursors does the actual work of saving
716 ** the cursors if and when a cursor is found that actually requires saving.
717 ** The common case is that no cursors need to be saved, so this routine is
718 ** broken out from its caller to avoid unnecessary stack pointer movement.
719 */
720 static int SQLITE_NOINLINE saveCursorsOnList(
721   BtCursor *p,         /* The first cursor that needs saving */
722   Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
723   BtCursor *pExcept    /* Do not save this cursor */
724 ){
725   do{
726     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
727       if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
728         int rc = saveCursorPosition(p);
729         if( SQLITE_OK!=rc ){
730           return rc;
731         }
732       }else{
733         testcase( p->iPage>0 );
734         btreeReleaseAllCursorPages(p);
735       }
736     }
737     p = p->pNext;
738   }while( p );
739   return SQLITE_OK;
740 }
741 
/*
** Clear the current cursor position.
**
** Any saved key held in pCur->pKey is freed and the pointer reset to
** NULL, and the cursor is put into state CURSOR_INVALID.  The cursor
** must hold the BtShared mutex (asserted).
*/
void sqlite3BtreeClearCursor(BtCursor *pCur){
  assert( cursorHoldsMutex(pCur) );
  sqlite3_free(pCur->pKey);
  pCur->pKey = 0;
  pCur->eState = CURSOR_INVALID;
}
751 
752 /*
753 ** In this version of BtreeMoveto, pKey is a packed index record
754 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
755 ** record and then call BtreeMovetoUnpacked() to do the work.
756 */
757 static int btreeMoveto(
758   BtCursor *pCur,     /* Cursor open on the btree to be searched */
759   const void *pKey,   /* Packed key if the btree is an index */
760   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
761   int bias,           /* Bias search to the high end */
762   int *pRes           /* Write search results here */
763 ){
764   int rc;                    /* Status code */
765   UnpackedRecord *pIdxKey;   /* Unpacked index key */
766 
767   if( pKey ){
768     assert( nKey==(i64)(int)nKey );
769     pIdxKey = sqlite3VdbeAllocUnpackedRecord(pCur->pKeyInfo);
770     if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
771     sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
772     if( pIdxKey->nField==0 ){
773       rc = SQLITE_CORRUPT_BKPT;
774       goto moveto_done;
775     }
776   }else{
777     pIdxKey = 0;
778   }
779   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
780 moveto_done:
781   if( pIdxKey ){
782     sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
783   }
784   return rc;
785 }
786 
787 /*
788 ** Restore the cursor to the position it was in (or as close to as possible)
789 ** when saveCursorPosition() was called. Note that this call deletes the
790 ** saved position info stored by saveCursorPosition(), so there can be
791 ** at most one effective restoreCursorPosition() call after each
792 ** saveCursorPosition().
793 */
794 static int btreeRestoreCursorPosition(BtCursor *pCur){
795   int rc;
796   int skipNext;
797   assert( cursorOwnsBtShared(pCur) );
798   assert( pCur->eState>=CURSOR_REQUIRESEEK );
799   if( pCur->eState==CURSOR_FAULT ){
800     return pCur->skipNext;
801   }
802   pCur->eState = CURSOR_INVALID;
803   rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
804   if( rc==SQLITE_OK ){
805     sqlite3_free(pCur->pKey);
806     pCur->pKey = 0;
807     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
808     pCur->skipNext |= skipNext;
809     if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
810       pCur->eState = CURSOR_SKIPNEXT;
811     }
812   }
813   return rc;
814 }
815 
/*
** Restore the position of cursor p if, and only if, it needs it.
**
** Evaluates to SQLITE_OK without doing anything when p->eState is below
** CURSOR_REQUIRESEEK; otherwise evaluates to the result of
** btreeRestoreCursorPosition(p).
**
** The argument is parenthesized in the expansion so that any pointer
** expression may be passed.  Note that p appears twice, so it should
** not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
820 
821 /*
822 ** Determine whether or not a cursor has moved from the position where
823 ** it was last placed, or has been invalidated for any other reason.
824 ** Cursors can move when the row they are pointing at is deleted out
825 ** from under them, for example.  Cursor might also move if a btree
826 ** is rebalanced.
827 **
828 ** Calling this routine with a NULL cursor pointer returns false.
829 **
830 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
831 ** back to where it ought to be if this routine returns true.
832 */
833 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
834   return pCur->eState!=CURSOR_VALID;
835 }
836 
837 /*
838 ** This routine restores a cursor back to its original position after it
839 ** has been moved by some outside activity (such as a btree rebalance or
840 ** a row having been deleted out from under the cursor).
841 **
842 ** On success, the *pDifferentRow parameter is false if the cursor is left
843 ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
844 ** was pointing to has been deleted, forcing the cursor to point to some
845 ** nearby row.
846 **
847 ** This routine should only be called for a cursor that just returned
848 ** TRUE from sqlite3BtreeCursorHasMoved().
849 */
850 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
851   int rc;
852 
853   assert( pCur!=0 );
854   assert( pCur->eState!=CURSOR_VALID );
855   rc = restoreCursorPosition(pCur);
856   if( rc ){
857     *pDifferentRow = 1;
858     return rc;
859   }
860   if( pCur->eState!=CURSOR_VALID ){
861     *pDifferentRow = 1;
862   }else{
863     assert( pCur->skipNext==0 );
864     *pDifferentRow = 0;
865   }
866   return SQLITE_OK;
867 }
868 
#ifdef SQLITE_ENABLE_CURSOR_HINTS
/*
** Hand a hint to the cursor.  eHintType selects the hint and also
** determines the number and types of the variadic arguments that follow;
** see the definitions of the BTREE_HINT_* macros for the details.
*/
void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
  /* Intentionally does nothing.  Used only by systems that substitute
  ** their own storage engine. */
  (void)pCur;
  (void)eHintType;
}
#endif
879 
880 /*
881 ** Provide flag hints to the cursor.
882 */
883 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
884   assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
885   pCur->hints = x;
886 }
887 
888 
889 #ifndef SQLITE_OMIT_AUTOVACUUM
890 /*
891 ** Given a page number of a regular database page, return the page
892 ** number for the pointer-map page that contains the entry for the
893 ** input page number.
894 **
895 ** Return 0 (not a valid page) for pgno==1 since there is
896 ** no pointer map associated with page 1.  The integrity_check logic
897 ** requires that ptrmapPageno(*,1)!=1.
898 */
899 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
900   int nPagesPerMapPage;
901   Pgno iPtrMap, ret;
902   assert( sqlite3_mutex_held(pBt->mutex) );
903   if( pgno<2 ) return 0;
904   nPagesPerMapPage = (pBt->usableSize/5)+1;
905   iPtrMap = (pgno-2)/nPagesPerMapPage;
906   ret = (iPtrMap*nPagesPerMapPage) + 2;
907   if( ret==PENDING_BYTE_PAGE(pBt) ){
908     ret++;
909   }
910   return ret;
911 }
912 
913 /*
914 ** Write an entry into the pointer map.
915 **
916 ** This routine updates the pointer map entry for page number 'key'
917 ** so that it maps to type 'eType' and parent page number 'pgno'.
918 **
919 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
920 ** a no-op.  If an error occurs, the appropriate error code is written
921 ** into *pRC.
922 */
923 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
924   DbPage *pDbPage;  /* The pointer map page */
925   u8 *pPtrmap;      /* The pointer map data */
926   Pgno iPtrmap;     /* The pointer map page number */
927   int offset;       /* Offset in pointer map page */
928   int rc;           /* Return code from subfunctions */
929 
930   if( *pRC ) return;
931 
932   assert( sqlite3_mutex_held(pBt->mutex) );
933   /* The master-journal page number must never be used as a pointer map page */
934   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
935 
936   assert( pBt->autoVacuum );
937   if( key==0 ){
938     *pRC = SQLITE_CORRUPT_BKPT;
939     return;
940   }
941   iPtrmap = PTRMAP_PAGENO(pBt, key);
942   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
943   if( rc!=SQLITE_OK ){
944     *pRC = rc;
945     return;
946   }
947   offset = PTRMAP_PTROFFSET(iPtrmap, key);
948   if( offset<0 ){
949     *pRC = SQLITE_CORRUPT_BKPT;
950     goto ptrmap_exit;
951   }
952   assert( offset <= (int)pBt->usableSize-5 );
953   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
954 
955   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
956     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
957     *pRC= rc = sqlite3PagerWrite(pDbPage);
958     if( rc==SQLITE_OK ){
959       pPtrmap[offset] = eType;
960       put4byte(&pPtrmap[offset+1], parent);
961     }
962   }
963 
964 ptrmap_exit:
965   sqlite3PagerUnref(pDbPage);
966 }
967 
968 /*
969 ** Read an entry from the pointer map.
970 **
971 ** This routine retrieves the pointer map entry for page 'key', writing
972 ** the type and parent page number to *pEType and *pPgno respectively.
973 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
974 */
975 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
976   DbPage *pDbPage;   /* The pointer map page */
977   int iPtrmap;       /* Pointer map page index */
978   u8 *pPtrmap;       /* Pointer map page data */
979   int offset;        /* Offset of entry in pointer map */
980   int rc;
981 
982   assert( sqlite3_mutex_held(pBt->mutex) );
983 
984   iPtrmap = PTRMAP_PAGENO(pBt, key);
985   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
986   if( rc!=0 ){
987     return rc;
988   }
989   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
990 
991   offset = PTRMAP_PTROFFSET(iPtrmap, key);
992   if( offset<0 ){
993     sqlite3PagerUnref(pDbPage);
994     return SQLITE_CORRUPT_BKPT;
995   }
996   assert( offset <= (int)pBt->usableSize-5 );
997   assert( pEType!=0 );
998   *pEType = pPtrmap[offset];
999   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
1000 
1001   sqlite3PagerUnref(pDbPage);
1002   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
1003   return SQLITE_OK;
1004 }
1005 
1006 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
1007   #define ptrmapPut(w,x,y,z,rc)
1008   #define ptrmapGet(w,x,y,z) SQLITE_OK
1009   #define ptrmapPutOvflPtr(x, y, rc)
1010 #endif
1011 
/*
** findCell(PG,IX) evaluates to a pointer to the content of cell number
** IX on btree page PG, where IX==0 is the first cell on the page, IX==1
** is the second, and so forth.  The 2-byte entry IX of PG->aCellIdx[] is
** read, masked with PG->maskPage, and added to PG->aData.
**
** findCellPastPtr(PG,IX) does the same but is based on PG->aDataOfst, so
** that it skips past the initial 4-byte child pointer found on interior
** pages, if there is one.
**
** These macros work only for pages that do not contain overflow cells.
** Both arguments are evaluated more than once.
*/
#define findCell(PG,IX)                                              \
  ((PG)->aData                                                       \
     + ((PG)->maskPage & get2byteAligned(&(PG)->aCellIdx[2*(IX)])))
#define findCellPastPtr(PG,IX)                                       \
  ((PG)->aDataOfst                                                   \
     + ((PG)->maskPage & get2byteAligned(&(PG)->aCellIdx[2*(IX)])))
1026 
1027 
1028 /*
1029 ** This is common tail processing for btreeParseCellPtr() and
1030 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
1031 ** on a single B-tree page.  Make necessary adjustments to the CellInfo
1032 ** structure.
1033 */
1034 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
1035   MemPage *pPage,         /* Page containing the cell */
1036   u8 *pCell,              /* Pointer to the cell text. */
1037   CellInfo *pInfo         /* Fill in this structure */
1038 ){
1039   /* If the payload will not fit completely on the local page, we have
1040   ** to decide how much to store locally and how much to spill onto
1041   ** overflow pages.  The strategy is to minimize the amount of unused
1042   ** space on overflow pages while keeping the amount of local storage
1043   ** in between minLocal and maxLocal.
1044   **
1045   ** Warning:  changing the way overflow payload is distributed in any
1046   ** way will result in an incompatible file format.
1047   */
1048   int minLocal;  /* Minimum amount of payload held locally */
1049   int maxLocal;  /* Maximum amount of payload held locally */
1050   int surplus;   /* Overflow payload available for local storage */
1051 
1052   minLocal = pPage->minLocal;
1053   maxLocal = pPage->maxLocal;
1054   surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
1055   testcase( surplus==maxLocal );
1056   testcase( surplus==maxLocal+1 );
1057   if( surplus <= maxLocal ){
1058     pInfo->nLocal = (u16)surplus;
1059   }else{
1060     pInfo->nLocal = (u16)minLocal;
1061   }
1062   pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
1063 }
1064 
1065 /*
1066 ** The following routines are implementations of the MemPage.xParseCell()
1067 ** method.
1068 **
1069 ** Parse a cell content block and fill in the CellInfo structure.
1070 **
1071 ** btreeParseCellPtr()        =>   table btree leaf nodes
** btreeParseCellPtrNoPayload() => table btree internal nodes
1073 ** btreeParseCellPtrIndex()   =>   index btree nodes
1074 **
1075 ** There is also a wrapper function btreeParseCell() that works for
1076 ** all MemPage types and that references the cell by index rather than
1077 ** by pointer.
1078 */
1079 static void btreeParseCellPtrNoPayload(
1080   MemPage *pPage,         /* Page containing the cell */
1081   u8 *pCell,              /* Pointer to the cell text. */
1082   CellInfo *pInfo         /* Fill in this structure */
1083 ){
1084   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1085   assert( pPage->leaf==0 );
1086   assert( pPage->childPtrSize==4 );
1087 #ifndef SQLITE_DEBUG
1088   UNUSED_PARAMETER(pPage);
1089 #endif
1090   pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
1091   pInfo->nPayload = 0;
1092   pInfo->nLocal = 0;
1093   pInfo->pPayload = 0;
1094   return;
1095 }
1096 static void btreeParseCellPtr(
1097   MemPage *pPage,         /* Page containing the cell */
1098   u8 *pCell,              /* Pointer to the cell text. */
1099   CellInfo *pInfo         /* Fill in this structure */
1100 ){
1101   u8 *pIter;              /* For scanning through pCell */
1102   u32 nPayload;           /* Number of bytes of cell payload */
1103   u64 iKey;               /* Extracted Key value */
1104 
1105   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1106   assert( pPage->leaf==0 || pPage->leaf==1 );
1107   assert( pPage->intKeyLeaf );
1108   assert( pPage->childPtrSize==0 );
1109   pIter = pCell;
1110 
1111   /* The next block of code is equivalent to:
1112   **
1113   **     pIter += getVarint32(pIter, nPayload);
1114   **
1115   ** The code is inlined to avoid a function call.
1116   */
1117   nPayload = *pIter;
1118   if( nPayload>=0x80 ){
1119     u8 *pEnd = &pIter[8];
1120     nPayload &= 0x7f;
1121     do{
1122       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1123     }while( (*pIter)>=0x80 && pIter<pEnd );
1124   }
1125   pIter++;
1126 
1127   /* The next block of code is equivalent to:
1128   **
1129   **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
1130   **
1131   ** The code is inlined to avoid a function call.
1132   */
1133   iKey = *pIter;
1134   if( iKey>=0x80 ){
1135     u8 *pEnd = &pIter[7];
1136     iKey &= 0x7f;
1137     while(1){
1138       iKey = (iKey<<7) | (*++pIter & 0x7f);
1139       if( (*pIter)<0x80 ) break;
1140       if( pIter>=pEnd ){
1141         iKey = (iKey<<8) | *++pIter;
1142         break;
1143       }
1144     }
1145   }
1146   pIter++;
1147 
1148   pInfo->nKey = *(i64*)&iKey;
1149   pInfo->nPayload = nPayload;
1150   pInfo->pPayload = pIter;
1151   testcase( nPayload==pPage->maxLocal );
1152   testcase( nPayload==pPage->maxLocal+1 );
1153   if( nPayload<=pPage->maxLocal ){
1154     /* This is the (easy) common case where the entire payload fits
1155     ** on the local page.  No overflow is required.
1156     */
1157     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1158     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1159     pInfo->nLocal = (u16)nPayload;
1160   }else{
1161     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1162   }
1163 }
1164 static void btreeParseCellPtrIndex(
1165   MemPage *pPage,         /* Page containing the cell */
1166   u8 *pCell,              /* Pointer to the cell text. */
1167   CellInfo *pInfo         /* Fill in this structure */
1168 ){
1169   u8 *pIter;              /* For scanning through pCell */
1170   u32 nPayload;           /* Number of bytes of cell payload */
1171 
1172   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1173   assert( pPage->leaf==0 || pPage->leaf==1 );
1174   assert( pPage->intKeyLeaf==0 );
1175   pIter = pCell + pPage->childPtrSize;
1176   nPayload = *pIter;
1177   if( nPayload>=0x80 ){
1178     u8 *pEnd = &pIter[8];
1179     nPayload &= 0x7f;
1180     do{
1181       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1182     }while( *(pIter)>=0x80 && pIter<pEnd );
1183   }
1184   pIter++;
1185   pInfo->nKey = nPayload;
1186   pInfo->nPayload = nPayload;
1187   pInfo->pPayload = pIter;
1188   testcase( nPayload==pPage->maxLocal );
1189   testcase( nPayload==pPage->maxLocal+1 );
1190   if( nPayload<=pPage->maxLocal ){
1191     /* This is the (easy) common case where the entire payload fits
1192     ** on the local page.  No overflow is required.
1193     */
1194     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1195     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1196     pInfo->nLocal = (u16)nPayload;
1197   }else{
1198     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1199   }
1200 }
1201 static void btreeParseCell(
1202   MemPage *pPage,         /* Page containing the cell */
1203   int iCell,              /* The cell index.  First cell is 0 */
1204   CellInfo *pInfo         /* Fill in this structure */
1205 ){
1206   pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
1207 }
1208 
1209 /*
1210 ** The following routines are implementations of the MemPage.xCellSize
1211 ** method.
1212 **
1213 ** Compute the total number of bytes that a Cell needs in the cell
1214 ** data area of the btree-page.  The return number includes the cell
1215 ** data header and the local payload, but not any overflow page or
1216 ** the space used by the cell pointer.
1217 **
1218 ** cellSizePtrNoPayload()    =>   table internal nodes
1219 ** cellSizePtr()             =>   all index nodes & table leaf nodes
1220 */
1221 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
1222   u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
1223   u8 *pEnd;                                /* End mark for a varint */
1224   u32 nSize;                               /* Size value to return */
1225 
1226 #ifdef SQLITE_DEBUG
1227   /* The value returned by this function should always be the same as
1228   ** the (CellInfo.nSize) value found by doing a full parse of the
1229   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1230   ** this function verifies that this invariant is not violated. */
1231   CellInfo debuginfo;
1232   pPage->xParseCell(pPage, pCell, &debuginfo);
1233 #endif
1234 
1235   nSize = *pIter;
1236   if( nSize>=0x80 ){
1237     pEnd = &pIter[8];
1238     nSize &= 0x7f;
1239     do{
1240       nSize = (nSize<<7) | (*++pIter & 0x7f);
1241     }while( *(pIter)>=0x80 && pIter<pEnd );
1242   }
1243   pIter++;
1244   if( pPage->intKey ){
1245     /* pIter now points at the 64-bit integer key value, a variable length
1246     ** integer. The following block moves pIter to point at the first byte
1247     ** past the end of the key value. */
1248     pEnd = &pIter[9];
1249     while( (*pIter++)&0x80 && pIter<pEnd );
1250   }
1251   testcase( nSize==pPage->maxLocal );
1252   testcase( nSize==pPage->maxLocal+1 );
1253   if( nSize<=pPage->maxLocal ){
1254     nSize += (u32)(pIter - pCell);
1255     if( nSize<4 ) nSize = 4;
1256   }else{
1257     int minLocal = pPage->minLocal;
1258     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1259     testcase( nSize==pPage->maxLocal );
1260     testcase( nSize==pPage->maxLocal+1 );
1261     if( nSize>pPage->maxLocal ){
1262       nSize = minLocal;
1263     }
1264     nSize += 4 + (u16)(pIter - pCell);
1265   }
1266   assert( nSize==debuginfo.nSize || CORRUPT_DB );
1267   return (u16)nSize;
1268 }
1269 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
1270   u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
1271   u8 *pEnd;              /* End mark for a varint */
1272 
1273 #ifdef SQLITE_DEBUG
1274   /* The value returned by this function should always be the same as
1275   ** the (CellInfo.nSize) value found by doing a full parse of the
1276   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1277   ** this function verifies that this invariant is not violated. */
1278   CellInfo debuginfo;
1279   pPage->xParseCell(pPage, pCell, &debuginfo);
1280 #else
1281   UNUSED_PARAMETER(pPage);
1282 #endif
1283 
1284   assert( pPage->childPtrSize==4 );
1285   pEnd = pIter + 9;
1286   while( (*pIter++)&0x80 && pIter<pEnd );
1287   assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
1288   return (u16)(pIter - pCell);
1289 }
1290 
1291 
#ifdef SQLITE_DEBUG
/* Index-based wrapper around the page's xCellSize method: look up cell
** iCell with findCell() and return its size.  This variation is used
** inside of assert() statements only. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return pPage->xCellSize(pPage, pCell);
}
#endif
1299 
1300 #ifndef SQLITE_OMIT_AUTOVACUUM
1301 /*
1302 ** If the cell pCell, part of page pPage contains a pointer
1303 ** to an overflow page, insert an entry into the pointer-map
1304 ** for the overflow page.
1305 */
1306 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1307   CellInfo info;
1308   if( *pRC ) return;
1309   assert( pCell!=0 );
1310   pPage->xParseCell(pPage, pCell, &info);
1311   if( info.nLocal<info.nPayload ){
1312     Pgno ovfl = get4byte(&pCell[info.nSize-4]);
1313     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1314   }
1315 }
1316 #endif
1317 
1318 
1319 /*
1320 ** Defragment the page given.  All Cells are moved to the
1321 ** end of the page and all free space is collected into one
1322 ** big FreeBlk that occurs in between the header and cell
1323 ** pointer array and the cell content area.
1324 **
1325 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
1326 ** b-tree page so that there are no freeblocks or fragment bytes, all
1327 ** unused bytes are contained in the unallocated space region, and all
1328 ** cells are packed tightly at the end of the page.
1329 */
1330 static int defragmentPage(MemPage *pPage){
1331   int i;                     /* Loop counter */
1332   int pc;                    /* Address of the i-th cell */
1333   int hdr;                   /* Offset to the page header */
1334   int size;                  /* Size of a cell */
1335   int usableSize;            /* Number of usable bytes on a page */
1336   int cellOffset;            /* Offset to the cell pointer array */
1337   int cbrk;                  /* Offset to the cell content area */
1338   int nCell;                 /* Number of cells on the page */
1339   unsigned char *data;       /* The page data */
1340   unsigned char *temp;       /* Temp area for cell content */
1341   unsigned char *src;        /* Source of content */
1342   int iCellFirst;            /* First allowable cell index */
1343   int iCellLast;             /* Last possible cell index */
1344 
1345 
1346   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1347   assert( pPage->pBt!=0 );
1348   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1349   assert( pPage->nOverflow==0 );
1350   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1351   temp = 0;
1352   src = data = pPage->aData;
1353   hdr = pPage->hdrOffset;
1354   cellOffset = pPage->cellOffset;
1355   nCell = pPage->nCell;
1356   assert( nCell==get2byte(&data[hdr+3]) );
1357   usableSize = pPage->pBt->usableSize;
1358   cbrk = usableSize;
1359   iCellFirst = cellOffset + 2*nCell;
1360   iCellLast = usableSize - 4;
1361   for(i=0; i<nCell; i++){
1362     u8 *pAddr;     /* The i-th cell pointer */
1363     pAddr = &data[cellOffset + i*2];
1364     pc = get2byte(pAddr);
1365     testcase( pc==iCellFirst );
1366     testcase( pc==iCellLast );
1367     /* These conditions have already been verified in btreeInitPage()
1368     ** if PRAGMA cell_size_check=ON.
1369     */
1370     if( pc<iCellFirst || pc>iCellLast ){
1371       return SQLITE_CORRUPT_BKPT;
1372     }
1373     assert( pc>=iCellFirst && pc<=iCellLast );
1374     size = pPage->xCellSize(pPage, &src[pc]);
1375     cbrk -= size;
1376     if( cbrk<iCellFirst || pc+size>usableSize ){
1377       return SQLITE_CORRUPT_BKPT;
1378     }
1379     assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
1380     testcase( cbrk+size==usableSize );
1381     testcase( pc+size==usableSize );
1382     put2byte(pAddr, cbrk);
1383     if( temp==0 ){
1384       int x;
1385       if( cbrk==pc ) continue;
1386       temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1387       x = get2byte(&data[hdr+5]);
1388       memcpy(&temp[x], &data[x], (cbrk+size) - x);
1389       src = temp;
1390     }
1391     memcpy(&data[cbrk], &src[pc], size);
1392   }
1393   assert( cbrk>=iCellFirst );
1394   put2byte(&data[hdr+5], cbrk);
1395   data[hdr+1] = 0;
1396   data[hdr+2] = 0;
1397   data[hdr+7] = 0;
1398   memset(&data[iCellFirst], 0, cbrk-iCellFirst);
1399   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1400   if( cbrk-iCellFirst!=pPage->nFree ){
1401     return SQLITE_CORRUPT_BKPT;
1402   }
1403   return SQLITE_OK;
1404 }
1405 
1406 /*
1407 ** Search the free-list on page pPg for space to store a cell nByte bytes in
1408 ** size. If one can be found, return a pointer to the space and remove it
1409 ** from the free-list.
1410 **
1411 ** If no suitable space can be found on the free-list, return NULL.
1412 **
1413 ** This function may detect corruption within pPg.  If corruption is
1414 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
1415 **
1416 ** Slots on the free list that are between 1 and 3 bytes larger than nByte
1417 ** will be ignored if adding the extra space to the fragmentation count
1418 ** causes the fragmentation count to exceed 60.
1419 */
1420 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
1421   const int hdr = pPg->hdrOffset;
1422   u8 * const aData = pPg->aData;
1423   int iAddr = hdr + 1;
1424   int pc = get2byte(&aData[iAddr]);
1425   int x;
1426   int usableSize = pPg->pBt->usableSize;
1427 
1428   assert( pc>0 );
1429   do{
1430     int size;            /* Size of the free slot */
1431     /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
1432     ** increasing offset. */
1433     if( pc>usableSize-4 || pc<iAddr+4 ){
1434       *pRc = SQLITE_CORRUPT_BKPT;
1435       return 0;
1436     }
1437     /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
1438     ** freeblock form a big-endian integer which is the size of the freeblock
1439     ** in bytes, including the 4-byte header. */
1440     size = get2byte(&aData[pc+2]);
1441     if( (x = size - nByte)>=0 ){
1442       testcase( x==4 );
1443       testcase( x==3 );
1444       if( pc < pPg->cellOffset+2*pPg->nCell || size+pc > usableSize ){
1445         *pRc = SQLITE_CORRUPT_BKPT;
1446         return 0;
1447       }else if( x<4 ){
1448         /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
1449         ** number of bytes in fragments may not exceed 60. */
1450         if( aData[hdr+7]>57 ) return 0;
1451 
1452         /* Remove the slot from the free-list. Update the number of
1453         ** fragmented bytes within the page. */
1454         memcpy(&aData[iAddr], &aData[pc], 2);
1455         aData[hdr+7] += (u8)x;
1456       }else{
1457         /* The slot remains on the free-list. Reduce its size to account
1458          ** for the portion used by the new allocation. */
1459         put2byte(&aData[pc+2], x);
1460       }
1461       return &aData[pc + x];
1462     }
1463     iAddr = pc;
1464     pc = get2byte(&aData[pc]);
1465   }while( pc );
1466 
1467   return 0;
1468 }
1469 
1470 /*
1471 ** Allocate nByte bytes of space from within the B-Tree page passed
1472 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1473 ** of the first byte of allocated space. Return either SQLITE_OK or
1474 ** an error code (usually SQLITE_CORRUPT).
1475 **
1476 ** The caller guarantees that there is sufficient space to make the
1477 ** allocation.  This routine might need to defragment in order to bring
1478 ** all the space together, however.  This routine will avoid using
1479 ** the first two bytes past the cell pointer area since presumably this
1480 ** allocation is being made in order to insert a new cell, so we will
1481 ** also end up needing a new cell pointer.
1482 */
1483 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1484   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
1485   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
1486   int top;                             /* First byte of cell content area */
1487   int rc = SQLITE_OK;                  /* Integer return code */
1488   int gap;        /* First byte of gap between cell pointers and cell content */
1489 
1490   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1491   assert( pPage->pBt );
1492   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1493   assert( nByte>=0 );  /* Minimum cell size is 4 */
1494   assert( pPage->nFree>=nByte );
1495   assert( pPage->nOverflow==0 );
1496   assert( nByte < (int)(pPage->pBt->usableSize-8) );
1497 
1498   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1499   gap = pPage->cellOffset + 2*pPage->nCell;
1500   assert( gap<=65536 );
1501   /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
1502   ** and the reserved space is zero (the usual value for reserved space)
1503   ** then the cell content offset of an empty page wants to be 65536.
1504   ** However, that integer is too large to be stored in a 2-byte unsigned
1505   ** integer, so a value of 0 is used in its place. */
1506   top = get2byte(&data[hdr+5]);
1507   assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */
1508   if( gap>top ){
1509     if( top==0 && pPage->pBt->usableSize==65536 ){
1510       top = 65536;
1511     }else{
1512       return SQLITE_CORRUPT_BKPT;
1513     }
1514   }
1515 
1516   /* If there is enough space between gap and top for one more cell pointer
1517   ** array entry offset, and if the freelist is not empty, then search the
1518   ** freelist looking for a free slot big enough to satisfy the request.
1519   */
1520   testcase( gap+2==top );
1521   testcase( gap+1==top );
1522   testcase( gap==top );
1523   if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
1524     u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
1525     if( pSpace ){
1526       assert( pSpace>=data && (pSpace - data)<65536 );
1527       *pIdx = (int)(pSpace - data);
1528       return SQLITE_OK;
1529     }else if( rc ){
1530       return rc;
1531     }
1532   }
1533 
1534   /* The request could not be fulfilled using a freelist slot.  Check
1535   ** to see if defragmentation is necessary.
1536   */
1537   testcase( gap+2+nByte==top );
1538   if( gap+2+nByte>top ){
1539     assert( pPage->nCell>0 || CORRUPT_DB );
1540     rc = defragmentPage(pPage);
1541     if( rc ) return rc;
1542     top = get2byteNotZero(&data[hdr+5]);
1543     assert( gap+nByte<=top );
1544   }
1545 
1546 
1547   /* Allocate memory from the gap in between the cell pointer array
1548   ** and the cell content area.  The btreeInitPage() call has already
1549   ** validated the freelist.  Given that the freelist is valid, there
1550   ** is no way that the allocation can extend off the end of the page.
1551   ** The assert() below verifies the previous sentence.
1552   */
1553   top -= nByte;
1554   put2byte(&data[hdr+5], top);
1555   assert( top+nByte <= (int)pPage->pBt->usableSize );
1556   *pIdx = top;
1557   return SQLITE_OK;
1558 }
1559 
1560 /*
1561 ** Return a section of the pPage->aData to the freelist.
1562 ** The first byte of the new free block is pPage->aData[iStart]
1563 ** and the size of the block is iSize bytes.
1564 **
1565 ** Adjacent freeblocks are coalesced.
1566 **
1567 ** Note that even though the freeblock list was checked by btreeInitPage(),
1568 ** that routine will not detect overlap between cells or freeblocks.  Nor
1569 ** does it detect cells or freeblocks that encrouch into the reserved bytes
1570 ** at the end of the page.  So do additional corruption checks inside this
1571 ** routine and return SQLITE_CORRUPT if any problems are found.
1572 */
1573 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
1574   u16 iPtr;                             /* Address of ptr to next freeblock */
1575   u16 iFreeBlk;                         /* Address of the next freeblock */
1576   u8 hdr;                               /* Page header size.  0 or 100 */
1577   u8 nFrag = 0;                         /* Reduction in fragmentation */
1578   u16 iOrigSize = iSize;                /* Original value of iSize */
1579   u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */
1580   u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
1581   unsigned char *data = pPage->aData;   /* Page content */
1582 
1583   assert( pPage->pBt!=0 );
1584   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1585   assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
1586   assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
1587   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1588   assert( iSize>=4 );   /* Minimum cell size is 4 */
1589   assert( iStart<=iLast );
1590 
1591   /* Overwrite deleted information with zeros when the secure_delete
1592   ** option is enabled */
1593   if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
1594     memset(&data[iStart], 0, iSize);
1595   }
1596 
1597   /* The list of freeblocks must be in ascending order.  Find the
1598   ** spot on the list where iStart should be inserted.
1599   */
1600   hdr = pPage->hdrOffset;
1601   iPtr = hdr + 1;
1602   if( data[iPtr+1]==0 && data[iPtr]==0 ){
1603     iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
1604   }else{
1605     while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
1606       if( iFreeBlk<iPtr+4 ){
1607         if( iFreeBlk==0 ) break;
1608         return SQLITE_CORRUPT_BKPT;
1609       }
1610       iPtr = iFreeBlk;
1611     }
1612     if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT;
1613     assert( iFreeBlk>iPtr || iFreeBlk==0 );
1614 
1615     /* At this point:
1616     **    iFreeBlk:   First freeblock after iStart, or zero if none
1617     **    iPtr:       The address of a pointer to iFreeBlk
1618     **
1619     ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
1620     */
1621     if( iFreeBlk && iEnd+3>=iFreeBlk ){
1622       nFrag = iFreeBlk - iEnd;
1623       if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT;
1624       iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
1625       if( iEnd > pPage->pBt->usableSize ) return SQLITE_CORRUPT_BKPT;
1626       iSize = iEnd - iStart;
1627       iFreeBlk = get2byte(&data[iFreeBlk]);
1628     }
1629 
1630     /* If iPtr is another freeblock (that is, if iPtr is not the freelist
1631     ** pointer in the page header) then check to see if iStart should be
1632     ** coalesced onto the end of iPtr.
1633     */
1634     if( iPtr>hdr+1 ){
1635       int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
1636       if( iPtrEnd+3>=iStart ){
1637         if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT;
1638         nFrag += iStart - iPtrEnd;
1639         iSize = iEnd - iPtr;
1640         iStart = iPtr;
1641       }
1642     }
1643     if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT;
1644     data[hdr+7] -= nFrag;
1645   }
1646   if( iStart==get2byte(&data[hdr+5]) ){
1647     /* The new freeblock is at the beginning of the cell content area,
1648     ** so just extend the cell content area rather than create another
1649     ** freelist entry */
1650     if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT;
1651     put2byte(&data[hdr+1], iFreeBlk);
1652     put2byte(&data[hdr+5], iEnd);
1653   }else{
1654     /* Insert the new freeblock into the freelist */
1655     put2byte(&data[iPtr], iStart);
1656     put2byte(&data[iStart], iFreeBlk);
1657     put2byte(&data[iStart+2], iSize);
1658   }
1659   pPage->nFree += iOrigSize;
1660   return SQLITE_OK;
1661 }
1662 
1663 /*
1664 ** Decode the flags byte (the first byte of the header) for a page
1665 ** and initialize fields of the MemPage structure accordingly.
1666 **
1667 ** Only the following combinations are supported.  Anything different
1668 ** indicates a corrupt database files:
1669 **
1670 **         PTF_ZERODATA
1671 **         PTF_ZERODATA | PTF_LEAF
1672 **         PTF_LEAFDATA | PTF_INTKEY
1673 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1674 */
1675 static int decodeFlags(MemPage *pPage, int flagByte){
1676   BtShared *pBt;     /* A copy of pPage->pBt */
1677 
1678   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1679   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1680   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1681   flagByte &= ~PTF_LEAF;
1682   pPage->childPtrSize = 4-4*pPage->leaf;
1683   pPage->xCellSize = cellSizePtr;
1684   pBt = pPage->pBt;
1685   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1686     /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an
1687     ** interior table b-tree page. */
1688     assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
1689     /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a
1690     ** leaf table b-tree page. */
1691     assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
1692     pPage->intKey = 1;
1693     if( pPage->leaf ){
1694       pPage->intKeyLeaf = 1;
1695       pPage->xParseCell = btreeParseCellPtr;
1696     }else{
1697       pPage->intKeyLeaf = 0;
1698       pPage->xCellSize = cellSizePtrNoPayload;
1699       pPage->xParseCell = btreeParseCellPtrNoPayload;
1700     }
1701     pPage->maxLocal = pBt->maxLeaf;
1702     pPage->minLocal = pBt->minLeaf;
1703   }else if( flagByte==PTF_ZERODATA ){
1704     /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an
1705     ** interior index b-tree page. */
1706     assert( (PTF_ZERODATA)==2 );
1707     /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a
1708     ** leaf index b-tree page. */
1709     assert( (PTF_ZERODATA|PTF_LEAF)==10 );
1710     pPage->intKey = 0;
1711     pPage->intKeyLeaf = 0;
1712     pPage->xParseCell = btreeParseCellPtrIndex;
1713     pPage->maxLocal = pBt->maxLocal;
1714     pPage->minLocal = pBt->minLocal;
1715   }else{
1716     /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
1717     ** an error. */
1718     return SQLITE_CORRUPT_BKPT;
1719   }
1720   pPage->max1bytePayload = pBt->max1bytePayload;
1721   return SQLITE_OK;
1722 }
1723 
1724 /*
1725 ** Initialize the auxiliary information for a disk block.
1726 **
1727 ** Return SQLITE_OK on success.  If we see that the page does
1728 ** not contain a well-formed database page, then return
1729 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1730 ** guarantee that the page is well-formed.  It only shows that
1731 ** we failed to detect any corruption.
1732 */
1733 static int btreeInitPage(MemPage *pPage){
1734
1735   assert( pPage->pBt!=0 );
1736   assert( pPage->pBt->db!=0 );
1737   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1738   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1739   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1740   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1741
       /* Nothing is parsed if the page is already flagged as initialized;
       ** in that case the routine simply returns SQLITE_OK. */
1742   if( !pPage->isInit ){
1743     int pc;            /* Address of a freeblock within pPage->aData[] */
1744     u8 hdr;            /* Offset to beginning of page header */
1745     u8 *data;          /* Equal to pPage->aData */
1746     BtShared *pBt;        /* The main btree structure */
1747     int usableSize;    /* Amount of usable space on each page */
1748     u16 cellOffset;    /* Offset from start of page to first cell pointer */
1749     int nFree;         /* Number of unused bytes on the page */
1750     int top;           /* First byte of the cell content area */
1751     int iCellFirst;    /* First allowable cell or freeblock offset */
1752     int iCellLast;     /* Last possible cell or freeblock offset */
1753
1754     pBt = pPage->pBt;
1755
1756     hdr = pPage->hdrOffset;
1757     data = pPage->aData;
1758     /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
1759     ** the b-tree page type. */
1760     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1761     assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
         /* With pageSize asserted to be at most 65536, pageSize-1 fits in
         ** the u16 mask. */
1762     pPage->maskPage = (u16)(pBt->pageSize - 1);
1763     pPage->nOverflow = 0;
1764     usableSize = pBt->usableSize;
         /* The cell pointer array begins 8 bytes into the header, plus
         ** childPtrSize more bytes (childPtrSize was set by decodeFlags()
         ** above). */
1765     pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
1766     pPage->aDataEnd = &data[usableSize];
1767     pPage->aCellIdx = &data[cellOffset];
1768     pPage->aDataOfst = &data[pPage->childPtrSize];
1769     /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
1770     ** the start of the cell content area. A zero value for this integer is
1771     ** interpreted as 65536. */
1772     top = get2byteNotZero(&data[hdr+5]);
1773     /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
1774     ** number of cells on the page. */
1775     pPage->nCell = get2byte(&data[hdr+3]);
1776     if( pPage->nCell>MX_CELL(pBt) ){
1777       /* Too many cells for a single page.  The page must be corrupt */
1778       return SQLITE_CORRUPT_BKPT;
1779     }
1780     testcase( pPage->nCell==MX_CELL(pBt) );
1781     /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
1782     ** possible for a root page of a table that contains no rows) then the
1783     ** offset to the cell content area will equal the page size minus the
1784     ** bytes of reserved space. */
1785     assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );
1786
1787     /* A malformed database page might cause us to read past the end
1788     ** of page when parsing a cell.
1789     **
1790     ** The following block of code checks early to see if a cell extends
1791     ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1792     ** returned if it does.
1793     **
         ** The per-cell check only runs when the SQLITE_CellSizeCk flag is set
         ** on the database connection.
1793     */
1794     iCellFirst = cellOffset + 2*pPage->nCell;
1795     iCellLast = usableSize - 4;
1796     if( pBt->db->flags & SQLITE_CellSizeCk ){
1797       int i;            /* Index into the cell pointer array */
1798       int sz;           /* Size of a cell */
1799
           /* On non-leaf pages the upper bound for a cell offset is one byte
           ** lower while the cells are checked.  It is restored after the loop
           ** so that the freeblock checks below use the original bound. */
1800       if( !pPage->leaf ) iCellLast--;
1801       for(i=0; i<pPage->nCell; i++){
1802         pc = get2byteAligned(&data[cellOffset+i*2]);
1803         testcase( pc==iCellFirst );
1804         testcase( pc==iCellLast );
1805         if( pc<iCellFirst || pc>iCellLast ){
1806           return SQLITE_CORRUPT_BKPT;
1807         }
1808         sz = pPage->xCellSize(pPage, &data[pc]);
1809         testcase( pc+sz==usableSize );
1810         if( pc+sz>usableSize ){
1811           return SQLITE_CORRUPT_BKPT;
1812         }
1813       }
1814       if( !pPage->leaf ) iCellLast++;
1815     }
1816
1817     /* Compute the total free space on the page
1818     ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
1819     ** start of the first freeblock on the page, or is zero if there are no
1820     ** freeblocks. */
1821     pc = get2byte(&data[hdr+1]);
1822     nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
1823     if( pc>0 ){
1824       u32 next, size;
1825       if( pc<iCellFirst ){
1826         /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
1827         ** always be at least one cell before the first freeblock.
1828         */
1829         return SQLITE_CORRUPT_BKPT;
1830       }
           /* Walk the freeblock chain, adding the size of each block to nFree.
           ** The loop stops at the first block whose "next" pointer is not more
           ** than 3 bytes beyond the end of that block.  That covers both the
           ** end of the chain (next==0) and a chain that is not in ascending
           ** order; the two cases are told apart right after the loop. */
1831       while( 1 ){
1832         if( pc>iCellLast ){
1833           return SQLITE_CORRUPT_BKPT; /* Freeblock off the end of the page */
1834         }
1835         next = get2byte(&data[pc]);
1836         size = get2byte(&data[pc+2]);
1837         nFree = nFree + size;
1838         if( next<=pc+size+3 ) break;
1839         pc = next;
1840       }
1841       if( next>0 ){
1842         return SQLITE_CORRUPT_BKPT;  /* Freeblock not in ascending order */
1843       }
1844       if( pc+size>(unsigned int)usableSize ){
1845         return SQLITE_CORRUPT_BKPT;  /* Last freeblock extends past page end */
1846       }
1847     }
1848
1849     /* At this point, nFree contains the sum of the offset to the start
1850     ** of the cell-content area plus the number of free bytes within
1851     ** the cell-content area. If this is greater than the usable-size
1852     ** of the page, then the page must be corrupted. This check also
1853     ** serves to verify that the offset to the start of the cell-content
1854     ** area, according to the page header, lies within the page.
1855     */
1856     if( nFree>usableSize ){
1857       return SQLITE_CORRUPT_BKPT;
1858     }
         /* nFree so far counts every byte ahead of the cell content area as
         ** free.  Remove everything before iCellFirst (the page header and the
         ** cell pointer array) to get the free byte count that is stored. */
1859     pPage->nFree = (u16)(nFree - iCellFirst);
1860     pPage->isInit = 1;
1861   }
1862   return SQLITE_OK;
1863 }
1864 
1865 /*
1866 ** Set up a raw page so that it looks like a database page holding
1867 ** no entries.
1868 */
1869 static void zeroPage(MemPage *pPage, int flags){
1870   unsigned char *data = pPage->aData;
1871   BtShared *pBt = pPage->pBt;
1872   u8 hdr = pPage->hdrOffset;
1873   u16 first;
1874 
1875   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1876   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1877   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1878   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1879   assert( sqlite3_mutex_held(pBt->mutex) );
1880   if( pBt->btsFlags & BTS_SECURE_DELETE ){
1881     memset(&data[hdr], 0, pBt->usableSize - hdr);
1882   }
1883   data[hdr] = (char)flags;
1884   first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
1885   memset(&data[hdr+1], 0, 4);
1886   data[hdr+7] = 0;
1887   put2byte(&data[hdr+5], pBt->usableSize);
1888   pPage->nFree = (u16)(pBt->usableSize - first);
1889   decodeFlags(pPage, flags);
1890   pPage->cellOffset = first;
1891   pPage->aDataEnd = &data[pBt->usableSize];
1892   pPage->aCellIdx = &data[first];
1893   pPage->aDataOfst = &data[pPage->childPtrSize];
1894   pPage->nOverflow = 0;
1895   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1896   pPage->maskPage = (u16)(pBt->pageSize - 1);
1897   pPage->nCell = 0;
1898   pPage->isInit = 1;
1899 }
1900 
1901 
1902 /*
1903 ** Convert a DbPage obtained from the pager into a MemPage used by
1904 ** the btree layer.
1905 */
1906 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1907   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1908   if( pgno!=pPage->pgno ){
1909     pPage->aData = sqlite3PagerGetData(pDbPage);
1910     pPage->pDbPage = pDbPage;
1911     pPage->pBt = pBt;
1912     pPage->pgno = pgno;
1913     pPage->hdrOffset = pgno==1 ? 100 : 0;
1914   }
1915   assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
1916   return pPage;
1917 }
1918 
1919 /*
1920 ** Get a page from the pager.  Initialize the MemPage.pBt and
1921 ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
1922 **
1923 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
1924 ** about the content of the page at this time.  So do not go to the disk
1925 ** to fetch the content.  Just fill in the content with zeros for now.
1926 ** If in the future we call sqlite3PagerWrite() on this page, that
1927 ** means we have started to be concerned about content and the disk
1928 ** read should occur at that point.
1929 */
1930 static int btreeGetPage(
1931   BtShared *pBt,       /* The btree */
1932   Pgno pgno,           /* Number of the page to fetch */
1933   MemPage **ppPage,    /* Return the page in this parameter */
1934   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
1935 ){
1936   int rc;
1937   DbPage *pDbPage;
1938 
1939   assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
1940   assert( sqlite3_mutex_held(pBt->mutex) );
1941   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
1942   if( rc ) return rc;
1943   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1944   return SQLITE_OK;
1945 }
1946 
1947 /*
1948 ** Retrieve a page from the pager cache. If the requested page is not
1949 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1950 ** MemPage.aData elements if needed.
1951 */
1952 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1953   DbPage *pDbPage;
1954   assert( sqlite3_mutex_held(pBt->mutex) );
1955   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1956   if( pDbPage ){
1957     return btreePageFromDbPage(pDbPage, pgno, pBt);
1958   }
1959   return 0;
1960 }
1961 
1962 /*
1963 ** Return the size of the database file in pages. If there is any kind of
1964 ** error, return ((unsigned int)-1).
1965 */
1966 static Pgno btreePagecount(BtShared *pBt){
1967   return pBt->nPage;
1968 }
1969 u32 sqlite3BtreeLastPage(Btree *p){
1970   assert( sqlite3BtreeHoldsMutex(p) );
1971   assert( ((p->pBt->nPage)&0x8000000)==0 );
1972   return btreePagecount(p->pBt);
1973 }
1974 
1975 /*
1976 ** Get a page from the pager and initialize it.
1977 **
1978 ** If pCur!=0 then the page is being fetched as part of a moveToChild()
1979 ** call.  Do additional sanity checking on the page in this case.
1980 ** And if the fetch fails, this routine must decrement pCur->iPage.
1981 **
1982 ** The page is fetched as read-write unless pCur is not NULL and is
1983 ** a read-only cursor.
1984 **
1985 ** If an error occurs, then *ppPage is undefined. It
1986 ** may remain unchanged, or it may be set to an invalid value.
1987 */
1988 static int getAndInitPage(
1989   BtShared *pBt,                  /* The database file */
1990   Pgno pgno,                      /* Number of the page to get */
1991   MemPage **ppPage,               /* Write the page pointer here */
1992   BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
1993   int bReadOnly                   /* True for a read-only page */
1994 ){
1995   int rc;
1996   DbPage *pDbPage;
1997   assert( sqlite3_mutex_held(pBt->mutex) );
1998   assert( pCur==0 || ppPage==&pCur->apPage[pCur->iPage] );
1999   assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
2000   assert( pCur==0 || pCur->iPage>0 );
2001 
2002   if( pgno>btreePagecount(pBt) ){
2003     rc = SQLITE_CORRUPT_BKPT;
2004     goto getAndInitPage_error;
2005   }
2006   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
2007   if( rc ){
2008     goto getAndInitPage_error;
2009   }
2010   *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2011   if( (*ppPage)->isInit==0 ){
2012     btreePageFromDbPage(pDbPage, pgno, pBt);
2013     rc = btreeInitPage(*ppPage);
2014     if( rc!=SQLITE_OK ){
2015       releasePage(*ppPage);
2016       goto getAndInitPage_error;
2017     }
2018   }
2019   assert( (*ppPage)->pgno==pgno );
2020   assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
2021 
2022   /* If obtaining a child page for a cursor, we must verify that the page is
2023   ** compatible with the root page. */
2024   if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
2025     rc = SQLITE_CORRUPT_BKPT;
2026     releasePage(*ppPage);
2027     goto getAndInitPage_error;
2028   }
2029   return SQLITE_OK;
2030 
2031 getAndInitPage_error:
2032   if( pCur ) pCur->iPage--;
2033   testcase( pgno==0 );
2034   assert( pgno!=0 || rc==SQLITE_CORRUPT );
2035   return rc;
2036 }
2037 
2038 /*
2039 ** Release a MemPage.  This should be called once for each prior
2040 ** call to btreeGetPage.
2041 */
2042 static void releasePageNotNull(MemPage *pPage){
2043   assert( pPage->aData );
2044   assert( pPage->pBt );
2045   assert( pPage->pDbPage!=0 );
2046   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2047   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2048   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2049   sqlite3PagerUnrefNotNull(pPage->pDbPage);
2050 }
2051 static void releasePage(MemPage *pPage){
2052   if( pPage ) releasePageNotNull(pPage);
2053 }
2054 
2055 /*
2056 ** Get an unused page.
2057 **
2058 ** This works just like btreeGetPage() with the addition:
2059 **
2060 **   *  If the page is already in use for some other purpose, immediately
2061 **      release it and return an SQLITE_CURRUPT error.
2062 **   *  Make sure the isInit flag is clear
2063 */
2064 static int btreeGetUnusedPage(
2065   BtShared *pBt,       /* The btree */
2066   Pgno pgno,           /* Number of the page to fetch */
2067   MemPage **ppPage,    /* Return the page in this parameter */
2068   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2069 ){
2070   int rc = btreeGetPage(pBt, pgno, ppPage, flags);
2071   if( rc==SQLITE_OK ){
2072     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
2073       releasePage(*ppPage);
2074       *ppPage = 0;
2075       return SQLITE_CORRUPT_BKPT;
2076     }
2077     (*ppPage)->isInit = 0;
2078   }else{
2079     *ppPage = 0;
2080   }
2081   return rc;
2082 }
2083 
2084 
2085 /*
2086 ** During a rollback, when the pager reloads information into the cache
2087 ** so that the cache is restored to its original state at the start of
2088 ** the transaction, for each page restored this routine is called.
2089 **
2090 ** This routine needs to reset the extra data section at the end of the
2091 ** page to agree with the restored data.
2092 */
2093 static void pageReinit(DbPage *pData){
2094   MemPage *pPage;
2095   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
2096   assert( sqlite3PagerPageRefcount(pData)>0 );
2097   if( pPage->isInit ){
2098     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2099     pPage->isInit = 0;
2100     if( sqlite3PagerPageRefcount(pData)>1 ){
2101       /* pPage might not be a btree page;  it might be an overflow page
2102       ** or ptrmap page or a free page.  In those cases, the following
2103       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
2104       ** But no harm is done by this.  And it is very important that
2105       ** btreeInitPage() be called on every btree page so we make
2106       ** the call for every page that comes in for re-initing. */
2107       btreeInitPage(pPage);
2108     }
2109   }
2110 }
2111 
2112 /*
2113 ** Invoke the busy handler for a btree.
2114 */
2115 static int btreeInvokeBusyHandler(void *pArg){
2116   BtShared *pBt = (BtShared*)pArg;
2117   assert( pBt->db );
2118   assert( sqlite3_mutex_held(pBt->db->mutex) );
2119   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
2120 }
2121 
2122 /*
2123 ** Open a database file.
2124 **
2125 ** zFilename is the name of the database file.  If zFilename is NULL
2126 ** then an ephemeral database is created.  The ephemeral database might
2127 ** be exclusively in memory, or it might use a disk-based memory cache.
2128 ** Either way, the ephemeral database will be automatically deleted
2129 ** when sqlite3BtreeClose() is called.
2130 **
2131 ** If zFilename is ":memory:" then an in-memory database is created
2132 ** that is automatically destroyed when it is closed.
2133 **
2134 ** The "flags" parameter is a bitmask that might contain bits like
2135 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
2136 **
2137 ** If the database is already opened in the same database connection
2138 ** and we are in shared cache mode, then the open will fail with an
2139 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
2140 ** objects in the same database connection since doing so will lead
2141 ** to problems with locking.
2142 */
2143 int sqlite3BtreeOpen(
2144   sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
2145   const char *zFilename,  /* Name of the file containing the BTree database */
2146   sqlite3 *db,            /* Associated database handle */
2147   Btree **ppBtree,        /* Pointer to new Btree object written here */
2148   int flags,              /* Options */
2149   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
2150 ){
2151   BtShared *pBt = 0;             /* Shared part of btree structure */
2152   Btree *p;                      /* Handle to return */
2153   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
2154   int rc = SQLITE_OK;            /* Result code from this function */
2155   u8 nReserve;                   /* Byte of unused space on each page */
2156   unsigned char zDbHeader[100];  /* Database header content */
2157
2158   /* True if opening an ephemeral, temporary database */
2159   const int isTempDb = zFilename==0 || zFilename[0]==0;
2160
2161   /* Set the variable isMemdb to true for an in-memory database, or
2162   ** false for a file-based database.
2163   */
2164 #ifdef SQLITE_OMIT_MEMORYDB
2165   const int isMemdb = 0;
2166 #else
2167   const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
2168                        || (isTempDb && sqlite3TempInMemory(db))
2169                        || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
2170 #endif
2171
2172   assert( db!=0 );
2173   assert( pVfs!=0 );
2174   assert( sqlite3_mutex_held(db->mutex) );
2175   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
2176
2177   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
2178   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
2179
2180   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
2181   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
2182
       /* An in-memory database is always opened with BTREE_MEMORY set */
2183   if( isMemdb ){
2184     flags |= BTREE_MEMORY;
2185   }
       /* A main database that is in-memory or temporary is opened with
       ** SQLITE_OPEN_TEMP_DB in place of SQLITE_OPEN_MAIN_DB. */
2186   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
2187     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
2188   }
       /* Allocate the Btree handle that is handed back through *ppBtree.
       ** The early error returns inside the shared-cache block below free it
       ** directly; later failures are cleaned up at btree_open_out. */
2189   p = sqlite3MallocZero(sizeof(Btree));
2190   if( !p ){
2191     return SQLITE_NOMEM_BKPT;
2192   }
2193   p->inTrans = TRANS_NONE;
2194   p->db = db;
2195 #ifndef SQLITE_OMIT_SHARED_CACHE
2196   p->lock.pBtree = p;
2197   p->lock.iTable = 1;
2198 #endif
2199
2200 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2201   /*
2202   ** If this Btree is a candidate for shared cache, try to find an
2203   ** existing BtShared object that we can share with
2204   */
2205   if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
2206     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
2207       int nFilename = sqlite3Strlen30(zFilename)+1;
2208       int nFullPathname = pVfs->mxPathname+1;
2209       char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
2210       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2211
2212       p->sharable = 1;
2213       if( !zFullPathname ){
2214         sqlite3_free(p);
2215         return SQLITE_NOMEM_BKPT;
2216       }
           /* For an in-memory database the name is used as given; otherwise
           ** the VFS resolves it to a full pathname for the comparison below. */
2217       if( isMemdb ){
2218         memcpy(zFullPathname, zFilename, nFilename);
2219       }else{
2220         rc = sqlite3OsFullPathname(pVfs, zFilename,
2221                                    nFullPathname, zFullPathname);
2222         if( rc ){
2223           sqlite3_free(zFullPathname);
2224           sqlite3_free(p);
2225           return rc;
2226         }
2227       }
           /* mutexOpen stays held until the bottom of this routine (or until
           ** the SQLITE_CONSTRAINT return below).  mutexShared is only held
           ** while the shared-cache list is being searched. */
2228 #if SQLITE_THREADSAFE
2229       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
2230       sqlite3_mutex_enter(mutexOpen);
2231       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
2232       sqlite3_mutex_enter(mutexShared);
2233 #endif
2234       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
2235         assert( pBt->nRef>0 );
2236         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
2237                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
2238           int iDb;
               /* Refuse to open the same BtShared twice on one connection */
2239           for(iDb=db->nDb-1; iDb>=0; iDb--){
2240             Btree *pExisting = db->aDb[iDb].pBt;
2241             if( pExisting && pExisting->pBt==pBt ){
2242               sqlite3_mutex_leave(mutexShared);
2243               sqlite3_mutex_leave(mutexOpen);
2244               sqlite3_free(zFullPathname);
2245               sqlite3_free(p);
2246               return SQLITE_CONSTRAINT;
2247             }
2248           }
               /* Same file and VFS, not yet used by this connection: take a
               ** reference and share the existing BtShared. */
2249           p->pBt = pBt;
2250           pBt->nRef++;
2251           break;
2252         }
2253       }
2254       sqlite3_mutex_leave(mutexShared);
2255       sqlite3_free(zFullPathname);
2256     }
2257 #ifdef SQLITE_DEBUG
2258     else{
2259       /* In debug mode, we mark all persistent databases as sharable
2260       ** even when they are not.  This exercises the locking code and
2261       ** gives more opportunity for asserts(sqlite3_mutex_held())
2262       ** statements to find locking problems.
2263       */
2264       p->sharable = 1;
2265     }
2266 #endif
2267   }
2268 #endif
       /* No existing BtShared was found (or sharing does not apply), so a
       ** new BtShared and pager are created here. */
2269   if( pBt==0 ){
2270     /*
2271     ** The following asserts make sure that structures used by the btree are
2272     ** the right size.  This is to guard against size changes that result
2273     ** when compiling on a different architecture.
2274     */
2275     assert( sizeof(i64)==8 );
2276     assert( sizeof(u64)==8 );
2277     assert( sizeof(u32)==4 );
2278     assert( sizeof(u16)==2 );
2279     assert( sizeof(Pgno)==4 );
2280
2281     pBt = sqlite3MallocZero( sizeof(*pBt) );
2282     if( pBt==0 ){
2283       rc = SQLITE_NOMEM_BKPT;
2284       goto btree_open_out;
2285     }
2286     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
2287                           sizeof(MemPage), flags, vfsFlags, pageReinit);
2288     if( rc==SQLITE_OK ){
2289       sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
2290       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
2291     }
2292     if( rc!=SQLITE_OK ){
2293       goto btree_open_out;
2294     }
2295     pBt->openFlags = (u8)flags;
2296     pBt->db = db;
2297     sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
2298     p->pBt = pBt;
2299
2300     pBt->pCursor = 0;
2301     pBt->pPage1 = 0;
2302     if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
2303 #ifdef SQLITE_SECURE_DELETE
2304     pBt->btsFlags |= BTS_SECURE_DELETE;
2305 #endif
2306     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2307     ** determined by the 2-byte integer located at an offset of 16 bytes from
2308     ** the beginning of the database file. */
         /* Header byte 16 supplies bits 8..15 of the page size and byte 17
         ** supplies bits 16..23, so the byte pair 0x00 0x01 decodes as 65536. */
2309     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
2310     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
2311          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
           /* The header does not hold a usable page size, that is, a power of
           ** two between 512 and SQLITE_MAX_PAGE_SIZE.
           ** NOTE(review): a page size of 0 is presumably a request for the
           ** pager's default - confirm against sqlite3PagerSetPagesize(). */
2312       pBt->pageSize = 0;
2313 #ifndef SQLITE_OMIT_AUTOVACUUM
2314       /* If the magic name ":memory:" will create an in-memory database, then
2315       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
2316       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
2317       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
2318       ** regular file-name. In this case the auto-vacuum applies as per normal.
2319       */
2320       if( zFilename && !isMemdb ){
2321         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
2322         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
2323       }
2324 #endif
2325       nReserve = 0;
2326     }else{
2327       /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2328       ** determined by the one-byte unsigned integer found at an offset of 20
2329       ** into the database file header. */
2330       nReserve = zDbHeader[20];
2331       pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2332 #ifndef SQLITE_OMIT_AUTOVACUUM
           /* Both flags come from 4-byte header fields, at byte offsets
           ** 36+4*4 and 36+7*4; any non-zero value turns the flag on. */
2333       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
2334       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
2335 #endif
2336     }
2337     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2338     if( rc ) goto btree_open_out;
2339     pBt->usableSize = pBt->pageSize - nReserve;
2340     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
2341
2342 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2343     /* Add the new BtShared object to the linked list sharable BtShareds.
2344     */
2345     pBt->nRef = 1;
2346     if( p->sharable ){
2347       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2348       MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
2349       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
2350         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
2351         if( pBt->mutex==0 ){
2352           rc = SQLITE_NOMEM_BKPT;
2353           goto btree_open_out;
2354         }
2355       }
2356       sqlite3_mutex_enter(mutexShared);
2357       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
2358       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
2359       sqlite3_mutex_leave(mutexShared);
2360     }
2361 #endif
2362   }
2363
2364 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2365   /* If the new Btree uses a sharable pBtShared, then link the new
2366   ** Btree into the list of all sharable Btrees for the same connection.
2367   ** The list is kept in ascending order by pBt address.
2368   */
2369   if( p->sharable ){
2370     int i;
2371     Btree *pSib;
2372     for(i=0; i<db->nDb; i++){
2373       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
             /* Rewind to the head of the list, then insert p either at the
             ** head or after the last entry whose pBt address is lower. */
2374         while( pSib->pPrev ){ pSib = pSib->pPrev; }
2375         if( (uptr)p->pBt<(uptr)pSib->pBt ){
2376           p->pNext = pSib;
2377           p->pPrev = 0;
2378           pSib->pPrev = p;
2379         }else{
2380           while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
2381             pSib = pSib->pNext;
2382           }
2383           p->pNext = pSib->pNext;
2384           p->pPrev = pSib;
2385           if( p->pNext ){
2386             p->pNext->pPrev = p;
2387           }
2388           pSib->pNext = p;
2389         }
2390         break;
2391       }
2392     }
2393   }
2394 #endif
2395   *ppBtree = p;
2396
2397 btree_open_out:
       /* On failure: close the pager if one was opened, release both
       ** allocations and report no handle to the caller. */
2398   if( rc!=SQLITE_OK ){
2399     if( pBt && pBt->pPager ){
2400       sqlite3PagerClose(pBt->pPager, 0);
2401     }
2402     sqlite3_free(pBt);
2403     sqlite3_free(p);
2404     *ppBtree = 0;
2405   }else{
2406     sqlite3_file *pFile;
2407
2408     /* If the B-Tree was successfully opened, set the pager-cache size to the
2409     ** default value. Except, when opening on an existing shared pager-cache,
2410     ** do not change the pager-cache size.
2411     */
2412     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
2413       sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
2414     }
2415
         /* Pass the address of pBt->db to the file via the SQLITE_FCNTL_PDB
         ** hint, but only when the file has a methods table. */
2416     pFile = sqlite3PagerFile(pBt->pPager);
2417     if( pFile->pMethods ){
2418       sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
2419     }
2420   }
       /* mutexOpen is only non-NULL if it was entered in the shared-cache
       ** search above; release it on every path that reaches this point. */
2421   if( mutexOpen ){
2422     assert( sqlite3_mutex_held(mutexOpen) );
2423     sqlite3_mutex_leave(mutexOpen);
2424   }
2425   assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
2426   return rc;
2427 }
2428 
2429 /*
2430 ** Decrement the BtShared.nRef counter.  When it reaches zero,
2431 ** remove the BtShared structure from the sharing list.  Return
2432 ** true if the BtShared.nRef counter reaches zero and return
2433 ** false if it is still positive.
2434 */
2435 static int removeFromSharingList(BtShared *pBt){
2436 #ifndef SQLITE_OMIT_SHARED_CACHE
2437   MUTEX_LOGIC( sqlite3_mutex *pMaster; )
2438   BtShared *pList;
2439   int removed = 0;
2440 
2441   assert( sqlite3_mutex_notheld(pBt->mutex) );
2442   MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
2443   sqlite3_mutex_enter(pMaster);
2444   pBt->nRef--;
2445   if( pBt->nRef<=0 ){
2446     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
2447       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
2448     }else{
2449       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
2450       while( ALWAYS(pList) && pList->pNext!=pBt ){
2451         pList=pList->pNext;
2452       }
2453       if( ALWAYS(pList) ){
2454         pList->pNext = pBt->pNext;
2455       }
2456     }
2457     if( SQLITE_THREADSAFE ){
2458       sqlite3_mutex_free(pBt->mutex);
2459     }
2460     removed = 1;
2461   }
2462   sqlite3_mutex_leave(pMaster);
2463   return removed;
2464 #else
2465   return 1;
2466 #endif
2467 }
2468 
2469 /*
2470 ** Make sure pBt->pTmpSpace points to an allocation of
2471 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
2472 ** pointer.
2473 */
2474 static void allocateTempSpace(BtShared *pBt){
2475   if( !pBt->pTmpSpace ){
2476     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2477 
2478     /* One of the uses of pBt->pTmpSpace is to format cells before
2479     ** inserting them into a leaf page (function fillInCell()). If
2480     ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
2481     ** by the various routines that manipulate binary cells. Which
2482     ** can mean that fillInCell() only initializes the first 2 or 3
2483     ** bytes of pTmpSpace, but that the first 4 bytes are copied from
2484     ** it into a database page. This is not actually a problem, but it
2485     ** does cause a valgrind error when the 1 or 2 bytes of unitialized
2486     ** data is passed to system call write(). So to avoid this error,
2487     ** zero the first 4 bytes of temp space here.
2488     **
2489     ** Also:  Provide four bytes of initialized space before the
2490     ** beginning of pTmpSpace as an area available to prepend the
2491     ** left-child pointer to the beginning of a cell.
2492     */
2493     if( pBt->pTmpSpace ){
2494       memset(pBt->pTmpSpace, 0, 8);
2495       pBt->pTmpSpace += 4;
2496     }
2497   }
2498 }
2499 
2500 /*
2501 ** Free the pBt->pTmpSpace allocation
2502 */
2503 static void freeTempSpace(BtShared *pBt){
2504   if( pBt->pTmpSpace ){
2505     pBt->pTmpSpace -= 4;
2506     sqlite3PageFree(pBt->pTmpSpace);
2507     pBt->pTmpSpace = 0;
2508   }
2509 }
2510 
2511 /*
2512 ** Close an open database and invalidate all cursors.
2513 */
2514 int sqlite3BtreeClose(Btree *p){
2515   BtShared *pBt = p->pBt;
2516   BtCursor *pCur;
2517 
2518   /* Close all cursors opened via this handle.  */
2519   assert( sqlite3_mutex_held(p->db->mutex) );
2520   sqlite3BtreeEnter(p);
2521   pCur = pBt->pCursor;
2522   while( pCur ){
2523     BtCursor *pTmp = pCur;
2524     pCur = pCur->pNext;
2525     if( pTmp->pBtree==p ){
2526       sqlite3BtreeCloseCursor(pTmp);
2527     }
2528   }
2529 
2530   /* Rollback any active transaction and free the handle structure.
2531   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2532   ** this handle.
2533   */
2534   sqlite3BtreeRollback(p, SQLITE_OK, 0);
2535   sqlite3BtreeLeave(p);
2536 
2537   /* If there are still other outstanding references to the shared-btree
2538   ** structure, return now. The remainder of this procedure cleans
2539   ** up the shared-btree.
2540   */
2541   assert( p->wantToLock==0 && p->locked==0 );
2542   if( !p->sharable || removeFromSharingList(pBt) ){
2543     /* The pBt is no longer on the sharing list, so we can access
2544     ** it without having to hold the mutex.
2545     **
2546     ** Clean out and delete the BtShared object.
2547     */
2548     assert( !pBt->pCursor );
2549     sqlite3PagerClose(pBt->pPager, p->db);
2550     if( pBt->xFreeSchema && pBt->pSchema ){
2551       pBt->xFreeSchema(pBt->pSchema);
2552     }
2553     sqlite3DbFree(0, pBt->pSchema);
2554     freeTempSpace(pBt);
2555     sqlite3_free(pBt);
2556   }
2557 
2558 #ifndef SQLITE_OMIT_SHARED_CACHE
2559   assert( p->wantToLock==0 );
2560   assert( p->locked==0 );
2561   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2562   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2563 #endif
2564 
2565   sqlite3_free(p);
2566   return SQLITE_OK;
2567 }
2568 
2569 /*
2570 ** Change the "soft" limit on the number of pages in the cache.
2571 ** Unused and unmodified pages will be recycled when the number of
2572 ** pages in the cache exceeds this soft limit.  But the size of the
2573 ** cache is allowed to grow larger than this limit if it contains
2574 ** dirty pages or pages still in active use.
2575 */
2576 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2577   BtShared *pBt = p->pBt;
2578   assert( sqlite3_mutex_held(p->db->mutex) );
2579   sqlite3BtreeEnter(p);
2580   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2581   sqlite3BtreeLeave(p);
2582   return SQLITE_OK;
2583 }
2584 
2585 /*
2586 ** Change the "spill" limit on the number of pages in the cache.
2587 ** If the number of pages exceeds this limit during a write transaction,
2588 ** the pager might attempt to "spill" pages to the journal early in
2589 ** order to free up memory.
2590 **
2591 ** The value returned is the current spill size.  If zero is passed
2592 ** as an argument, no changes are made to the spill size setting, so
2593 ** using mxPage of 0 is a way to query the current spill size.
2594 */
2595 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
2596   BtShared *pBt = p->pBt;
2597   int res;
2598   assert( sqlite3_mutex_held(p->db->mutex) );
2599   sqlite3BtreeEnter(p);
2600   res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
2601   sqlite3BtreeLeave(p);
2602   return res;
2603 }
2604 
2605 #if SQLITE_MAX_MMAP_SIZE>0
2606 /*
2607 ** Change the limit on the amount of the database file that may be
2608 ** memory mapped.
2609 */
2610 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
2611   BtShared *pBt = p->pBt;
2612   assert( sqlite3_mutex_held(p->db->mutex) );
2613   sqlite3BtreeEnter(p);
2614   sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);
2615   sqlite3BtreeLeave(p);
2616   return SQLITE_OK;
2617 }
2618 #endif /* SQLITE_MAX_MMAP_SIZE>0 */
2619 
2620 /*
2621 ** Change the way data is synced to disk in order to increase or decrease
2622 ** how well the database resists damage due to OS crashes and power
2623 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2624 ** there is a high probability of damage)  Level 2 is the default.  There
2625 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2626 ** probability of damage to near zero but with a write performance reduction.
2627 */
2628 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2629 int sqlite3BtreeSetPagerFlags(
2630   Btree *p,              /* The btree to set the safety level on */
2631   unsigned pgFlags       /* Various PAGER_* flags */
2632 ){
2633   BtShared *pBt = p->pBt;
2634   assert( sqlite3_mutex_held(p->db->mutex) );
2635   sqlite3BtreeEnter(p);
2636   sqlite3PagerSetFlags(pBt->pPager, pgFlags);
2637   sqlite3BtreeLeave(p);
2638   return SQLITE_OK;
2639 }
2640 #endif
2641 
2642 /*
2643 ** Change the default pages size and the number of reserved bytes per page.
2644 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2645 ** without changing anything.
2646 **
2647 ** The page size must be a power of 2 between 512 and 65536.  If the page
2648 ** size supplied does not meet this constraint then the page size is not
2649 ** changed.
2650 **
2651 ** Page sizes are constrained to be a power of two so that the region
2652 ** of the database file used for locking (beginning at PENDING_BYTE,
2653 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2654 ** at the beginning of a page.
2655 **
2656 ** If parameter nReserve is less than zero, then the number of reserved
2657 ** bytes per page is left unchanged.
2658 **
2659 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2660 ** and autovacuum mode can no longer be changed.
2661 */
2662 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2663   int rc = SQLITE_OK;
2664   BtShared *pBt = p->pBt;
2665   assert( nReserve>=-1 && nReserve<=255 );
2666   sqlite3BtreeEnter(p);
2667 #if SQLITE_HAS_CODEC
2668   if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;
2669 #endif
2670   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2671     sqlite3BtreeLeave(p);
2672     return SQLITE_READONLY;
2673   }
2674   if( nReserve<0 ){
2675     nReserve = pBt->pageSize - pBt->usableSize;
2676   }
2677   assert( nReserve>=0 && nReserve<=255 );
2678   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2679         ((pageSize-1)&pageSize)==0 ){
2680     assert( (pageSize & 7)==0 );
2681     assert( !pBt->pCursor );
2682     pBt->pageSize = (u32)pageSize;
2683     freeTempSpace(pBt);
2684   }
2685   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2686   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2687   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2688   sqlite3BtreeLeave(p);
2689   return rc;
2690 }
2691 
2692 /*
2693 ** Return the currently defined page size
2694 */
2695 int sqlite3BtreeGetPageSize(Btree *p){
2696   return p->pBt->pageSize;
2697 }
2698 
2699 /*
2700 ** This function is similar to sqlite3BtreeGetReserve(), except that it
2701 ** may only be called if it is guaranteed that the b-tree mutex is already
2702 ** held.
2703 **
2704 ** This is useful in one special case in the backup API code where it is
2705 ** known that the shared b-tree mutex is held, but the mutex on the
2706 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
2707 ** were to be called, it might collide with some other operation on the
2708 ** database handle that owns *p, causing undefined behavior.
2709 */
2710 int sqlite3BtreeGetReserveNoMutex(Btree *p){
2711   int n;
2712   assert( sqlite3_mutex_held(p->pBt->mutex) );
2713   n = p->pBt->pageSize - p->pBt->usableSize;
2714   return n;
2715 }
2716 
2717 /*
2718 ** Return the number of bytes of space at the end of every page that
2719 ** are intentually left unused.  This is the "reserved" space that is
2720 ** sometimes used by extensions.
2721 **
2722 ** If SQLITE_HAS_MUTEX is defined then the number returned is the
2723 ** greater of the current reserved space and the maximum requested
2724 ** reserve space.
2725 */
2726 int sqlite3BtreeGetOptimalReserve(Btree *p){
2727   int n;
2728   sqlite3BtreeEnter(p);
2729   n = sqlite3BtreeGetReserveNoMutex(p);
2730 #ifdef SQLITE_HAS_CODEC
2731   if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;
2732 #endif
2733   sqlite3BtreeLeave(p);
2734   return n;
2735 }
2736 
2737 
2738 /*
2739 ** Set the maximum page count for a database if mxPage is positive.
2740 ** No changes are made if mxPage is 0 or negative.
2741 ** Regardless of the value of mxPage, return the maximum page count.
2742 */
2743 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2744   int n;
2745   sqlite3BtreeEnter(p);
2746   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2747   sqlite3BtreeLeave(p);
2748   return n;
2749 }
2750 
2751 /*
2752 ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1.  If newFlag is -1,
2753 ** then make no changes.  Always return the value of the BTS_SECURE_DELETE
2754 ** setting after the change.
2755 */
2756 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2757   int b;
2758   if( p==0 ) return 0;
2759   sqlite3BtreeEnter(p);
2760   if( newFlag>=0 ){
2761     p->pBt->btsFlags &= ~BTS_SECURE_DELETE;
2762     if( newFlag ) p->pBt->btsFlags |= BTS_SECURE_DELETE;
2763   }
2764   b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;
2765   sqlite3BtreeLeave(p);
2766   return b;
2767 }
2768 
2769 /*
2770 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2771 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2772 ** is disabled. The default value for the auto-vacuum property is
2773 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2774 */
2775 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2776 #ifdef SQLITE_OMIT_AUTOVACUUM
2777   return SQLITE_READONLY;
2778 #else
2779   BtShared *pBt = p->pBt;
2780   int rc = SQLITE_OK;
2781   u8 av = (u8)autoVacuum;
2782 
2783   sqlite3BtreeEnter(p);
2784   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
2785     rc = SQLITE_READONLY;
2786   }else{
2787     pBt->autoVacuum = av ?1:0;
2788     pBt->incrVacuum = av==2 ?1:0;
2789   }
2790   sqlite3BtreeLeave(p);
2791   return rc;
2792 #endif
2793 }
2794 
2795 /*
2796 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2797 ** enabled 1 is returned. Otherwise 0.
2798 */
2799 int sqlite3BtreeGetAutoVacuum(Btree *p){
2800 #ifdef SQLITE_OMIT_AUTOVACUUM
2801   return BTREE_AUTOVACUUM_NONE;
2802 #else
2803   int rc;
2804   sqlite3BtreeEnter(p);
2805   rc = (
2806     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2807     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2808     BTREE_AUTOVACUUM_INCR
2809   );
2810   sqlite3BtreeLeave(p);
2811   return rc;
2812 #endif
2813 }
2814 
2815 
2816 /*
2817 ** Get a reference to pPage1 of the database file.  This will
2818 ** also acquire a readlock on that file.
2819 **
2820 ** SQLITE_OK is returned on success.  If the file is not a
2821 ** well-formed database file, then SQLITE_CORRUPT is returned.
2822 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
2823 ** is returned if we run out of memory.
2824 */
2825 static int lockBtree(BtShared *pBt){
2826   int rc;              /* Result code from subfunctions */
2827   MemPage *pPage1;     /* Page 1 of the database file */
2828   int nPage;           /* Number of pages in the database */
2829   int nPageFile = 0;   /* Number of pages in the database file */
2830   int nPageHeader;     /* Number of pages in the database according to hdr */
2831 
2832   assert( sqlite3_mutex_held(pBt->mutex) );
2833   assert( pBt->pPage1==0 );
2834   rc = sqlite3PagerSharedLock(pBt->pPager);
2835   if( rc!=SQLITE_OK ) return rc;
2836   rc = btreeGetPage(pBt, 1, &pPage1, 0);
2837   if( rc!=SQLITE_OK ) return rc;
2838 
2839   /* Do some checking to help insure the file we opened really is
2840   ** a valid database file.
2841   */
2842   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
2843   sqlite3PagerPagecount(pBt->pPager, &nPageFile);
2844   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
2845     nPage = nPageFile;
2846   }
2847   if( nPage>0 ){
2848     u32 pageSize;
2849     u32 usableSize;
2850     u8 *page1 = pPage1->aData;
2851     rc = SQLITE_NOTADB;
2852     /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
2853     ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
2854     ** 61 74 20 33 00. */
2855     if( memcmp(page1, zMagicHeader, 16)!=0 ){
2856       goto page1_init_failed;
2857     }
2858 
2859 #ifdef SQLITE_OMIT_WAL
2860     if( page1[18]>1 ){
2861       pBt->btsFlags |= BTS_READ_ONLY;
2862     }
2863     if( page1[19]>1 ){
2864       goto page1_init_failed;
2865     }
2866 #else
2867     if( page1[18]>2 ){
2868       pBt->btsFlags |= BTS_READ_ONLY;
2869     }
2870     if( page1[19]>2 ){
2871       goto page1_init_failed;
2872     }
2873 
2874     /* If the write version is set to 2, this database should be accessed
2875     ** in WAL mode. If the log is not already open, open it now. Then
2876     ** return SQLITE_OK and return without populating BtShared.pPage1.
2877     ** The caller detects this and calls this function again. This is
2878     ** required as the version of page 1 currently in the page1 buffer
2879     ** may not be the latest version - there may be a newer one in the log
2880     ** file.
2881     */
2882     if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
2883       int isOpen = 0;
2884       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
2885       if( rc!=SQLITE_OK ){
2886         goto page1_init_failed;
2887       }else{
2888 #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS
2889         sqlite3 *db;
2890         Db *pDb;
2891         if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){
2892           while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; }
2893           if( pDb->bSyncSet==0
2894            && pDb->safety_level==SQLITE_DEFAULT_SYNCHRONOUS+1
2895           ){
2896             pDb->safety_level = SQLITE_DEFAULT_WAL_SYNCHRONOUS+1;
2897             sqlite3PagerSetFlags(pBt->pPager,
2898                pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
2899           }
2900         }
2901 #endif
2902         if( isOpen==0 ){
2903           releasePage(pPage1);
2904           return SQLITE_OK;
2905         }
2906       }
2907       rc = SQLITE_NOTADB;
2908     }
2909 #endif
2910 
2911     /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
2912     ** fractions and the leaf payload fraction values must be 64, 32, and 32.
2913     **
2914     ** The original design allowed these amounts to vary, but as of
2915     ** version 3.6.0, we require them to be fixed.
2916     */
2917     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2918       goto page1_init_failed;
2919     }
2920     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2921     ** determined by the 2-byte integer located at an offset of 16 bytes from
2922     ** the beginning of the database file. */
2923     pageSize = (page1[16]<<8) | (page1[17]<<16);
2924     /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
2925     ** between 512 and 65536 inclusive. */
2926     if( ((pageSize-1)&pageSize)!=0
2927      || pageSize>SQLITE_MAX_PAGE_SIZE
2928      || pageSize<=256
2929     ){
2930       goto page1_init_failed;
2931     }
2932     assert( (pageSize & 7)==0 );
2933     /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
2934     ** integer at offset 20 is the number of bytes of space at the end of
2935     ** each page to reserve for extensions.
2936     **
2937     ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2938     ** determined by the one-byte unsigned integer found at an offset of 20
2939     ** into the database file header. */
2940     usableSize = pageSize - page1[20];
2941     if( (u32)pageSize!=pBt->pageSize ){
2942       /* After reading the first page of the database assuming a page size
2943       ** of BtShared.pageSize, we have discovered that the page-size is
2944       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2945       ** zero and return SQLITE_OK. The caller will call this function
2946       ** again with the correct page-size.
2947       */
2948       releasePage(pPage1);
2949       pBt->usableSize = usableSize;
2950       pBt->pageSize = pageSize;
2951       freeTempSpace(pBt);
2952       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2953                                    pageSize-usableSize);
2954       return rc;
2955     }
2956     if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
2957       rc = SQLITE_CORRUPT_BKPT;
2958       goto page1_init_failed;
2959     }
2960     /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
2961     ** be less than 480. In other words, if the page size is 512, then the
2962     ** reserved space size cannot exceed 32. */
2963     if( usableSize<480 ){
2964       goto page1_init_failed;
2965     }
2966     pBt->pageSize = pageSize;
2967     pBt->usableSize = usableSize;
2968 #ifndef SQLITE_OMIT_AUTOVACUUM
2969     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2970     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2971 #endif
2972   }
2973 
2974   /* maxLocal is the maximum amount of payload to store locally for
2975   ** a cell.  Make sure it is small enough so that at least minFanout
2976   ** cells can will fit on one page.  We assume a 10-byte page header.
2977   ** Besides the payload, the cell must store:
2978   **     2-byte pointer to the cell
2979   **     4-byte child pointer
2980   **     9-byte nKey value
2981   **     4-byte nData value
2982   **     4-byte overflow page pointer
2983   ** So a cell consists of a 2-byte pointer, a header which is as much as
2984   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2985   ** page pointer.
2986   */
2987   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
2988   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
2989   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
2990   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
2991   if( pBt->maxLocal>127 ){
2992     pBt->max1bytePayload = 127;
2993   }else{
2994     pBt->max1bytePayload = (u8)pBt->maxLocal;
2995   }
2996   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2997   pBt->pPage1 = pPage1;
2998   pBt->nPage = nPage;
2999   return SQLITE_OK;
3000 
3001 page1_init_failed:
3002   releasePage(pPage1);
3003   pBt->pPage1 = 0;
3004   return rc;
3005 }
3006 
3007 #ifndef NDEBUG
3008 /*
3009 ** Return the number of cursors open on pBt. This is for use
3010 ** in assert() expressions, so it is only compiled if NDEBUG is not
3011 ** defined.
3012 **
3013 ** Only write cursors are counted if wrOnly is true.  If wrOnly is
3014 ** false then all cursors are counted.
3015 **
3016 ** For the purposes of this routine, a cursor is any cursor that
3017 ** is capable of reading or writing to the database.  Cursors that
3018 ** have been tripped into the CURSOR_FAULT state are not counted.
3019 */
3020 static int countValidCursors(BtShared *pBt, int wrOnly){
3021   BtCursor *pCur;
3022   int r = 0;
3023   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3024     if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
3025      && pCur->eState!=CURSOR_FAULT ) r++;
3026   }
3027   return r;
3028 }
3029 #endif
3030 
3031 /*
3032 ** If there are no outstanding cursors and we are not in the middle
3033 ** of a transaction but there is a read lock on the database, then
3034 ** this routine unrefs the first page of the database file which
3035 ** has the effect of releasing the read lock.
3036 **
3037 ** If there is a transaction in progress, this routine is a no-op.
3038 */
3039 static void unlockBtreeIfUnused(BtShared *pBt){
3040   assert( sqlite3_mutex_held(pBt->mutex) );
3041   assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
3042   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
3043     MemPage *pPage1 = pBt->pPage1;
3044     assert( pPage1->aData );
3045     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
3046     pBt->pPage1 = 0;
3047     releasePageNotNull(pPage1);
3048   }
3049 }
3050 
3051 /*
3052 ** If pBt points to an empty file then convert that empty file
3053 ** into a new empty database by initializing the first page of
3054 ** the database.
3055 */
3056 static int newDatabase(BtShared *pBt){
3057   MemPage *pP1;
3058   unsigned char *data;
3059   int rc;
3060 
3061   assert( sqlite3_mutex_held(pBt->mutex) );
3062   if( pBt->nPage>0 ){
3063     return SQLITE_OK;
3064   }
3065   pP1 = pBt->pPage1;
3066   assert( pP1!=0 );
3067   data = pP1->aData;
3068   rc = sqlite3PagerWrite(pP1->pDbPage);
3069   if( rc ) return rc;
3070   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
3071   assert( sizeof(zMagicHeader)==16 );
3072   data[16] = (u8)((pBt->pageSize>>8)&0xff);
3073   data[17] = (u8)((pBt->pageSize>>16)&0xff);
3074   data[18] = 1;
3075   data[19] = 1;
3076   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
3077   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
3078   data[21] = 64;
3079   data[22] = 32;
3080   data[23] = 32;
3081   memset(&data[24], 0, 100-24);
3082   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
3083   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3084 #ifndef SQLITE_OMIT_AUTOVACUUM
3085   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
3086   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
3087   put4byte(&data[36 + 4*4], pBt->autoVacuum);
3088   put4byte(&data[36 + 7*4], pBt->incrVacuum);
3089 #endif
3090   pBt->nPage = 1;
3091   data[31] = 1;
3092   return SQLITE_OK;
3093 }
3094 
3095 /*
3096 ** Initialize the first page of the database file (creating a database
3097 ** consisting of a single page and no schema objects). Return SQLITE_OK
3098 ** if successful, or an SQLite error code otherwise.
3099 */
3100 int sqlite3BtreeNewDb(Btree *p){
3101   int rc;
3102   sqlite3BtreeEnter(p);
3103   p->pBt->nPage = 0;
3104   rc = newDatabase(p->pBt);
3105   sqlite3BtreeLeave(p);
3106   return rc;
3107 }
3108 
3109 /*
3110 ** Attempt to start a new transaction. A write-transaction
3111 ** is started if the second argument is nonzero, otherwise a read-
3112 ** transaction.  If the second argument is 2 or more and exclusive
3113 ** transaction is started, meaning that no other process is allowed
3114 ** to access the database.  A preexisting transaction may not be
3115 ** upgraded to exclusive by calling this routine a second time - the
3116 ** exclusivity flag only works for a new transaction.
3117 **
3118 ** A write-transaction must be started before attempting any
3119 ** changes to the database.  None of the following routines
3120 ** will work unless a transaction is started first:
3121 **
3122 **      sqlite3BtreeCreateTable()
3123 **      sqlite3BtreeCreateIndex()
3124 **      sqlite3BtreeClearTable()
3125 **      sqlite3BtreeDropTable()
3126 **      sqlite3BtreeInsert()
3127 **      sqlite3BtreeDelete()
3128 **      sqlite3BtreeUpdateMeta()
3129 **
3130 ** If an initial attempt to acquire the lock fails because of lock contention
3131 ** and the database was previously unlocked, then invoke the busy handler
3132 ** if there is one.  But if there was previously a read-lock, do not
3133 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
3134 ** returned when there is already a read-lock in order to avoid a deadlock.
3135 **
3136 ** Suppose there are two processes A and B.  A has a read lock and B has
3137 ** a reserved lock.  B tries to promote to exclusive but is blocked because
3138 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
3139 ** One or the other of the two processes must give way or there can be
3140 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
3141 ** when A already has a read lock, we encourage A to give up and let B
3142 ** proceed.
3143 */
3144 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
3145   BtShared *pBt = p->pBt;
3146   int rc = SQLITE_OK;
3147 
3148   sqlite3BtreeEnter(p);
3149   btreeIntegrity(p);
3150 
3151   /* If the btree is already in a write-transaction, or it
3152   ** is already in a read-transaction and a read-transaction
3153   ** is requested, this is a no-op.
3154   */
3155   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
3156     goto trans_begun;
3157   }
3158   assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );
3159 
3160   /* Write transactions are not possible on a read-only database */
3161   if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
3162     rc = SQLITE_READONLY;
3163     goto trans_begun;
3164   }
3165 
3166 #ifndef SQLITE_OMIT_SHARED_CACHE
3167   {
3168     sqlite3 *pBlock = 0;
3169     /* If another database handle has already opened a write transaction
3170     ** on this shared-btree structure and a second write transaction is
3171     ** requested, return SQLITE_LOCKED.
3172     */
3173     if( (wrflag && pBt->inTransaction==TRANS_WRITE)
3174      || (pBt->btsFlags & BTS_PENDING)!=0
3175     ){
3176       pBlock = pBt->pWriter->db;
3177     }else if( wrflag>1 ){
3178       BtLock *pIter;
3179       for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
3180         if( pIter->pBtree!=p ){
3181           pBlock = pIter->pBtree->db;
3182           break;
3183         }
3184       }
3185     }
3186     if( pBlock ){
3187       sqlite3ConnectionBlocked(p->db, pBlock);
3188       rc = SQLITE_LOCKED_SHAREDCACHE;
3189       goto trans_begun;
3190     }
3191   }
3192 #endif
3193 
3194   /* Any read-only or read-write transaction implies a read-lock on
3195   ** page 1. So if some other shared-cache client already has a write-lock
3196   ** on page 1, the transaction cannot be opened. */
3197   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
3198   if( SQLITE_OK!=rc ) goto trans_begun;
3199 
3200   pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
3201   if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
3202   do {
3203     /* Call lockBtree() until either pBt->pPage1 is populated or
3204     ** lockBtree() returns something other than SQLITE_OK. lockBtree()
3205     ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
3206     ** reading page 1 it discovers that the page-size of the database
3207     ** file is not pBt->pageSize. In this case lockBtree() will update
3208     ** pBt->pageSize to the page-size of the file on disk.
3209     */
3210     while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
3211 
3212     if( rc==SQLITE_OK && wrflag ){
3213       if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
3214         rc = SQLITE_READONLY;
3215       }else{
3216         rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
3217         if( rc==SQLITE_OK ){
3218           rc = newDatabase(pBt);
3219         }
3220       }
3221     }
3222 
3223     if( rc!=SQLITE_OK ){
3224       unlockBtreeIfUnused(pBt);
3225     }
3226   }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
3227           btreeInvokeBusyHandler(pBt) );
3228 
3229   if( rc==SQLITE_OK ){
3230     if( p->inTrans==TRANS_NONE ){
3231       pBt->nTransaction++;
3232 #ifndef SQLITE_OMIT_SHARED_CACHE
3233       if( p->sharable ){
3234         assert( p->lock.pBtree==p && p->lock.iTable==1 );
3235         p->lock.eLock = READ_LOCK;
3236         p->lock.pNext = pBt->pLock;
3237         pBt->pLock = &p->lock;
3238       }
3239 #endif
3240     }
3241     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
3242     if( p->inTrans>pBt->inTransaction ){
3243       pBt->inTransaction = p->inTrans;
3244     }
3245     if( wrflag ){
3246       MemPage *pPage1 = pBt->pPage1;
3247 #ifndef SQLITE_OMIT_SHARED_CACHE
3248       assert( !pBt->pWriter );
3249       pBt->pWriter = p;
3250       pBt->btsFlags &= ~BTS_EXCLUSIVE;
3251       if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
3252 #endif
3253 
3254       /* If the db-size header field is incorrect (as it may be if an old
3255       ** client has been writing the database file), update it now. Doing
3256       ** this sooner rather than later means the database size can safely
3257       ** re-read the database size from page 1 if a savepoint or transaction
3258       ** rollback occurs within the transaction.
3259       */
3260       if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
3261         rc = sqlite3PagerWrite(pPage1->pDbPage);
3262         if( rc==SQLITE_OK ){
3263           put4byte(&pPage1->aData[28], pBt->nPage);
3264         }
3265       }
3266     }
3267   }
3268 
3269 
3270 trans_begun:
3271   if( rc==SQLITE_OK && wrflag ){
3272     /* This call makes sure that the pager has the correct number of
3273     ** open savepoints. If the second parameter is greater than 0 and
3274     ** the sub-journal is not already open, then it will be opened here.
3275     */
3276     rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
3277   }
3278 
3279   btreeIntegrity(p);
3280   sqlite3BtreeLeave(p);
3281   return rc;
3282 }
3283 
3284 #ifndef SQLITE_OMIT_AUTOVACUUM
3285 
3286 /*
3287 ** Set the pointer-map entries for all children of page pPage. Also, if
3288 ** pPage contains cells that point to overflow pages, set the pointer
3289 ** map entries for the overflow pages as well.
3290 */
3291 static int setChildPtrmaps(MemPage *pPage){
3292   int i;                             /* Counter variable */
3293   int nCell;                         /* Number of cells in page pPage */
3294   int rc;                            /* Return code */
3295   BtShared *pBt = pPage->pBt;
3296   Pgno pgno = pPage->pgno;
3297 
3298   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3299   rc = btreeInitPage(pPage);
3300   if( rc!=SQLITE_OK ) return rc;
3301   nCell = pPage->nCell;
3302 
3303   for(i=0; i<nCell; i++){
3304     u8 *pCell = findCell(pPage, i);
3305 
3306     ptrmapPutOvflPtr(pPage, pCell, &rc);
3307 
3308     if( !pPage->leaf ){
3309       Pgno childPgno = get4byte(pCell);
3310       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3311     }
3312   }
3313 
3314   if( !pPage->leaf ){
3315     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3316     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3317   }
3318 
3319   return rc;
3320 }
3321 
3322 /*
3323 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
3324 ** that it points to iTo. Parameter eType describes the type of pointer to
3325 ** be modified, as  follows:
3326 **
3327 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
3328 **                   page of pPage.
3329 **
3330 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
3331 **                   page pointed to by one of the cells on pPage.
3332 **
3333 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
3334 **                   overflow page in the list.
3335 */
3336 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
3337   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3338   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
3339   if( eType==PTRMAP_OVERFLOW2 ){
3340     /* The pointer is always the first 4 bytes of the page in this case.  */
3341     if( get4byte(pPage->aData)!=iFrom ){
3342       return SQLITE_CORRUPT_BKPT;
3343     }
3344     put4byte(pPage->aData, iTo);
3345   }else{
3346     int i;
3347     int nCell;
3348     int rc;
3349 
3350     rc = btreeInitPage(pPage);
3351     if( rc ) return rc;
3352     nCell = pPage->nCell;
3353 
3354     for(i=0; i<nCell; i++){
3355       u8 *pCell = findCell(pPage, i);
3356       if( eType==PTRMAP_OVERFLOW1 ){
3357         CellInfo info;
3358         pPage->xParseCell(pPage, pCell, &info);
3359         if( info.nLocal<info.nPayload ){
3360           if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
3361             return SQLITE_CORRUPT_BKPT;
3362           }
3363           if( iFrom==get4byte(pCell+info.nSize-4) ){
3364             put4byte(pCell+info.nSize-4, iTo);
3365             break;
3366           }
3367         }
3368       }else{
3369         if( get4byte(pCell)==iFrom ){
3370           put4byte(pCell, iTo);
3371           break;
3372         }
3373       }
3374     }
3375 
3376     if( i==nCell ){
3377       if( eType!=PTRMAP_BTREE ||
3378           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
3379         return SQLITE_CORRUPT_BKPT;
3380       }
3381       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
3382     }
3383   }
3384   return SQLITE_OK;
3385 }
3386 
3387 
3388 /*
3389 ** Move the open database page pDbPage to location iFreePage in the
3390 ** database. The pDbPage reference remains valid.
3391 **
3392 ** The isCommit flag indicates that there is no need to remember that
3393 ** the journal needs to be sync()ed before database page pDbPage->pgno
3394 ** can be written to. The caller has already promised not to write to that
3395 ** page.
3396 */
3397 static int relocatePage(
3398   BtShared *pBt,           /* Btree */
3399   MemPage *pDbPage,        /* Open page to move */
3400   u8 eType,                /* Pointer map 'type' entry for pDbPage */
3401   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
3402   Pgno iFreePage,          /* The location to move pDbPage to */
3403   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
3404 ){
3405   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
3406   Pgno iDbPage = pDbPage->pgno;
3407   Pager *pPager = pBt->pPager;
3408   int rc;
3409 
3410   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
3411       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
3412   assert( sqlite3_mutex_held(pBt->mutex) );
3413   assert( pDbPage->pBt==pBt );
3414 
3415   /* Move page iDbPage from its current location to page number iFreePage */
3416   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
3417       iDbPage, iFreePage, iPtrPage, eType));
3418   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
3419   if( rc!=SQLITE_OK ){
3420     return rc;
3421   }
3422   pDbPage->pgno = iFreePage;
3423 
3424   /* If pDbPage was a btree-page, then it may have child pages and/or cells
3425   ** that point to overflow pages. The pointer map entries for all these
3426   ** pages need to be changed.
3427   **
3428   ** If pDbPage is an overflow page, then the first 4 bytes may store a
3429   ** pointer to a subsequent overflow page. If this is the case, then
3430   ** the pointer map needs to be updated for the subsequent overflow page.
3431   */
3432   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
3433     rc = setChildPtrmaps(pDbPage);
3434     if( rc!=SQLITE_OK ){
3435       return rc;
3436     }
3437   }else{
3438     Pgno nextOvfl = get4byte(pDbPage->aData);
3439     if( nextOvfl!=0 ){
3440       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
3441       if( rc!=SQLITE_OK ){
3442         return rc;
3443       }
3444     }
3445   }
3446 
3447   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
3448   ** that it points at iFreePage. Also fix the pointer map entry for
3449   ** iPtrPage.
3450   */
3451   if( eType!=PTRMAP_ROOTPAGE ){
3452     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
3453     if( rc!=SQLITE_OK ){
3454       return rc;
3455     }
3456     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
3457     if( rc!=SQLITE_OK ){
3458       releasePage(pPtrPage);
3459       return rc;
3460     }
3461     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
3462     releasePage(pPtrPage);
3463     if( rc==SQLITE_OK ){
3464       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
3465     }
3466   }
3467   return rc;
3468 }
3469 
3470 /* Forward declaration required by incrVacuumStep(). */
3471 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
3472 
3473 /*
3474 ** Perform a single step of an incremental-vacuum. If successful, return
3475 ** SQLITE_OK. If there is no work to do (and therefore no point in
3476 ** calling this function again), return SQLITE_DONE. Or, if an error
3477 ** occurs, return some other error code.
3478 **
3479 ** More specifically, this function attempts to re-organize the database so
3480 ** that the last page of the file currently in use is no longer in use.
3481 **
3482 ** Parameter nFin is the number of pages that this database would contain
3483 ** were this function called until it returns SQLITE_DONE.
3484 **
3485 ** If the bCommit parameter is non-zero, this function assumes that the
3486 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
3487 ** or an error. bCommit is passed true for an auto-vacuum-on-commit
3488 ** operation, or false for an incremental vacuum.
3489 */
3490 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
3491   Pgno nFreeList;           /* Number of pages still on the free-list */
3492   int rc;
3493 
3494   assert( sqlite3_mutex_held(pBt->mutex) );
3495   assert( iLastPg>nFin );
3496 
3497   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
3498     u8 eType;
3499     Pgno iPtrPage;
3500 
3501     nFreeList = get4byte(&pBt->pPage1->aData[36]);
3502     if( nFreeList==0 ){
3503       return SQLITE_DONE;
3504     }
3505 
3506     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
3507     if( rc!=SQLITE_OK ){
3508       return rc;
3509     }
3510     if( eType==PTRMAP_ROOTPAGE ){
3511       return SQLITE_CORRUPT_BKPT;
3512     }
3513 
3514     if( eType==PTRMAP_FREEPAGE ){
3515       if( bCommit==0 ){
3516         /* Remove the page from the files free-list. This is not required
3517         ** if bCommit is non-zero. In that case, the free-list will be
3518         ** truncated to zero after this function returns, so it doesn't
3519         ** matter if it still contains some garbage entries.
3520         */
3521         Pgno iFreePg;
3522         MemPage *pFreePg;
3523         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
3524         if( rc!=SQLITE_OK ){
3525           return rc;
3526         }
3527         assert( iFreePg==iLastPg );
3528         releasePage(pFreePg);
3529       }
3530     } else {
3531       Pgno iFreePg;             /* Index of free page to move pLastPg to */
3532       MemPage *pLastPg;
3533       u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
3534       Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
3535 
3536       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
3537       if( rc!=SQLITE_OK ){
3538         return rc;
3539       }
3540 
3541       /* If bCommit is zero, this loop runs exactly once and page pLastPg
3542       ** is swapped with the first free page pulled off the free list.
3543       **
3544       ** On the other hand, if bCommit is greater than zero, then keep
3545       ** looping until a free-page located within the first nFin pages
3546       ** of the file is found.
3547       */
3548       if( bCommit==0 ){
3549         eMode = BTALLOC_LE;
3550         iNear = nFin;
3551       }
3552       do {
3553         MemPage *pFreePg;
3554         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
3555         if( rc!=SQLITE_OK ){
3556           releasePage(pLastPg);
3557           return rc;
3558         }
3559         releasePage(pFreePg);
3560       }while( bCommit && iFreePg>nFin );
3561       assert( iFreePg<iLastPg );
3562 
3563       rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
3564       releasePage(pLastPg);
3565       if( rc!=SQLITE_OK ){
3566         return rc;
3567       }
3568     }
3569   }
3570 
3571   if( bCommit==0 ){
3572     do {
3573       iLastPg--;
3574     }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
3575     pBt->bDoTruncate = 1;
3576     pBt->nPage = iLastPg;
3577   }
3578   return SQLITE_OK;
3579 }
3580 
3581 /*
3582 ** The database opened by the first argument is an auto-vacuum database
3583 ** nOrig pages in size containing nFree free pages. Return the expected
3584 ** size of the database in pages following an auto-vacuum operation.
3585 */
3586 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
3587   int nEntry;                     /* Number of entries on one ptrmap page */
3588   Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
3589   Pgno nFin;                      /* Return value */
3590 
3591   nEntry = pBt->usableSize/5;
3592   nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3593   nFin = nOrig - nFree - nPtrmap;
3594   if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3595     nFin--;
3596   }
3597   while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3598     nFin--;
3599   }
3600 
3601   return nFin;
3602 }
3603 
3604 /*
3605 ** A write-transaction must be opened before calling this function.
3606 ** It performs a single unit of work towards an incremental vacuum.
3607 **
3608 ** If the incremental vacuum is finished after this function has run,
3609 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3610 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3611 */
3612 int sqlite3BtreeIncrVacuum(Btree *p){
3613   int rc;
3614   BtShared *pBt = p->pBt;
3615 
3616   sqlite3BtreeEnter(p);
3617   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3618   if( !pBt->autoVacuum ){
3619     rc = SQLITE_DONE;
3620   }else{
3621     Pgno nOrig = btreePagecount(pBt);
3622     Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
3623     Pgno nFin = finalDbSize(pBt, nOrig, nFree);
3624 
3625     if( nOrig<nFin ){
3626       rc = SQLITE_CORRUPT_BKPT;
3627     }else if( nFree>0 ){
3628       rc = saveAllCursors(pBt, 0, 0);
3629       if( rc==SQLITE_OK ){
3630         invalidateAllOverflowCache(pBt);
3631         rc = incrVacuumStep(pBt, nFin, nOrig, 0);
3632       }
3633       if( rc==SQLITE_OK ){
3634         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3635         put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3636       }
3637     }else{
3638       rc = SQLITE_DONE;
3639     }
3640   }
3641   sqlite3BtreeLeave(p);
3642   return rc;
3643 }
3644 
3645 /*
3646 ** This routine is called prior to sqlite3PagerCommit when a transaction
3647 ** is committed for an auto-vacuum database.
3648 **
3649 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3650 ** the database file should be truncated to during the commit process.
3651 ** i.e. the database has been reorganized so that only the first *pnTrunc
3652 ** pages are in use.
3653 */
3654 static int autoVacuumCommit(BtShared *pBt){
3655   int rc = SQLITE_OK;
3656   Pager *pPager = pBt->pPager;
3657   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
3658 
3659   assert( sqlite3_mutex_held(pBt->mutex) );
3660   invalidateAllOverflowCache(pBt);
3661   assert(pBt->autoVacuum);
3662   if( !pBt->incrVacuum ){
3663     Pgno nFin;         /* Number of pages in database after autovacuuming */
3664     Pgno nFree;        /* Number of pages on the freelist initially */
3665     Pgno iFree;        /* The next page to be freed */
3666     Pgno nOrig;        /* Database size before freeing */
3667 
3668     nOrig = btreePagecount(pBt);
3669     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3670       /* It is not possible to create a database for which the final page
3671       ** is either a pointer-map page or the pending-byte page. If one
3672       ** is encountered, this indicates corruption.
3673       */
3674       return SQLITE_CORRUPT_BKPT;
3675     }
3676 
3677     nFree = get4byte(&pBt->pPage1->aData[36]);
3678     nFin = finalDbSize(pBt, nOrig, nFree);
3679     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3680     if( nFin<nOrig ){
3681       rc = saveAllCursors(pBt, 0, 0);
3682     }
3683     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3684       rc = incrVacuumStep(pBt, nFin, iFree, 1);
3685     }
3686     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3687       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3688       put4byte(&pBt->pPage1->aData[32], 0);
3689       put4byte(&pBt->pPage1->aData[36], 0);
3690       put4byte(&pBt->pPage1->aData[28], nFin);
3691       pBt->bDoTruncate = 1;
3692       pBt->nPage = nFin;
3693     }
3694     if( rc!=SQLITE_OK ){
3695       sqlite3PagerRollback(pPager);
3696     }
3697   }
3698 
3699   assert( nRef>=sqlite3PagerRefcount(pPager) );
3700   return rc;
3701 }
3702 
3703 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3704 # define setChildPtrmaps(x) SQLITE_OK
3705 #endif
3706 
3707 /*
3708 ** This routine does the first phase of a two-phase commit.  This routine
3709 ** causes a rollback journal to be created (if it does not already exist)
3710 ** and populated with enough information so that if a power loss occurs
3711 ** the database can be restored to its original state by playing back
3712 ** the journal.  Then the contents of the journal are flushed out to
3713 ** the disk.  After the journal is safely on oxide, the changes to the
3714 ** database are written into the database file and flushed to oxide.
3715 ** At the end of this call, the rollback journal still exists on the
3716 ** disk and we are still holding all locks, so the transaction has not
3717 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3718 ** commit process.
3719 **
3720 ** This call is a no-op if no write-transaction is currently active on pBt.
3721 **
3722 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3723 ** the name of a master journal file that should be written into the
3724 ** individual journal file, or is NULL, indicating no master journal file
3725 ** (single database transaction).
3726 **
3727 ** When this is called, the master journal should already have been
3728 ** created, populated with this journal pointer and synced to disk.
3729 **
3730 ** Once this is routine has returned, the only thing required to commit
3731 ** the write-transaction for this database file is to delete the journal.
3732 */
3733 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3734   int rc = SQLITE_OK;
3735   if( p->inTrans==TRANS_WRITE ){
3736     BtShared *pBt = p->pBt;
3737     sqlite3BtreeEnter(p);
3738 #ifndef SQLITE_OMIT_AUTOVACUUM
3739     if( pBt->autoVacuum ){
3740       rc = autoVacuumCommit(pBt);
3741       if( rc!=SQLITE_OK ){
3742         sqlite3BtreeLeave(p);
3743         return rc;
3744       }
3745     }
3746     if( pBt->bDoTruncate ){
3747       sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
3748     }
3749 #endif
3750     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3751     sqlite3BtreeLeave(p);
3752   }
3753   return rc;
3754 }
3755 
3756 /*
3757 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3758 ** at the conclusion of a transaction.
3759 */
3760 static void btreeEndTransaction(Btree *p){
3761   BtShared *pBt = p->pBt;
3762   sqlite3 *db = p->db;
3763   assert( sqlite3BtreeHoldsMutex(p) );
3764 
3765 #ifndef SQLITE_OMIT_AUTOVACUUM
3766   pBt->bDoTruncate = 0;
3767 #endif
3768   if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
3769     /* If there are other active statements that belong to this database
3770     ** handle, downgrade to a read-only transaction. The other statements
3771     ** may still be reading from the database.  */
3772     downgradeAllSharedCacheTableLocks(p);
3773     p->inTrans = TRANS_READ;
3774   }else{
3775     /* If the handle had any kind of transaction open, decrement the
3776     ** transaction count of the shared btree. If the transaction count
3777     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
3778     ** call below will unlock the pager.  */
3779     if( p->inTrans!=TRANS_NONE ){
3780       clearAllSharedCacheTableLocks(p);
3781       pBt->nTransaction--;
3782       if( 0==pBt->nTransaction ){
3783         pBt->inTransaction = TRANS_NONE;
3784       }
3785     }
3786 
3787     /* Set the current transaction state to TRANS_NONE and unlock the
3788     ** pager if this call closed the only read or write transaction.  */
3789     p->inTrans = TRANS_NONE;
3790     unlockBtreeIfUnused(pBt);
3791   }
3792 
3793   btreeIntegrity(p);
3794 }
3795 
3796 /*
3797 ** Commit the transaction currently in progress.
3798 **
3799 ** This routine implements the second phase of a 2-phase commit.  The
3800 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3801 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
3802 ** routine did all the work of writing information out to disk and flushing the
3803 ** contents so that they are written onto the disk platter.  All this
3804 ** routine has to do is delete or truncate or zero the header in the
3805 ** the rollback journal (which causes the transaction to commit) and
3806 ** drop locks.
3807 **
3808 ** Normally, if an error occurs while the pager layer is attempting to
3809 ** finalize the underlying journal file, this function returns an error and
3810 ** the upper layer will attempt a rollback. However, if the second argument
3811 ** is non-zero then this b-tree transaction is part of a multi-file
3812 ** transaction. In this case, the transaction has already been committed
3813 ** (by deleting a master journal file) and the caller will ignore this
3814 ** functions return code. So, even if an error occurs in the pager layer,
3815 ** reset the b-tree objects internal state to indicate that the write
3816 ** transaction has been closed. This is quite safe, as the pager will have
3817 ** transitioned to the error state.
3818 **
3819 ** This will release the write lock on the database file.  If there
3820 ** are no active cursors, it also releases the read lock.
3821 */
3822 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
3823 
3824   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
3825   sqlite3BtreeEnter(p);
3826   btreeIntegrity(p);
3827 
3828   /* If the handle has a write-transaction open, commit the shared-btrees
3829   ** transaction and set the shared state to TRANS_READ.
3830   */
3831   if( p->inTrans==TRANS_WRITE ){
3832     int rc;
3833     BtShared *pBt = p->pBt;
3834     assert( pBt->inTransaction==TRANS_WRITE );
3835     assert( pBt->nTransaction>0 );
3836     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3837     if( rc!=SQLITE_OK && bCleanup==0 ){
3838       sqlite3BtreeLeave(p);
3839       return rc;
3840     }
3841     p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
3842     pBt->inTransaction = TRANS_READ;
3843     btreeClearHasContent(pBt);
3844   }
3845 
3846   btreeEndTransaction(p);
3847   sqlite3BtreeLeave(p);
3848   return SQLITE_OK;
3849 }
3850 
3851 /*
3852 ** Do both phases of a commit.
3853 */
3854 int sqlite3BtreeCommit(Btree *p){
3855   int rc;
3856   sqlite3BtreeEnter(p);
3857   rc = sqlite3BtreeCommitPhaseOne(p, 0);
3858   if( rc==SQLITE_OK ){
3859     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
3860   }
3861   sqlite3BtreeLeave(p);
3862   return rc;
3863 }
3864 
3865 /*
3866 ** This routine sets the state to CURSOR_FAULT and the error
3867 ** code to errCode for every cursor on any BtShared that pBtree
3868 ** references.  Or if the writeOnly flag is set to 1, then only
3869 ** trip write cursors and leave read cursors unchanged.
3870 **
3871 ** Every cursor is a candidate to be tripped, including cursors
3872 ** that belong to other database connections that happen to be
3873 ** sharing the cache with pBtree.
3874 **
3875 ** This routine gets called when a rollback occurs. If the writeOnly
3876 ** flag is true, then only write-cursors need be tripped - read-only
3877 ** cursors save their current positions so that they may continue
3878 ** following the rollback. Or, if writeOnly is false, all cursors are
3879 ** tripped. In general, writeOnly is false if the transaction being
3880 ** rolled back modified the database schema. In this case b-tree root
3881 ** pages may be moved or deleted from the database altogether, making
3882 ** it unsafe for read cursors to continue.
3883 **
3884 ** If the writeOnly flag is true and an error is encountered while
3885 ** saving the current position of a read-only cursor, all cursors,
3886 ** including all read-cursors are tripped.
3887 **
3888 ** SQLITE_OK is returned if successful, or if an error occurs while
3889 ** saving a cursor position, an SQLite error code.
3890 */
3891 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
3892   BtCursor *p;
3893   int rc = SQLITE_OK;
3894 
3895   assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
3896   if( pBtree ){
3897     sqlite3BtreeEnter(pBtree);
3898     for(p=pBtree->pBt->pCursor; p; p=p->pNext){
3899       int i;
3900       if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
3901         if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
3902           rc = saveCursorPosition(p);
3903           if( rc!=SQLITE_OK ){
3904             (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
3905             break;
3906           }
3907         }
3908       }else{
3909         sqlite3BtreeClearCursor(p);
3910         p->eState = CURSOR_FAULT;
3911         p->skipNext = errCode;
3912       }
3913       for(i=0; i<=p->iPage; i++){
3914         releasePage(p->apPage[i]);
3915         p->apPage[i] = 0;
3916       }
3917     }
3918     sqlite3BtreeLeave(pBtree);
3919   }
3920   return rc;
3921 }
3922 
3923 /*
3924 ** Rollback the transaction in progress.
3925 **
3926 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
3927 ** Only write cursors are tripped if writeOnly is true but all cursors are
3928 ** tripped if writeOnly is false.  Any attempt to use
3929 ** a tripped cursor will result in an error.
3930 **
3931 ** This will release the write lock on the database file.  If there
3932 ** are no active cursors, it also releases the read lock.
3933 */
3934 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
3935   int rc;
3936   BtShared *pBt = p->pBt;
3937   MemPage *pPage1;
3938 
3939   assert( writeOnly==1 || writeOnly==0 );
3940   assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
3941   sqlite3BtreeEnter(p);
3942   if( tripCode==SQLITE_OK ){
3943     rc = tripCode = saveAllCursors(pBt, 0, 0);
3944     if( rc ) writeOnly = 0;
3945   }else{
3946     rc = SQLITE_OK;
3947   }
3948   if( tripCode ){
3949     int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
3950     assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
3951     if( rc2!=SQLITE_OK ) rc = rc2;
3952   }
3953   btreeIntegrity(p);
3954 
3955   if( p->inTrans==TRANS_WRITE ){
3956     int rc2;
3957 
3958     assert( TRANS_WRITE==pBt->inTransaction );
3959     rc2 = sqlite3PagerRollback(pBt->pPager);
3960     if( rc2!=SQLITE_OK ){
3961       rc = rc2;
3962     }
3963 
3964     /* The rollback may have destroyed the pPage1->aData value.  So
3965     ** call btreeGetPage() on page 1 again to make
3966     ** sure pPage1->aData is set correctly. */
3967     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
3968       int nPage = get4byte(28+(u8*)pPage1->aData);
3969       testcase( nPage==0 );
3970       if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
3971       testcase( pBt->nPage!=nPage );
3972       pBt->nPage = nPage;
3973       releasePage(pPage1);
3974     }
3975     assert( countValidCursors(pBt, 1)==0 );
3976     pBt->inTransaction = TRANS_READ;
3977     btreeClearHasContent(pBt);
3978   }
3979 
3980   btreeEndTransaction(p);
3981   sqlite3BtreeLeave(p);
3982   return rc;
3983 }
3984 
3985 /*
3986 ** Start a statement subtransaction. The subtransaction can be rolled
3987 ** back independently of the main transaction. You must start a transaction
3988 ** before starting a subtransaction. The subtransaction is ended automatically
3989 ** if the main transaction commits or rolls back.
3990 **
3991 ** Statement subtransactions are used around individual SQL statements
3992 ** that are contained within a BEGIN...COMMIT block.  If a constraint
3993 ** error occurs within the statement, the effect of that one statement
3994 ** can be rolled back without having to rollback the entire transaction.
3995 **
3996 ** A statement sub-transaction is implemented as an anonymous savepoint. The
3997 ** value passed as the second parameter is the total number of savepoints,
3998 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3999 ** are no active savepoints and no other statement-transactions open,
4000 ** iStatement is 1. This anonymous savepoint can be released or rolled back
4001 ** using the sqlite3BtreeSavepoint() function.
4002 */
4003 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
4004   int rc;
4005   BtShared *pBt = p->pBt;
4006   sqlite3BtreeEnter(p);
4007   assert( p->inTrans==TRANS_WRITE );
4008   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
4009   assert( iStatement>0 );
4010   assert( iStatement>p->db->nSavepoint );
4011   assert( pBt->inTransaction==TRANS_WRITE );
4012   /* At the pager level, a statement transaction is a savepoint with
4013   ** an index greater than all savepoints created explicitly using
4014   ** SQL statements. It is illegal to open, release or rollback any
4015   ** such savepoints while the statement transaction savepoint is active.
4016   */
4017   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
4018   sqlite3BtreeLeave(p);
4019   return rc;
4020 }
4021 
4022 /*
4023 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
4024 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
4025 ** savepoint identified by parameter iSavepoint, depending on the value
4026 ** of op.
4027 **
4028 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
4029 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
4030 ** contents of the entire transaction are rolled back. This is different
4031 ** from a normal transaction rollback, as no locks are released and the
4032 ** transaction remains open.
4033 */
4034 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
4035   int rc = SQLITE_OK;
4036   if( p && p->inTrans==TRANS_WRITE ){
4037     BtShared *pBt = p->pBt;
4038     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
4039     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
4040     sqlite3BtreeEnter(p);
4041     rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
4042     if( rc==SQLITE_OK ){
4043       if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
4044         pBt->nPage = 0;
4045       }
4046       rc = newDatabase(pBt);
4047       pBt->nPage = get4byte(28 + pBt->pPage1->aData);
4048 
4049       /* The database size was written into the offset 28 of the header
4050       ** when the transaction started, so we know that the value at offset
4051       ** 28 is nonzero. */
4052       assert( pBt->nPage>0 );
4053     }
4054     sqlite3BtreeLeave(p);
4055   }
4056   return rc;
4057 }
4058 
4059 /*
4060 ** Create a new cursor for the BTree whose root is on the page
4061 ** iTable. If a read-only cursor is requested, it is assumed that
4062 ** the caller already has at least a read-only transaction open
4063 ** on the database already. If a write-cursor is requested, then
4064 ** the caller is assumed to have an open write transaction.
4065 **
4066 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
4067 ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
4068 ** can be used for reading or for writing if other conditions for writing
4069 ** are also met.  These are the conditions that must be met in order
4070 ** for writing to be allowed:
4071 **
4072 ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
4073 **
4074 ** 2:  Other database connections that share the same pager cache
4075 **     but which are not in the READ_UNCOMMITTED state may not have
4076 **     cursors open with wrFlag==0 on the same table.  Otherwise
4077 **     the changes made by this write cursor would be visible to
4078 **     the read cursors in the other database connection.
4079 **
4080 ** 3:  The database must be writable (not on read-only media)
4081 **
4082 ** 4:  There must be an active transaction.
4083 **
4084 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
4085 ** is set.  If FORDELETE is set, that is a hint to the implementation that
4086 ** this cursor will only be used to seek to and delete entries of an index
4087 ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
4088 ** this implementation.  But in a hypothetical alternative storage engine
4089 ** in which index entries are automatically deleted when corresponding table
4090 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
4091 ** operations on this cursor can be no-ops and all READ operations can
4092 ** return a null row (2-bytes: 0x01 0x00).
4093 **
4094 ** No checking is done to make sure that page iTable really is the
4095 ** root page of a b-tree.  If it is not, then the cursor acquired
4096 ** will not work correctly.
4097 **
4098 ** It is assumed that the sqlite3BtreeCursorZero() has been called
4099 ** on pCur to initialize the memory space prior to invoking this routine.
4100 */
4101 static int btreeCursor(
4102   Btree *p,                              /* The btree */
4103   int iTable,                            /* Root page of table to open */
4104   int wrFlag,                            /* 1 to write. 0 read-only */
4105   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
4106   BtCursor *pCur                         /* Space for new cursor */
4107 ){
4108   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
4109   BtCursor *pX;                          /* Looping over other all cursors */
4110 
4111   assert( sqlite3BtreeHoldsMutex(p) );
4112   assert( wrFlag==0
4113        || wrFlag==BTREE_WRCSR
4114        || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE)
4115   );
4116 
4117   /* The following assert statements verify that if this is a sharable
4118   ** b-tree database, the connection is holding the required table locks,
4119   ** and that no other connection has any open cursor that conflicts with
4120   ** this lock.  */
4121   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) );
4122   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
4123 
4124   /* Assert that the caller has opened the required transaction. */
4125   assert( p->inTrans>TRANS_NONE );
4126   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
4127   assert( pBt->pPage1 && pBt->pPage1->aData );
4128   assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
4129 
4130   if( wrFlag ){
4131     allocateTempSpace(pBt);
4132     if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
4133   }
4134   if( iTable==1 && btreePagecount(pBt)==0 ){
4135     assert( wrFlag==0 );
4136     iTable = 0;
4137   }
4138 
4139   /* Now that no other errors can occur, finish filling in the BtCursor
4140   ** variables and link the cursor into the BtShared list.  */
4141   pCur->pgnoRoot = (Pgno)iTable;
4142   pCur->iPage = -1;
4143   pCur->pKeyInfo = pKeyInfo;
4144   pCur->pBtree = p;
4145   pCur->pBt = pBt;
4146   pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;
4147   pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
4148   /* If there are two or more cursors on the same btree, then all such
4149   ** cursors *must* have the BTCF_Multiple flag set. */
4150   for(pX=pBt->pCursor; pX; pX=pX->pNext){
4151     if( pX->pgnoRoot==(Pgno)iTable ){
4152       pX->curFlags |= BTCF_Multiple;
4153       pCur->curFlags |= BTCF_Multiple;
4154     }
4155   }
4156   pCur->pNext = pBt->pCursor;
4157   pBt->pCursor = pCur;
4158   pCur->eState = CURSOR_INVALID;
4159   return SQLITE_OK;
4160 }
4161 int sqlite3BtreeCursor(
4162   Btree *p,                                   /* The btree */
4163   int iTable,                                 /* Root page of table to open */
4164   int wrFlag,                                 /* 1 to write. 0 read-only */
4165   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
4166   BtCursor *pCur                              /* Write new cursor here */
4167 ){
4168   int rc;
4169   if( iTable<1 ){
4170     rc = SQLITE_CORRUPT_BKPT;
4171   }else{
4172     sqlite3BtreeEnter(p);
4173     rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4174     sqlite3BtreeLeave(p);
4175   }
4176   return rc;
4177 }
4178 
4179 /*
4180 ** Return the size of a BtCursor object in bytes.
4181 **
4182 ** This interfaces is needed so that users of cursors can preallocate
4183 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
4184 ** to users so they cannot do the sizeof() themselves - they must call
4185 ** this routine.
4186 */
4187 int sqlite3BtreeCursorSize(void){
4188   return ROUND8(sizeof(BtCursor));
4189 }
4190 
4191 /*
4192 ** Initialize memory that will be converted into a BtCursor object.
4193 **
4194 ** The simple approach here would be to memset() the entire object
4195 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
4196 ** do not need to be zeroed and they are large, so we can save a lot
4197 ** of run-time by skipping the initialization of those elements.
4198 */
4199 void sqlite3BtreeCursorZero(BtCursor *p){
4200   memset(p, 0, offsetof(BtCursor, iPage));
4201 }
4202 
4203 /*
4204 ** Close a cursor.  The read lock on the database file is released
4205 ** when the last cursor is closed.
4206 */
4207 int sqlite3BtreeCloseCursor(BtCursor *pCur){
4208   Btree *pBtree = pCur->pBtree;
4209   if( pBtree ){
4210     int i;
4211     BtShared *pBt = pCur->pBt;
4212     sqlite3BtreeEnter(pBtree);
4213     sqlite3BtreeClearCursor(pCur);
4214     assert( pBt->pCursor!=0 );
4215     if( pBt->pCursor==pCur ){
4216       pBt->pCursor = pCur->pNext;
4217     }else{
4218       BtCursor *pPrev = pBt->pCursor;
4219       do{
4220         if( pPrev->pNext==pCur ){
4221           pPrev->pNext = pCur->pNext;
4222           break;
4223         }
4224         pPrev = pPrev->pNext;
4225       }while( ALWAYS(pPrev) );
4226     }
4227     for(i=0; i<=pCur->iPage; i++){
4228       releasePage(pCur->apPage[i]);
4229     }
4230     unlockBtreeIfUnused(pBt);
4231     sqlite3_free(pCur->aOverflow);
4232     /* sqlite3_free(pCur); */
4233     sqlite3BtreeLeave(pBtree);
4234   }
4235   return SQLITE_OK;
4236 }
4237 
4238 /*
4239 ** Make sure the BtCursor* given in the argument has a valid
4240 ** BtCursor.info structure.  If it is not already valid, call
4241 ** btreeParseCell() to fill it in.
4242 **
4243 ** BtCursor.info is a cache of the information in the current cell.
4244 ** Using this cache reduces the number of calls to btreeParseCell().
4245 */
4246 #ifndef NDEBUG
4247   static void assertCellInfo(BtCursor *pCur){
4248     CellInfo info;
4249     int iPage = pCur->iPage;
4250     memset(&info, 0, sizeof(info));
4251     btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
4252     assert( CORRUPT_DB || memcmp(&info, &pCur->info, sizeof(info))==0 );
4253   }
4254 #else
4255   #define assertCellInfo(x)
4256 #endif
4257 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
4258   if( pCur->info.nSize==0 ){
4259     int iPage = pCur->iPage;
4260     pCur->curFlags |= BTCF_ValidNKey;
4261     btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
4262   }else{
4263     assertCellInfo(pCur);
4264   }
4265 }
4266 
4267 #ifndef NDEBUG  /* The next routine used only within assert() statements */
4268 /*
4269 ** Return true if the given BtCursor is valid.  A valid cursor is one
4270 ** that is currently pointing to a row in a (non-empty) table.
4271 ** This is a verification routine is used only within assert() statements.
4272 */
4273 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
4274   return pCur && pCur->eState==CURSOR_VALID;
4275 }
4276 #endif /* NDEBUG */
4277 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
4278   assert( pCur!=0 );
4279   return pCur->eState==CURSOR_VALID;
4280 }
4281 
4282 /*
4283 ** Return the value of the integer key or "rowid" for a table btree.
4284 ** This routine is only valid for a cursor that is pointing into a
4285 ** ordinary table btree.  If the cursor points to an index btree or
4286 ** is invalid, the result of this routine is undefined.
4287 */
4288 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
4289   assert( cursorHoldsMutex(pCur) );
4290   assert( pCur->eState==CURSOR_VALID );
4291   assert( pCur->curIntKey );
4292   getCellInfo(pCur);
4293   return pCur->info.nKey;
4294 }
4295 
4296 /*
4297 ** Return the number of bytes of payload for the entry that pCur is
4298 ** currently pointing to.  For table btrees, this will be the amount
4299 ** of data.  For index btrees, this will be the size of the key.
4300 **
4301 ** The caller must guarantee that the cursor is pointing to a non-NULL
4302 ** valid entry.  In other words, the calling procedure must guarantee
4303 ** that the cursor has Cursor.eState==CURSOR_VALID.
4304 */
4305 u32 sqlite3BtreePayloadSize(BtCursor *pCur){
4306   assert( cursorHoldsMutex(pCur) );
4307   assert( pCur->eState==CURSOR_VALID );
4308   getCellInfo(pCur);
4309   return pCur->info.nPayload;
4310 }
4311 
4312 /*
4313 ** Given the page number of an overflow page in the database (parameter
4314 ** ovfl), this function finds the page number of the next page in the
4315 ** linked list of overflow pages. If possible, it uses the auto-vacuum
4316 ** pointer-map data instead of reading the content of page ovfl to do so.
4317 **
4318 ** If an error occurs an SQLite error code is returned. Otherwise:
4319 **
4320 ** The page number of the next overflow page in the linked list is
4321 ** written to *pPgnoNext. If page ovfl is the last page in its linked
4322 ** list, *pPgnoNext is set to zero.
4323 **
4324 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
4325 ** to page number pOvfl was obtained, then *ppPage is set to point to that
4326 ** reference. It is the responsibility of the caller to call releasePage()
4327 ** on *ppPage to free the reference. In no reference was obtained (because
4328 ** the pointer-map was used to obtain the value for *pPgnoNext), then
4329 ** *ppPage is set to zero.
4330 */
4331 static int getOverflowPage(
4332   BtShared *pBt,               /* The database file */
4333   Pgno ovfl,                   /* Current overflow page number */
4334   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
4335   Pgno *pPgnoNext              /* OUT: Next overflow page number */
4336 ){
4337   Pgno next = 0;
4338   MemPage *pPage = 0;
4339   int rc = SQLITE_OK;
4340 
4341   assert( sqlite3_mutex_held(pBt->mutex) );
4342   assert(pPgnoNext);
4343 
4344 #ifndef SQLITE_OMIT_AUTOVACUUM
4345   /* Try to find the next page in the overflow list using the
4346   ** autovacuum pointer-map pages. Guess that the next page in
4347   ** the overflow list is page number (ovfl+1). If that guess turns
4348   ** out to be wrong, fall back to loading the data of page
4349   ** number ovfl to determine the next page number.
4350   */
4351   if( pBt->autoVacuum ){
4352     Pgno pgno;
4353     Pgno iGuess = ovfl+1;
4354     u8 eType;
4355 
4356     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
4357       iGuess++;
4358     }
4359 
4360     if( iGuess<=btreePagecount(pBt) ){
4361       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
4362       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
4363         next = iGuess;
4364         rc = SQLITE_DONE;
4365       }
4366     }
4367   }
4368 #endif
4369 
4370   assert( next==0 || rc==SQLITE_DONE );
4371   if( rc==SQLITE_OK ){
4372     rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
4373     assert( rc==SQLITE_OK || pPage==0 );
4374     if( rc==SQLITE_OK ){
4375       next = get4byte(pPage->aData);
4376     }
4377   }
4378 
4379   *pPgnoNext = next;
4380   if( ppPage ){
4381     *ppPage = pPage;
4382   }else{
4383     releasePage(pPage);
4384   }
4385   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
4386 }
4387 
4388 /*
4389 ** Copy data from a buffer to a page, or from a page to a buffer.
4390 **
4391 ** pPayload is a pointer to data stored on database page pDbPage.
4392 ** If argument eOp is false, then nByte bytes of data are copied
4393 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
4394 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
4395 ** of data are copied from the buffer pBuf to pPayload.
4396 **
4397 ** SQLITE_OK is returned on success, otherwise an error code.
4398 */
4399 static int copyPayload(
4400   void *pPayload,           /* Pointer to page data */
4401   void *pBuf,               /* Pointer to buffer */
4402   int nByte,                /* Number of bytes to copy */
4403   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
4404   DbPage *pDbPage           /* Page containing pPayload */
4405 ){
4406   if( eOp ){
4407     /* Copy data from buffer to page (a write operation) */
4408     int rc = sqlite3PagerWrite(pDbPage);
4409     if( rc!=SQLITE_OK ){
4410       return rc;
4411     }
4412     memcpy(pPayload, pBuf, nByte);
4413   }else{
4414     /* Copy data from page to buffer (a read operation) */
4415     memcpy(pBuf, pPayload, nByte);
4416   }
4417   return SQLITE_OK;
4418 }
4419 
4420 /*
4421 ** This function is used to read or overwrite payload information
4422 ** for the entry that the pCur cursor is pointing to. The eOp
4423 ** argument is interpreted as follows:
4424 **
4425 **   0: The operation is a read. Populate the overflow cache.
4426 **   1: The operation is a write. Populate the overflow cache.
4427 **   2: The operation is a read. Do not populate the overflow cache.
4428 **
4429 ** A total of "amt" bytes are read or written beginning at "offset".
4430 ** Data is read to or from the buffer pBuf.
4431 **
4432 ** The content being read or written might appear on the main page
4433 ** or be scattered out on multiple overflow pages.
4434 **
4435 ** If the current cursor entry uses one or more overflow pages and the
4436 ** eOp argument is not 2, this function may allocate space for and lazily
4437 ** populates the overflow page-list cache array (BtCursor.aOverflow).
4438 ** Subsequent calls use this cache to make seeking to the supplied offset
4439 ** more efficient.
4440 **
4441 ** Once an overflow page-list cache has been allocated, it may be
4442 ** invalidated if some other cursor writes to the same table, or if
4443 ** the cursor is moved to a different row. Additionally, in auto-vacuum
4444 ** mode, the following events may invalidate an overflow page-list cache.
4445 **
4446 **   * An incremental vacuum,
4447 **   * A commit in auto_vacuum="full" mode,
4448 **   * Creating a table (may require moving an overflow page).
4449 */
4450 static int accessPayload(
4451   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4452   u32 offset,          /* Begin reading this far into payload */
4453   u32 amt,             /* Read this many bytes */
4454   unsigned char *pBuf, /* Write the bytes into this buffer */
4455   int eOp              /* zero to read. non-zero to write. */
4456 ){
4457   unsigned char *aPayload;
4458   int rc = SQLITE_OK;
4459   int iIdx = 0;
4460   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
4461   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
4462 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4463   unsigned char * const pBufStart = pBuf;
4464   int bEnd;                                 /* True if reading to end of data */
4465 #endif
4466 
4467   assert( pPage );
4468   assert( pCur->eState==CURSOR_VALID );
4469   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4470   assert( cursorHoldsMutex(pCur) );
4471   assert( eOp!=2 || offset==0 );    /* Always start from beginning for eOp==2 */
4472 
4473   getCellInfo(pCur);
4474   aPayload = pCur->info.pPayload;
4475 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4476   bEnd = offset+amt==pCur->info.nPayload;
4477 #endif
4478   assert( offset+amt <= pCur->info.nPayload );
4479 
4480   assert( aPayload > pPage->aData );
4481   if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
4482     /* Trying to read or write past the end of the data is an error.  The
4483     ** conditional above is really:
4484     **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
4485     ** but is recast into its current form to avoid integer overflow problems
4486     */
4487     return SQLITE_CORRUPT_BKPT;
4488   }
4489 
4490   /* Check if data must be read/written to/from the btree page itself. */
4491   if( offset<pCur->info.nLocal ){
4492     int a = amt;
4493     if( a+offset>pCur->info.nLocal ){
4494       a = pCur->info.nLocal - offset;
4495     }
4496     rc = copyPayload(&aPayload[offset], pBuf, a, (eOp & 0x01), pPage->pDbPage);
4497     offset = 0;
4498     pBuf += a;
4499     amt -= a;
4500   }else{
4501     offset -= pCur->info.nLocal;
4502   }
4503 
4504 
4505   if( rc==SQLITE_OK && amt>0 ){
4506     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
4507     Pgno nextPage;
4508 
4509     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
4510 
4511     /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
4512     ** Except, do not allocate aOverflow[] for eOp==2.
4513     **
4514     ** The aOverflow[] array is sized at one entry for each overflow page
4515     ** in the overflow chain. The page number of the first overflow page is
4516     ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
4517     ** means "not yet known" (the cache is lazily populated).
4518     */
4519     if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){
4520       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
4521       if( nOvfl>pCur->nOvflAlloc ){
4522         Pgno *aNew = (Pgno*)sqlite3Realloc(
4523             pCur->aOverflow, nOvfl*2*sizeof(Pgno)
4524         );
4525         if( aNew==0 ){
4526           return SQLITE_NOMEM_BKPT;
4527         }else{
4528           pCur->nOvflAlloc = nOvfl*2;
4529           pCur->aOverflow = aNew;
4530         }
4531       }
4532       memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
4533       pCur->curFlags |= BTCF_ValidOvfl;
4534     }
4535 
4536     /* If the overflow page-list cache has been allocated and the
4537     ** entry for the first required overflow page is valid, skip
4538     ** directly to it.
4539     */
4540     if( (pCur->curFlags & BTCF_ValidOvfl)!=0
4541      && pCur->aOverflow[offset/ovflSize]
4542     ){
4543       iIdx = (offset/ovflSize);
4544       nextPage = pCur->aOverflow[iIdx];
4545       offset = (offset%ovflSize);
4546     }
4547 
4548     assert( rc==SQLITE_OK && amt>0 );
4549     while( nextPage ){
4550       /* If required, populate the overflow page-list cache. */
4551       if( (pCur->curFlags & BTCF_ValidOvfl)!=0 ){
4552         assert( pCur->aOverflow[iIdx]==0
4553                 || pCur->aOverflow[iIdx]==nextPage
4554                 || CORRUPT_DB );
4555         pCur->aOverflow[iIdx] = nextPage;
4556       }
4557 
4558       if( offset>=ovflSize ){
4559         /* The only reason to read this page is to obtain the page
4560         ** number for the next page in the overflow chain. The page
4561         ** data is not required. So first try to lookup the overflow
4562         ** page-list cache, if any, then fall back to the getOverflowPage()
4563         ** function.
4564         **
4565         ** Note that the aOverflow[] array must be allocated because eOp!=2
4566         ** here.  If eOp==2, then offset==0 and this branch is never taken.
4567         */
4568         assert( eOp!=2 );
4569         assert( pCur->curFlags & BTCF_ValidOvfl );
4570         assert( pCur->pBtree->db==pBt->db );
4571         if( pCur->aOverflow[iIdx+1] ){
4572           nextPage = pCur->aOverflow[iIdx+1];
4573         }else{
4574           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
4575         }
4576         offset -= ovflSize;
4577       }else{
4578         /* Need to read this page properly. It contains some of the
4579         ** range of data that is being read (eOp==0) or written (eOp!=0).
4580         */
4581 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4582         sqlite3_file *fd;
4583 #endif
4584         int a = amt;
4585         if( a + offset > ovflSize ){
4586           a = ovflSize - offset;
4587         }
4588 
4589 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4590         /* If all the following are true:
4591         **
4592         **   1) this is a read operation, and
4593         **   2) data is required from the start of this overflow page, and
4594         **   3) the database is file-backed, and
4595         **   4) there is no open write-transaction, and
4596         **   5) the database is not a WAL database,
4597         **   6) all data from the page is being read.
4598         **   7) at least 4 bytes have already been read into the output buffer
4599         **
4600         ** then data can be read directly from the database file into the
4601         ** output buffer, bypassing the page-cache altogether. This speeds
4602         ** up loading large records that span many overflow pages.
4603         */
4604         if( (eOp&0x01)==0                                      /* (1) */
4605          && offset==0                                          /* (2) */
4606          && (bEnd || a==ovflSize)                              /* (6) */
4607          && pBt->inTransaction==TRANS_READ                     /* (4) */
4608          && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (3) */
4609          && 0==sqlite3PagerUseWal(pBt->pPager)                 /* (5) */
4610          && &pBuf[-4]>=pBufStart                               /* (7) */
4611         ){
4612           u8 aSave[4];
4613           u8 *aWrite = &pBuf[-4];
4614           assert( aWrite>=pBufStart );                         /* hence (7) */
4615           memcpy(aSave, aWrite, 4);
4616           rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
4617           nextPage = get4byte(aWrite);
4618           memcpy(aWrite, aSave, 4);
4619         }else
4620 #endif
4621 
4622         {
4623           DbPage *pDbPage;
4624           rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
4625               ((eOp&0x01)==0 ? PAGER_GET_READONLY : 0)
4626           );
4627           if( rc==SQLITE_OK ){
4628             aPayload = sqlite3PagerGetData(pDbPage);
4629             nextPage = get4byte(aPayload);
4630             rc = copyPayload(&aPayload[offset+4], pBuf, a, (eOp&0x01), pDbPage);
4631             sqlite3PagerUnref(pDbPage);
4632             offset = 0;
4633           }
4634         }
4635         amt -= a;
4636         pBuf += a;
4637       }
4638       if( amt==0 ) break;
4639       if( rc ) break;
4640       iIdx++;
4641     }
4642   }
4643 
4644   if( rc==SQLITE_OK && amt>0 ){
4645     return SQLITE_CORRUPT_BKPT;
4646   }
4647   return rc;
4648 }
4649 
4650 /*
4651 ** Read part of the payload for the row at which that cursor pCur is currently
4652 ** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
4653 ** begins at "offset".
4654 **
4655 ** pCur can be pointing to either a table or an index b-tree.
4656 ** If pointing to a table btree, then the content section is read.  If
4657 ** pCur is pointing to an index b-tree then the key section is read.
4658 **
4659 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
4660 ** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
4661 ** cursor might be invalid or might need to be restored before being read.
4662 **
4663 ** Return SQLITE_OK on success or an error code if anything goes
4664 ** wrong.  An error is returned if "offset+amt" is larger than
4665 ** the available payload.
4666 */
4667 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4668   assert( cursorHoldsMutex(pCur) );
4669   assert( pCur->eState==CURSOR_VALID );
4670   assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4671   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4672   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
4673 }
4674 #ifndef SQLITE_OMIT_INCRBLOB
4675 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4676   int rc;
4677   if ( pCur->eState==CURSOR_INVALID ){
4678     return SQLITE_ABORT;
4679   }
4680   assert( cursorOwnsBtShared(pCur) );
4681   rc = restoreCursorPosition(pCur);
4682   if( rc==SQLITE_OK ){
4683     assert( pCur->eState==CURSOR_VALID );
4684     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4685     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4686     rc = accessPayload(pCur, offset, amt, pBuf, 0);
4687   }
4688   return rc;
4689 }
4690 #endif /* SQLITE_OMIT_INCRBLOB */
4691 
4692 /*
4693 ** Return a pointer to payload information from the entry that the
4694 ** pCur cursor is pointing to.  The pointer is to the beginning of
4695 ** the key if index btrees (pPage->intKey==0) and is the data for
4696 ** table btrees (pPage->intKey==1). The number of bytes of available
4697 ** key/data is written into *pAmt.  If *pAmt==0, then the value
4698 ** returned will not be a valid pointer.
4699 **
4700 ** This routine is an optimization.  It is common for the entire key
4701 ** and data to fit on the local page and for there to be no overflow
4702 ** pages.  When that is so, this routine can be used to access the
4703 ** key and data without making a copy.  If the key and/or data spills
4704 ** onto overflow pages, then accessPayload() must be used to reassemble
4705 ** the key/data and copy it into a preallocated buffer.
4706 **
4707 ** The pointer returned by this routine looks directly into the cached
4708 ** page of the database.  The data might change or move the next time
4709 ** any btree routine is called.
4710 */
4711 static const void *fetchPayload(
4712   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4713   u32 *pAmt            /* Write the number of available bytes here */
4714 ){
4715   u32 amt;
4716   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
4717   assert( pCur->eState==CURSOR_VALID );
4718   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4719   assert( cursorOwnsBtShared(pCur) );
4720   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4721   assert( pCur->info.nSize>0 );
4722   assert( pCur->info.pPayload>pCur->apPage[pCur->iPage]->aData || CORRUPT_DB );
4723   assert( pCur->info.pPayload<pCur->apPage[pCur->iPage]->aDataEnd ||CORRUPT_DB);
4724   amt = (int)(pCur->apPage[pCur->iPage]->aDataEnd - pCur->info.pPayload);
4725   if( pCur->info.nLocal<amt ) amt = pCur->info.nLocal;
4726   *pAmt = amt;
4727   return (void*)pCur->info.pPayload;
4728 }
4729 
4730 
4731 /*
4732 ** For the entry that cursor pCur is point to, return as
4733 ** many bytes of the key or data as are available on the local
4734 ** b-tree page.  Write the number of available bytes into *pAmt.
4735 **
4736 ** The pointer returned is ephemeral.  The key/data may move
4737 ** or be destroyed on the next call to any Btree routine,
4738 ** including calls from other threads against the same cache.
4739 ** Hence, a mutex on the BtShared should be held prior to calling
4740 ** this routine.
4741 **
4742 ** These routines is used to get quick access to key and data
4743 ** in the common case where no overflow pages are used.
4744 */
4745 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
4746   return fetchPayload(pCur, pAmt);
4747 }
4748 
4749 
4750 /*
4751 ** Move the cursor down to a new child page.  The newPgno argument is the
4752 ** page number of the child page to move to.
4753 **
4754 ** This function returns SQLITE_CORRUPT if the page-header flags field of
4755 ** the new child page does not match the flags field of the parent (i.e.
4756 ** if an intkey page appears to be the parent of a non-intkey page, or
4757 ** vice-versa).
4758 */
4759 static int moveToChild(BtCursor *pCur, u32 newPgno){
4760   BtShared *pBt = pCur->pBt;
4761 
4762   assert( cursorOwnsBtShared(pCur) );
4763   assert( pCur->eState==CURSOR_VALID );
4764   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4765   assert( pCur->iPage>=0 );
4766   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
4767     return SQLITE_CORRUPT_BKPT;
4768   }
4769   pCur->info.nSize = 0;
4770   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4771   pCur->iPage++;
4772   pCur->aiIdx[pCur->iPage] = 0;
4773   return getAndInitPage(pBt, newPgno, &pCur->apPage[pCur->iPage],
4774                         pCur, pCur->curPagerFlags);
4775 }
4776 
#ifdef SQLITE_DEBUG
/*
** Page pParent is an internal (non-leaf) tree page. This function
** asserts that page number iChild is the left-child of the iIdx'th
** cell in page pParent. Or, if iIdx is equal to the total number of
** cells in pParent, that page number iChild is the right-child of
** the page.
**
** Nothing is checked when CORRUPT_DB is true.  When SQLITE_DEBUG is
** not defined this is an empty macro.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  if( CORRUPT_DB ) return;  /* The conditions tested below might not be true
                            ** in a corrupt database */
  assert( iIdx<=pParent->nCell );
  if( iIdx==pParent->nCell ){
    /* The right-child pointer lives 8 bytes into the page header. */
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }else{
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z)
#endif
4798 
4799 /*
4800 ** Move the cursor up to the parent page.
4801 **
4802 ** pCur->idx is set to the cell index that contains the pointer
4803 ** to the page we are coming from.  If we are coming from the
4804 ** right-most child page then pCur->idx is set to one more than
4805 ** the largest cell index.
4806 */
4807 static void moveToParent(BtCursor *pCur){
4808   assert( cursorOwnsBtShared(pCur) );
4809   assert( pCur->eState==CURSOR_VALID );
4810   assert( pCur->iPage>0 );
4811   assert( pCur->apPage[pCur->iPage] );
4812   assertParentIndex(
4813     pCur->apPage[pCur->iPage-1],
4814     pCur->aiIdx[pCur->iPage-1],
4815     pCur->apPage[pCur->iPage]->pgno
4816   );
4817   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
4818   pCur->info.nSize = 0;
4819   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4820   releasePageNotNull(pCur->apPage[pCur->iPage--]);
4821 }
4822 
4823 /*
4824 ** Move the cursor to point to the root page of its b-tree structure.
4825 **
4826 ** If the table has a virtual root page, then the cursor is moved to point
4827 ** to the virtual root page instead of the actual root page. A table has a
4828 ** virtual root page when the actual root page contains no cells and a
4829 ** single child page. This can only happen with the table rooted at page 1.
4830 **
4831 ** If the b-tree structure is empty, the cursor state is set to
4832 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
4833 ** cell located on the root (or virtual root) page and the cursor state
4834 ** is set to CURSOR_VALID.
4835 **
4836 ** If this function returns successfully, it may be assumed that the
4837 ** page-header flags indicate that the [virtual] root-page is the expected
4838 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
4839 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4840 ** indicating a table b-tree, or if the caller did specify a KeyInfo
4841 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4842 ** b-tree).
4843 */
4844 static int moveToRoot(BtCursor *pCur){  /* returns an SQLite result code */
4845   MemPage *pRoot;
4846   int rc = SQLITE_OK;
4847 
4848   assert( cursorOwnsBtShared(pCur) );
4849   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4850   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
4851   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
4852   if( pCur->eState>=CURSOR_REQUIRESEEK ){  /* neither INVALID nor VALID */
4853     if( pCur->eState==CURSOR_FAULT ){
4854       assert( pCur->skipNext!=SQLITE_OK );
4855       return pCur->skipNext;  /* faulted cursor: skipNext is the result */
4856     }
4857     sqlite3BtreeClearCursor(pCur);
4858   }
4859 
4860   if( pCur->iPage>=0 ){  /* root page is already held in apPage[0] */
4861     if( pCur->iPage ){
4862       do{  /* release apPage[iPage] down to apPage[1], ending at iPage==0 */
4863         assert( pCur->apPage[pCur->iPage]!=0 );
4864         releasePageNotNull(pCur->apPage[pCur->iPage--]);
4865       }while( pCur->iPage);
4866       goto skip_init;  /* note: bypasses the root-page type check below */
4867     }
4868   }else if( pCur->pgnoRoot==0 ){  /* no root page number to load */
4869     pCur->eState = CURSOR_INVALID;
4870     return SQLITE_OK;
4871   }else{
4872     assert( pCur->iPage==(-1) );  /* root page not loaded yet */
4873     rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0],
4874                         0, pCur->curPagerFlags);
4875     if( rc!=SQLITE_OK ){
4876       pCur->eState = CURSOR_INVALID;
4877        return rc;
4878     }
4879     pCur->iPage = 0;
4880     pCur->curIntKey = pCur->apPage[0]->intKey;  /* remember the root's kind */
4881   }
4882   pRoot = pCur->apPage[0];
4883   assert( pRoot->pgno==pCur->pgnoRoot );
4884 
4885   /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4886   ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4887   ** NULL, the caller expects a table b-tree. If this is not the case,
4888   ** return an SQLITE_CORRUPT error.
4889   **
4890   ** Earlier versions of SQLite assumed that this test could not fail
4891   ** if the root page was already loaded when this function was called (i.e.
4892   ** if pCur->iPage>=0). But this is not so if the database is corrupted
4893   ** in such a way that page pRoot is linked into a second b-tree table
4894   ** (or the freelist).  */
4895   assert( pRoot->intKey==1 || pRoot->intKey==0 );
4896   if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
4897     return SQLITE_CORRUPT_BKPT;
4898   }
4899 
4900 skip_init:
4901   pCur->aiIdx[0] = 0;  /* position on the first cell of the root page */
4902   pCur->info.nSize = 0;
4903   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
4904 
4905   pRoot = pCur->apPage[0];
4906   if( pRoot->nCell>0 ){
4907     pCur->eState = CURSOR_VALID;
4908   }else if( !pRoot->leaf ){  /* interior root page holding zero cells */
4909     Pgno subpage;
4910     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;  /* page 1 only */
4911     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
4912     pCur->eState = CURSOR_VALID;
4913     rc = moveToChild(pCur, subpage);  /* descend to the page named at hdr+8 */
4914   }else{  /* leaf root page with zero cells: nothing to point at */
4915     pCur->eState = CURSOR_INVALID;
4916   }
4917   return rc;
4918 }
4919 
4920 /*
4921 ** Move the cursor down to the left-most leaf entry beneath the
4922 ** entry to which it is currently pointing.  Returns SQLITE_OK, or the
4923 ** first non-OK code returned by moveToChild() while descending.
4924 ** The left-most leaf is the one with the smallest key - the first
4925 ** in ascending order.
4926 */
4927 static int moveToLeftmost(BtCursor *pCur){
4928   Pgno pgno;
4929   int rc = SQLITE_OK;
4930   MemPage *pPage;
4931 
4932   assert( cursorOwnsBtShared(pCur) );
4933   assert( pCur->eState==CURSOR_VALID );
4934   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4935     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4936     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
4937     rc = moveToChild(pCur, pgno);  /* pgno is the cell's first four bytes */
4938   }
4939   return rc;
4940 }
4941 
4942 /*
4943 ** Move the cursor down to the right-most leaf entry beneath the
4944 ** page to which it is currently pointing.  Notice the difference
4945 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
4946 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4947 ** finds the right-most entry beneath the *page*.
4948 ** Returns SQLITE_OK, or the code of the first moveToChild() call to fail.
4949 ** The right-most entry is the one with the largest key - the last
4950 ** key in ascending order.
4951 */
4952 static int moveToRightmost(BtCursor *pCur){
4953   Pgno pgno;
4954   int rc = SQLITE_OK;
4955   MemPage *pPage = 0;
4956 
4957   assert( cursorOwnsBtShared(pCur) );
4958   assert( pCur->eState==CURSOR_VALID );
4959   while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4960     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4961     pCur->aiIdx[pCur->iPage] = pPage->nCell;  /* one past the last cell */
4962     rc = moveToChild(pCur, pgno);  /* pgno is the 4-byte value at hdr+8 */
4963     if( rc ) return rc;
4964   }
4965   pCur->aiIdx[pCur->iPage] = pPage->nCell-1;  /* last cell on the leaf */
4966   assert( pCur->info.nSize==0 );
4967   assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
4968   return SQLITE_OK;
4969 }
4970 
4971 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
4972 ** on success.  Set *pRes to 0 if the cursor actually points to something
4973 ** or set *pRes to 1 if the table is empty.  *pRes is not written when
4974 ** moveToRoot() fails; its error code is returned instead. */
4975 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
4976   int rc;
4977 
4978   assert( cursorOwnsBtShared(pCur) );
4979   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4980   rc = moveToRoot(pCur);
4981   if( rc==SQLITE_OK ){
4982     if( pCur->eState==CURSOR_INVALID ){  /* moveToRoot() found no entry */
4983       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4984       *pRes = 1;
4985     }else{
4986       assert( pCur->apPage[pCur->iPage]->nCell>0 );
4987       *pRes = 0;
4988       rc = moveToLeftmost(pCur);  /* descend from cell 0 to the first leaf */
4989     }
4990   }
4991   return rc;
4992 }
4993 
4994 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
4995 ** on success.  Set *pRes to 0 if the cursor actually points to something
4996 ** or set *pRes to 1 if the table is empty.  *pRes is left unwritten on
4997 ** the already-at-last shortcut and when moveToRoot() fails. */
4998 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
4999   int rc;
5000 
5001   assert( cursorOwnsBtShared(pCur) );
5002   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5003 
5004   /* If the cursor already points to the last entry, this is a no-op. */
5005   if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
5006 #ifdef SQLITE_DEBUG
5007     /* This block serves to assert() that the cursor really does point
5008     ** to the last entry in the b-tree. */
5009     int ii;
5010     for(ii=0; ii<pCur->iPage; ii++){
5011       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
5012     }
5013     assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
5014     assert( pCur->apPage[pCur->iPage]->leaf );
5015 #endif
5016     return SQLITE_OK;
5017   }
5018 
5019   rc = moveToRoot(pCur);
5020   if( rc==SQLITE_OK ){
5021     if( CURSOR_INVALID==pCur->eState ){  /* moveToRoot() found no entry */
5022       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
5023       *pRes = 1;
5024     }else{
5025       assert( pCur->eState==CURSOR_VALID );
5026       *pRes = 0;
5027       rc = moveToRightmost(pCur);
5028       if( rc==SQLITE_OK ){
5029         pCur->curFlags |= BTCF_AtLast;  /* enables the shortcut above */
5030       }else{
5031         pCur->curFlags &= ~BTCF_AtLast;  /* descent failed: do not claim it */
5032       }
5033 
5034     }
5035   }
5036   return rc;
5037 }
5038 
5039 /* Move the cursor so that it points to an entry near the key
5040 ** specified by pIdxKey or intKey.   Return a success code.
5041 **
5042 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
5043 ** must be NULL.  For index tables, pIdxKey is used and intKey
5044 ** is ignored.
5045 **
5046 ** If an exact match is not found, then the cursor is always
5047 ** left pointing at a leaf page which would hold the entry if it
5048 ** were present.  The cursor might point to an entry that comes
5049 ** before or after the key.
5050 **
5051 ** An integer is written into *pRes which is the result of
5052 ** comparing the key with the entry to which the cursor is
5053 ** pointing.  The meaning of the integer written into
5054 ** *pRes is as follows:
5055 **
5056 **     *pRes<0      The cursor is left pointing at an entry that
5057 **                  is smaller than intKey/pIdxKey or if the table is empty
5058 **                  and the cursor is therefore left pointing at nothing.
5059 **
5060 **     *pRes==0     The cursor is left pointing at an entry that
5061 **                  exactly matches intKey/pIdxKey.
5062 **
5063 **     *pRes>0      The cursor is left pointing at an entry that
5064 **                  is larger than intKey/pIdxKey.
5065 **
5066 ** For index tables, the pIdxKey->eqSeen field is set to 1 if there
5067 ** exists an entry in the table that exactly matches pIdxKey.
5068 */
5069 int sqlite3BtreeMovetoUnpacked(
5070   BtCursor *pCur,          /* The cursor to be moved */
5071   UnpackedRecord *pIdxKey, /* Unpacked index key */
5072   i64 intKey,              /* The table key */
5073   int biasRight,           /* If true, bias the search to the high end */
5074   int *pRes                /* Write search results here */
5075 ){
5076   int rc;
5077   RecordCompare xRecordCompare;
5078 
5079   assert( cursorOwnsBtShared(pCur) );
5080   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5081   assert( pRes );
5082   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
5083   assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) );
5084 
5085   /* If the cursor is already positioned at the point we are trying
5086   ** to move to, then just return without doing any work */
5087   if( pIdxKey==0
5088    && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
5089   ){
5090     if( pCur->info.nKey==intKey ){
5091       *pRes = 0;
5092       return SQLITE_OK;
5093     }
5094     if( pCur->info.nKey<intKey ){
5095       if( (pCur->curFlags & BTCF_AtLast)!=0 ){
5096         *pRes = -1;
5097         return SQLITE_OK;
5098       }
5099       /* If the requested key is one more than the previous key, then
5100       ** try to get there using sqlite3BtreeNext() rather than a full
5101       ** binary search.  This is an optimization only.  The correct answer
5102       ** is still obtained without this case, only a little more slowly */
5103       if( pCur->info.nKey+1==intKey && !pCur->skipNext ){
5104         *pRes = 0;
5105         rc = sqlite3BtreeNext(pCur, pRes);
5106         if( rc ) return rc;
5107         if( *pRes==0 ){
5108           getCellInfo(pCur);
5109           if( pCur->info.nKey==intKey ){
5110             return SQLITE_OK;
5111           }
5112         }
5113       }
5114     }
5115   }
5116 
5117   if( pIdxKey ){
5118     xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
5119     pIdxKey->errCode = 0;
5120     assert( pIdxKey->default_rc==1
5121          || pIdxKey->default_rc==0
5122          || pIdxKey->default_rc==-1
5123     );
5124   }else{
5125     xRecordCompare = 0; /* All keys are integers */
5126   }
5127 
5128   rc = moveToRoot(pCur);
5129   if( rc ){
5130     return rc;
5131   }
5132   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage] );
5133   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->isInit );
5134   assert( pCur->eState==CURSOR_INVALID || pCur->apPage[pCur->iPage]->nCell>0 );
5135   if( pCur->eState==CURSOR_INVALID ){
5136     *pRes = -1;
5137     assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
5138     return SQLITE_OK;
5139   }
5140   assert( pCur->apPage[0]->intKey==pCur->curIntKey );
5141   assert( pCur->curIntKey || pIdxKey );
5142   for(;;){  /* one iteration per b-tree level, from the root downward */
5143     int lwr, upr, idx, c;
5144     Pgno chldPg;
5145     MemPage *pPage = pCur->apPage[pCur->iPage];
5146     u8 *pCell;                          /* Pointer to current cell in pPage */
5147 
5148     /* pPage->nCell must be greater than zero. If this is the root-page
5149     ** the cursor would have been INVALID above and this for(;;) loop
5150     ** not run. If this is not the root-page, then the moveToChild() routine
5151     ** would have already detected db corruption. Similarly, pPage must
5152     ** be the right kind (index or table) of b-tree page. Otherwise
5153     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
5154     assert( pPage->nCell>0 );
5155     assert( pPage->intKey==(pIdxKey==0) );
5156     lwr = 0;
5157     upr = pPage->nCell-1;
5158     assert( biasRight==0 || biasRight==1 );
5159     idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
5160     pCur->aiIdx[pCur->iPage] = (u16)idx;
5161     if( xRecordCompare==0 ){  /* integer-key search */
5162       for(;;){
5163         i64 nCellKey;
5164         pCell = findCellPastPtr(pPage, idx);
5165         if( pPage->intKeyLeaf ){  /* step over the varint before the key */
5166           while( 0x80 <= *(pCell++) ){
5167             if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
5168           }
5169         }
5170         getVarint(pCell, (u64*)&nCellKey);
5171         if( nCellKey<intKey ){
5172           lwr = idx+1;
5173           if( lwr>upr ){ c = -1; break; }
5174         }else if( nCellKey>intKey ){
5175           upr = idx-1;
5176           if( lwr>upr ){ c = +1; break; }
5177         }else{
5178           assert( nCellKey==intKey );
5179           pCur->aiIdx[pCur->iPage] = (u16)idx;
5180           if( !pPage->leaf ){
5181             lwr = idx;  /* descend through the child page of cell idx */
5182             goto moveto_next_layer;
5183           }else{
5184             pCur->curFlags |= BTCF_ValidNKey;  /* info.nKey is usable now */
5185             pCur->info.nKey = nCellKey;
5186             pCur->info.nSize = 0;
5187             *pRes = 0;
5188             return SQLITE_OK;
5189           }
5190         }
5191         assert( lwr+upr>=0 );
5192         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
5193       }
5194     }else{  /* index search using the record comparator */
5195       for(;;){
5196         int nCell;  /* Size of the pCell cell in bytes */
5197         pCell = findCellPastPtr(pPage, idx);
5198 
5199         /* The maximum supported page-size is 65536 bytes. This means that
5200         ** the maximum number of record bytes stored on an index B-Tree
5201         ** page is less than 16384 bytes and may be stored as a 2-byte
5202         ** varint. This information is used to attempt to avoid parsing
5203         ** the entire cell by checking for the cases where the record is
5204         ** stored entirely within the b-tree page by inspecting the first
5205         ** 2 bytes of the cell.
5206         */
5207         nCell = pCell[0];
5208         if( nCell<=pPage->max1bytePayload ){
5209           /* This branch runs if the record-size field of the cell is a
5210           ** single byte varint and the record fits entirely on the main
5211           ** b-tree page.  */
5212           testcase( pCell+nCell+1==pPage->aDataEnd );
5213           c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
5214         }else if( !(pCell[1] & 0x80)
5215           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
5216         ){
5217           /* The record-size field is a 2 byte varint and the record
5218           ** fits entirely on the main b-tree page.  */
5219           testcase( pCell+nCell+2==pPage->aDataEnd );
5220           c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
5221         }else{
5222           /* The record flows over onto one or more overflow pages. In
5223           ** this case the whole cell needs to be parsed, a buffer allocated
5224           ** and accessPayload() used to retrieve the record into the
5225           ** buffer before VdbeRecordCompare() can be called.
5226           **
5227           ** If the record is corrupt, the xRecordCompare routine may read
5228           ** up to two varints past the end of the buffer. An extra 18
5229           ** bytes of padding is allocated at the end of the buffer in
5230           ** case this happens.  */
5231           void *pCellKey;
5232           u8 * const pCellBody = pCell - pPage->childPtrSize;
5233           pPage->xParseCell(pPage, pCellBody, &pCur->info);
5234           nCell = (int)pCur->info.nKey;
5235           testcase( nCell<0 );   /* True if key size is 2^32 or more */
5236           testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
5237           testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
5238           testcase( nCell==2 );  /* Minimum legal index key size */
5239           if( nCell<2 ){
5240             rc = SQLITE_CORRUPT_BKPT;
5241             goto moveto_finish;
5242           }
5243           pCellKey = sqlite3Malloc( nCell+18 );
5244           if( pCellKey==0 ){
5245             rc = SQLITE_NOMEM_BKPT;
5246             goto moveto_finish;
5247           }
5248           pCur->aiIdx[pCur->iPage] = (u16)idx;
5249           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 2);
5250           if( rc ){
5251             sqlite3_free(pCellKey);  /* buffer is freed on every exit path */
5252             goto moveto_finish;
5253           }
5254           c = xRecordCompare(nCell, pCellKey, pIdxKey);
5255           sqlite3_free(pCellKey);
5256         }
5257         assert(
5258             (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
5259          && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
5260         );
5261         if( c<0 ){
5262           lwr = idx+1;
5263         }else if( c>0 ){
5264           upr = idx-1;
5265         }else{
5266           assert( c==0 );
5267           *pRes = 0;
5268           rc = SQLITE_OK;
5269           pCur->aiIdx[pCur->iPage] = (u16)idx;
5270           if( pIdxKey->errCode ) rc = SQLITE_CORRUPT;
5271           goto moveto_finish;
5272         }
5273         if( lwr>upr ) break;
5274         assert( lwr+upr>=0 );
5275         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
5276       }
5277     }
5278     assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
5279     assert( pPage->isInit );
5280     if( pPage->leaf ){  /* no exact match: stop on the last cell compared */
5281       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
5282       pCur->aiIdx[pCur->iPage] = (u16)idx;
5283       *pRes = c;
5284       rc = SQLITE_OK;
5285       goto moveto_finish;
5286     }
5287 moveto_next_layer:
5288     if( lwr>=pPage->nCell ){  /* past the last cell: child pgno at hdr+8 */
5289       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5290     }else{
5291       chldPg = get4byte(findCell(pPage, lwr));
5292     }
5293     pCur->aiIdx[pCur->iPage] = (u16)lwr;
5294     rc = moveToChild(pCur, chldPg);
5295     if( rc ) break;  /* falls into moveto_finish with rc set */
5296   }
5297 moveto_finish:
5298   pCur->info.nSize = 0;
5299   assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5300   return rc;
5301 }
5302 
5303 
5304 /*
5305 ** Return TRUE if the cursor is not pointing at an entry of the table.
5306 **
5307 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
5308 ** past the last entry in the table or sqlite3BtreePrev() moves past
5309 ** the first entry.  TRUE is also returned if the table is empty.
5310 */
5311 int sqlite3BtreeEof(BtCursor *pCur){
5312   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
5313   ** have been deleted? This API will need to change to return an error code
5314   ** as well as the boolean result value.
5315   */
5316   return (CURSOR_VALID!=pCur->eState);  /* every non-VALID state is "EOF" */
5317 }
5318 
5319 /*
5320 ** Advance the cursor to the next entry in the database.  If
5321 ** successful then set *pRes=0.  If the cursor
5322 ** was already pointing to the last entry in the database before
5323 ** this routine was called, then set *pRes=1.
5324 **
5325 ** The main entry point is sqlite3BtreeNext().  That routine is optimized
5326 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
5327 ** to the next cell on the current page.  The (slower) btreeNext() helper
5328 ** routine is called when it is necessary to move to a different page or
5329 ** to restore the cursor.
5330 **
5331 ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
5332 ** will be 1 if the cursor being stepped corresponds to an SQL index and
5333 ** if this routine could have been skipped if that SQL index had been
5334 ** a unique index.  Otherwise the caller will have set *pRes to zero.
5335 ** Zero is the common case. The btree implementation is free to use the
5336 ** initial *pRes value as a hint to improve performance, but the current
5337 ** SQLite btree implementation does not. (Note that the comdb2 btree
5338 ** implementation does use this hint, however.)
5339 */
5340 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur, int *pRes){
5341   int rc;
5342   int idx;
5343   MemPage *pPage;
5344 
5345   assert( cursorOwnsBtShared(pCur) );
5346   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5347   assert( *pRes==0 );
5348   if( pCur->eState!=CURSOR_VALID ){
5349     assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5350     rc = restoreCursorPosition(pCur);
5351     if( rc!=SQLITE_OK ){
5352       return rc;
5353     }
5354     if( CURSOR_INVALID==pCur->eState ){
5355       *pRes = 1;
5356       return SQLITE_OK;
5357     }
5358     if( pCur->skipNext ){
5359       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
5360       pCur->eState = CURSOR_VALID;
5361       if( pCur->skipNext>0 ){  /* positive skipNext: return without stepping */
5362         pCur->skipNext = 0;
5363         return SQLITE_OK;
5364       }
5365       pCur->skipNext = 0;
5366     }
5367   }
5368 
5369   pPage = pCur->apPage[pCur->iPage];
5370   idx = ++pCur->aiIdx[pCur->iPage];  /* advance to the next cell index */
5371   assert( pPage->isInit );
5372 
5373   /* If the database file is corrupt, it is possible for the value of idx
5374   ** to be invalid here. This can only occur if a second cursor modifies
5375   ** the page while cursor pCur is holding a reference to it. Which can
5376   ** only happen if the database is corrupt in such a way as to link the
5377   ** page into more than one b-tree structure. */
5378   testcase( idx>pPage->nCell );
5379 
5380   if( idx>=pPage->nCell ){  /* stepped past the last cell of this page */
5381     if( !pPage->leaf ){  /* interior: go to the child at hdr+8, then left */
5382       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
5383       if( rc ) return rc;
5384       return moveToLeftmost(pCur);
5385     }
5386     do{  /* climb until an ancestor still has a cell at its current index */
5387       if( pCur->iPage==0 ){  /* ran out of ancestors: no next entry */
5388         *pRes = 1;
5389         pCur->eState = CURSOR_INVALID;
5390         return SQLITE_OK;
5391       }
5392       moveToParent(pCur);
5393       pPage = pCur->apPage[pCur->iPage];
5394     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
5395     if( pPage->intKey ){  /* intKey page: step again from this ancestor */
5396       return sqlite3BtreeNext(pCur, pRes);
5397     }else{  /* otherwise stop on the ancestor's current cell */
5398       return SQLITE_OK;
5399     }
5400   }
5401   if( pPage->leaf ){
5402     return SQLITE_OK;
5403   }else{
5404     return moveToLeftmost(pCur);
5405   }
5406 }
5407 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){  /* fast path of btreeNext */
5408   MemPage *pPage;
5409   assert( cursorOwnsBtShared(pCur) );
5410   assert( pRes!=0 );
5411   assert( *pRes==0 || *pRes==1 );
5412   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5413   pCur->info.nSize = 0;
5414   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5415   *pRes = 0;  /* the incoming *pRes hint is discarded */
5416   if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);
5417   pPage = pCur->apPage[pCur->iPage];
5418   if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){
5419     pCur->aiIdx[pCur->iPage]--;  /* undo: btreeNext() increments it again */
5420     return btreeNext(pCur, pRes);
5421   }
5422   if( pPage->leaf ){  /* common case: next cell on the same leaf page */
5423     return SQLITE_OK;
5424   }else{
5425     return moveToLeftmost(pCur);
5426   }
5427 }
5428 
5429 /*
5430 ** Step the cursor back to the previous entry in the database.  If
5431 ** successful then set *pRes=0.  If the cursor
5432 ** was already pointing to the first entry in the database before
5433 ** this routine was called, then set *pRes=1.
5434 **
5435 ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
5436 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
5437 ** to the previous cell on the current page.  The (slower) btreePrevious()
5438 ** helper routine is called when it is necessary to move to a different page
5439 ** or to restore the cursor.
5440 **
5441 ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
5442 ** will be 1 if the cursor being stepped corresponds to an SQL index and
5443 ** if this routine could have been skipped if that SQL index had been
5444 ** a unique index.  Otherwise the caller will have set *pRes to zero.
5445 ** Zero is the common case. The btree implementation is free to use the
5446 ** initial *pRes value as a hint to improve performance, but the current
5447 ** SQLite btree implementation does not. (Note that the comdb2 btree
5448 ** implementation does use this hint, however.)
5449 */
5450 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur, int *pRes){
5451   int rc;
5452   MemPage *pPage;
5453 
5454   assert( cursorOwnsBtShared(pCur) );
5455   assert( pRes!=0 );
5456   assert( *pRes==0 );
5457   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5458   assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
5459   assert( pCur->info.nSize==0 );
5460   if( pCur->eState!=CURSOR_VALID ){
5461     rc = restoreCursorPosition(pCur);
5462     if( rc!=SQLITE_OK ){
5463       return rc;
5464     }
5465     if( CURSOR_INVALID==pCur->eState ){
5466       *pRes = 1;
5467       return SQLITE_OK;
5468     }
5469     if( pCur->skipNext ){
5470       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
5471       pCur->eState = CURSOR_VALID;
5472       if( pCur->skipNext<0 ){  /* negative skipNext: return without stepping */
5473         pCur->skipNext = 0;
5474         return SQLITE_OK;
5475       }
5476       pCur->skipNext = 0;
5477     }
5478   }
5479 
5480   pPage = pCur->apPage[pCur->iPage];
5481   assert( pPage->isInit );
5482   if( !pPage->leaf ){  /* interior: enter cell idx's child, then go right */
5483     int idx = pCur->aiIdx[pCur->iPage];
5484     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
5485     if( rc ) return rc;
5486     rc = moveToRightmost(pCur);
5487   }else{
5488     while( pCur->aiIdx[pCur->iPage]==0 ){  /* climb while at cell index 0 */
5489       if( pCur->iPage==0 ){  /* ran out of ancestors: no previous entry */
5490         pCur->eState = CURSOR_INVALID;
5491         *pRes = 1;
5492         return SQLITE_OK;
5493       }
5494       moveToParent(pCur);
5495     }
5496     assert( pCur->info.nSize==0 );
5497     assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
5498 
5499     pCur->aiIdx[pCur->iPage]--;
5500     pPage = pCur->apPage[pCur->iPage];
5501     if( pPage->intKey && !pPage->leaf ){  /* intKey interior: step again */
5502       rc = sqlite3BtreePrevious(pCur, pRes);
5503     }else{
5504       rc = SQLITE_OK;
5505     }
5506   }
5507   return rc;
5508 }
5509 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){  /* fast-path wrapper */
5510   assert( cursorOwnsBtShared(pCur) );
5511   assert( pRes!=0 );
5512   assert( *pRes==0 || *pRes==1 );
5513   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5514   *pRes = 0;  /* the incoming *pRes hint is discarded */
5515   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
5516   pCur->info.nSize = 0;
5517   if( pCur->eState!=CURSOR_VALID
5518    || pCur->aiIdx[pCur->iPage]==0
5519    || pCur->apPage[pCur->iPage]->leaf==0
5520   ){
5521     return btreePrevious(pCur, pRes);  /* slow path: restore or change page */
5522   }
5523   pCur->aiIdx[pCur->iPage]--;  /* valid cursor on a leaf with index>0 */
5524   return SQLITE_OK;
5525 }
5526 
5527 /*
5528 ** Allocate a new page from the database file.
5529 **
5530 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
5531 ** has already been called on the new page.)  The new page has also
5532 ** been referenced and the calling routine is responsible for calling
5533 ** sqlite3PagerUnref() on the new page when it is done.
5534 **
5535 ** SQLITE_OK is returned on success.  Any other return value indicates
5536 ** an error.  *ppPage is set to NULL in the event of an error.
5537 **
5538 ** If the "nearby" parameter is not 0, then an effort is made to
5539 ** locate a page close to the page number "nearby".  This can be used in an
5540 ** attempt to keep related pages close to each other in the database file,
5541 ** which in turn can make database access faster.
5542 **
5543 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
5544 ** anywhere on the free-list, then it is guaranteed to be returned.  If
5545 ** eMode is BTALLOC_LE then the page returned will be less than or equal
5546 ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
5547 ** are no restrictions on which page is returned.
5548 */
5549 static int allocateBtreePage(
5550   BtShared *pBt,         /* The btree */
5551   MemPage **ppPage,      /* Store pointer to the allocated page here */
5552   Pgno *pPgno,           /* Store the page number here */
5553   Pgno nearby,           /* Search for a page near this one */
5554   u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
5555 ){
5556   MemPage *pPage1;
5557   int rc;
5558   u32 n;     /* Number of pages on the freelist */
5559   u32 k;     /* Number of leaves on the trunk of the freelist */
5560   MemPage *pTrunk = 0;
5561   MemPage *pPrevTrunk = 0;
5562   Pgno mxPage;     /* Total size of the database file */
5563 
5564   assert( sqlite3_mutex_held(pBt->mutex) );
5565   assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
5566   pPage1 = pBt->pPage1;
5567   mxPage = btreePagecount(pBt);
5568   /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
5569   ** stores stores the total number of pages on the freelist. */
5570   n = get4byte(&pPage1->aData[36]);
5571   testcase( n==mxPage-1 );
5572   if( n>=mxPage ){
5573     return SQLITE_CORRUPT_BKPT;
5574   }
5575   if( n>0 ){
5576     /* There are pages on the freelist.  Reuse one of those pages. */
5577     Pgno iTrunk;
5578     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
5579     u32 nSearch = 0;   /* Count of the number of search attempts */
5580 
5581     /* If eMode==BTALLOC_EXACT and a query of the pointer-map
5582     ** shows that the page 'nearby' is somewhere on the free-list, then
5583     ** the entire-list will be searched for that page.
5584     */
5585 #ifndef SQLITE_OMIT_AUTOVACUUM
5586     if( eMode==BTALLOC_EXACT ){
5587       if( nearby<=mxPage ){
5588         u8 eType;
5589         assert( nearby>0 );
5590         assert( pBt->autoVacuum );
5591         rc = ptrmapGet(pBt, nearby, &eType, 0);
5592         if( rc ) return rc;
5593         if( eType==PTRMAP_FREEPAGE ){
5594           searchList = 1;
5595         }
5596       }
5597     }else if( eMode==BTALLOC_LE ){
5598       searchList = 1;
5599     }
5600 #endif
5601 
5602     /* Decrement the free-list count by 1. Set iTrunk to the index of the
5603     ** first free-list trunk page. iPrevTrunk is initially 1.
5604     */
5605     rc = sqlite3PagerWrite(pPage1->pDbPage);
5606     if( rc ) return rc;
5607     put4byte(&pPage1->aData[36], n-1);
5608 
5609     /* The code within this loop is run only once if the 'searchList' variable
5610     ** is not true. Otherwise, it runs once for each trunk-page on the
5611     ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
5612     ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
5613     */
5614     do {
5615       pPrevTrunk = pTrunk;
5616       if( pPrevTrunk ){
5617         /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
5618         ** is the page number of the next freelist trunk page in the list or
5619         ** zero if this is the last freelist trunk page. */
5620         iTrunk = get4byte(&pPrevTrunk->aData[0]);
5621       }else{
5622         /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
5623         ** stores the page number of the first page of the freelist, or zero if
5624         ** the freelist is empty. */
5625         iTrunk = get4byte(&pPage1->aData[32]);
5626       }
5627       testcase( iTrunk==mxPage );
5628       if( iTrunk>mxPage || nSearch++ > n ){
5629         rc = SQLITE_CORRUPT_BKPT;
5630       }else{
5631         rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
5632       }
5633       if( rc ){
5634         pTrunk = 0;
5635         goto end_allocate_page;
5636       }
5637       assert( pTrunk!=0 );
5638       assert( pTrunk->aData!=0 );
5639       /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
5640       ** is the number of leaf page pointers to follow. */
5641       k = get4byte(&pTrunk->aData[4]);
5642       if( k==0 && !searchList ){
5643         /* The trunk has no leaves and the list is not being searched.
5644         ** So extract the trunk page itself and use it as the newly
5645         ** allocated page */
5646         assert( pPrevTrunk==0 );
5647         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5648         if( rc ){
5649           goto end_allocate_page;
5650         }
5651         *pPgno = iTrunk;
5652         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5653         *ppPage = pTrunk;
5654         pTrunk = 0;
5655         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5656       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
5657         /* Value of k is out of range.  Database corruption */
5658         rc = SQLITE_CORRUPT_BKPT;
5659         goto end_allocate_page;
5660 #ifndef SQLITE_OMIT_AUTOVACUUM
5661       }else if( searchList
5662             && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
5663       ){
5664         /* The list is being searched and this trunk page is the page
5665         ** to allocate, regardless of whether it has leaves.
5666         */
5667         *pPgno = iTrunk;
5668         *ppPage = pTrunk;
5669         searchList = 0;
5670         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5671         if( rc ){
5672           goto end_allocate_page;
5673         }
5674         if( k==0 ){
5675           if( !pPrevTrunk ){
5676             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5677           }else{
5678             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5679             if( rc!=SQLITE_OK ){
5680               goto end_allocate_page;
5681             }
5682             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
5683           }
5684         }else{
5685           /* The trunk page is required by the caller but it contains
5686           ** pointers to free-list leaves. The first leaf becomes a trunk
5687           ** page in this case.
5688           */
5689           MemPage *pNewTrunk;
5690           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
5691           if( iNewTrunk>mxPage ){
5692             rc = SQLITE_CORRUPT_BKPT;
5693             goto end_allocate_page;
5694           }
5695           testcase( iNewTrunk==mxPage );
5696           rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
5697           if( rc!=SQLITE_OK ){
5698             goto end_allocate_page;
5699           }
5700           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
5701           if( rc!=SQLITE_OK ){
5702             releasePage(pNewTrunk);
5703             goto end_allocate_page;
5704           }
5705           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
5706           put4byte(&pNewTrunk->aData[4], k-1);
5707           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
5708           releasePage(pNewTrunk);
5709           if( !pPrevTrunk ){
5710             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
5711             put4byte(&pPage1->aData[32], iNewTrunk);
5712           }else{
5713             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5714             if( rc ){
5715               goto end_allocate_page;
5716             }
5717             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
5718           }
5719         }
5720         pTrunk = 0;
5721         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5722 #endif
5723       }else if( k>0 ){
5724         /* Extract a leaf from the trunk */
5725         u32 closest;
5726         Pgno iPage;
5727         unsigned char *aData = pTrunk->aData;
5728         if( nearby>0 ){
5729           u32 i;
5730           closest = 0;
5731           if( eMode==BTALLOC_LE ){
5732             for(i=0; i<k; i++){
5733               iPage = get4byte(&aData[8+i*4]);
5734               if( iPage<=nearby ){
5735                 closest = i;
5736                 break;
5737               }
5738             }
5739           }else{
5740             int dist;
5741             dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
5742             for(i=1; i<k; i++){
5743               int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
5744               if( d2<dist ){
5745                 closest = i;
5746                 dist = d2;
5747               }
5748             }
5749           }
5750         }else{
5751           closest = 0;
5752         }
5753 
5754         iPage = get4byte(&aData[8+closest*4]);
5755         testcase( iPage==mxPage );
5756         if( iPage>mxPage ){
5757           rc = SQLITE_CORRUPT_BKPT;
5758           goto end_allocate_page;
5759         }
5760         testcase( iPage==mxPage );
5761         if( !searchList
5762          || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
5763         ){
5764           int noContent;
5765           *pPgno = iPage;
5766           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
5767                  ": %d more free pages\n",
5768                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
5769           rc = sqlite3PagerWrite(pTrunk->pDbPage);
5770           if( rc ) goto end_allocate_page;
5771           if( closest<k-1 ){
5772             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
5773           }
5774           put4byte(&aData[4], k-1);
5775           noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
5776           rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
5777           if( rc==SQLITE_OK ){
5778             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5779             if( rc!=SQLITE_OK ){
5780               releasePage(*ppPage);
5781               *ppPage = 0;
5782             }
5783           }
5784           searchList = 0;
5785         }
5786       }
5787       releasePage(pPrevTrunk);
5788       pPrevTrunk = 0;
5789     }while( searchList );
5790   }else{
5791     /* There are no pages on the freelist, so append a new page to the
5792     ** database image.
5793     **
5794     ** Normally, new pages allocated by this block can be requested from the
5795     ** pager layer with the 'no-content' flag set. This prevents the pager
5796     ** from trying to read the pages content from disk. However, if the
5797     ** current transaction has already run one or more incremental-vacuum
5798     ** steps, then the page we are about to allocate may contain content
5799     ** that is required in the event of a rollback. In this case, do
5800     ** not set the no-content flag. This causes the pager to load and journal
5801     ** the current page content before overwriting it.
5802     **
5803     ** Note that the pager will not actually attempt to load or journal
5804     ** content for any page that really does lie past the end of the database
5805     ** file on disk. So the effects of disabling the no-content optimization
5806     ** here are confined to those pages that lie between the end of the
5807     ** database image and the end of the database file.
5808     */
5809     int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
5810 
5811     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
5812     if( rc ) return rc;
5813     pBt->nPage++;
5814     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
5815 
5816 #ifndef SQLITE_OMIT_AUTOVACUUM
5817     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
5818       /* If *pPgno refers to a pointer-map page, allocate two new pages
5819       ** at the end of the file instead of one. The first allocated page
5820       ** becomes a new pointer-map page, the second is used by the caller.
5821       */
5822       MemPage *pPg = 0;
5823       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
5824       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
5825       rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
5826       if( rc==SQLITE_OK ){
5827         rc = sqlite3PagerWrite(pPg->pDbPage);
5828         releasePage(pPg);
5829       }
5830       if( rc ) return rc;
5831       pBt->nPage++;
5832       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
5833     }
5834 #endif
5835     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
5836     *pPgno = pBt->nPage;
5837 
5838     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5839     rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
5840     if( rc ) return rc;
5841     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5842     if( rc!=SQLITE_OK ){
5843       releasePage(*ppPage);
5844       *ppPage = 0;
5845     }
5846     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
5847   }
5848 
5849   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5850 
5851 end_allocate_page:
5852   releasePage(pTrunk);
5853   releasePage(pPrevTrunk);
5854   assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
5855   assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
5856   return rc;
5857 }
5858 
5859 /*
5860 ** This function is used to add page iPage to the database file free-list.
5861 ** It is assumed that the page is not already a part of the free-list.
5862 **
5863 ** The value passed as the second argument to this function is optional.
5864 ** If the caller happens to have a pointer to the MemPage object
5865 ** corresponding to page iPage handy, it may pass it as the second value.
5866 ** Otherwise, it may pass NULL.
5867 **
5868 ** If a pointer to a MemPage object is passed as the second argument,
5869 ** its reference count is not altered by this function.
5870 */
5871 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
5872   MemPage *pTrunk = 0;                /* Free-list trunk page */
5873   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
5874   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
5875   MemPage *pPage;                     /* Page being freed. May be NULL. */
5876   int rc;                             /* Return Code */
5877   int nFree;                          /* Initial number of pages on free-list */
5878 
5879   assert( sqlite3_mutex_held(pBt->mutex) );
5880   assert( CORRUPT_DB || iPage>1 );
5881   assert( !pMemPage || pMemPage->pgno==iPage );
5882 
5883   if( iPage<2 ) return SQLITE_CORRUPT_BKPT;
5884   if( pMemPage ){
5885     pPage = pMemPage;
5886     sqlite3PagerRef(pPage->pDbPage);
5887   }else{
5888     pPage = btreePageLookup(pBt, iPage);
5889   }
5890 
5891   /* Increment the free page count on pPage1 */
5892   rc = sqlite3PagerWrite(pPage1->pDbPage);
5893   if( rc ) goto freepage_out;
5894   nFree = get4byte(&pPage1->aData[36]);
5895   put4byte(&pPage1->aData[36], nFree+1);
5896 
5897   if( pBt->btsFlags & BTS_SECURE_DELETE ){
5898     /* If the secure_delete option is enabled, then
5899     ** always fully overwrite deleted information with zeros.
5900     */
5901     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
5902      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
5903     ){
5904       goto freepage_out;
5905     }
5906     memset(pPage->aData, 0, pPage->pBt->pageSize);
5907   }
5908 
5909   /* If the database supports auto-vacuum, write an entry in the pointer-map
5910   ** to indicate that the page is free.
5911   */
5912   if( ISAUTOVACUUM ){
5913     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
5914     if( rc ) goto freepage_out;
5915   }
5916 
5917   /* Now manipulate the actual database free-list structure. There are two
5918   ** possibilities. If the free-list is currently empty, or if the first
5919   ** trunk page in the free-list is full, then this page will become a
5920   ** new free-list trunk page. Otherwise, it will become a leaf of the
5921   ** first trunk page in the current free-list. This block tests if it
5922   ** is possible to add the page as a new free-list leaf.
5923   */
5924   if( nFree!=0 ){
5925     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
5926 
5927     iTrunk = get4byte(&pPage1->aData[32]);
5928     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
5929     if( rc!=SQLITE_OK ){
5930       goto freepage_out;
5931     }
5932 
5933     nLeaf = get4byte(&pTrunk->aData[4]);
5934     assert( pBt->usableSize>32 );
5935     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
5936       rc = SQLITE_CORRUPT_BKPT;
5937       goto freepage_out;
5938     }
5939     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
5940       /* In this case there is room on the trunk page to insert the page
5941       ** being freed as a new leaf.
5942       **
5943       ** Note that the trunk page is not really full until it contains
5944       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
5945       ** coded.  But due to a coding error in versions of SQLite prior to
5946       ** 3.6.0, databases with freelist trunk pages holding more than
5947       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
5948       ** to maintain backwards compatibility with older versions of SQLite,
5949       ** we will continue to restrict the number of entries to usableSize/4 - 8
5950       ** for now.  At some point in the future (once everyone has upgraded
5951       ** to 3.6.0 or later) we should consider fixing the conditional above
5952       ** to read "usableSize/4-2" instead of "usableSize/4-8".
5953       **
5954       ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
5955       ** avoid using the last six entries in the freelist trunk page array in
5956       ** order that database files created by newer versions of SQLite can be
5957       ** read by older versions of SQLite.
5958       */
5959       rc = sqlite3PagerWrite(pTrunk->pDbPage);
5960       if( rc==SQLITE_OK ){
5961         put4byte(&pTrunk->aData[4], nLeaf+1);
5962         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
5963         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
5964           sqlite3PagerDontWrite(pPage->pDbPage);
5965         }
5966         rc = btreeSetHasContent(pBt, iPage);
5967       }
5968       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
5969       goto freepage_out;
5970     }
5971   }
5972 
5973   /* If control flows to this point, then it was not possible to add the
5974   ** the page being freed as a leaf page of the first trunk in the free-list.
5975   ** Possibly because the free-list is empty, or possibly because the
5976   ** first trunk in the free-list is full. Either way, the page being freed
5977   ** will become the new first trunk page in the free-list.
5978   */
5979   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
5980     goto freepage_out;
5981   }
5982   rc = sqlite3PagerWrite(pPage->pDbPage);
5983   if( rc!=SQLITE_OK ){
5984     goto freepage_out;
5985   }
5986   put4byte(pPage->aData, iTrunk);
5987   put4byte(&pPage->aData[4], 0);
5988   put4byte(&pPage1->aData[32], iPage);
5989   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
5990 
5991 freepage_out:
5992   if( pPage ){
5993     pPage->isInit = 0;
5994   }
5995   releasePage(pPage);
5996   releasePage(pTrunk);
5997   return rc;
5998 }
5999 static void freePage(MemPage *pPage, int *pRC){
6000   if( (*pRC)==SQLITE_OK ){
6001     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
6002   }
6003 }
6004 
6005 /*
6006 ** Free any overflow pages associated with the given Cell.  Write the
6007 ** local Cell size (the number of bytes on the original page, omitting
6008 ** overflow) into *pnSize.
6009 */
6010 static int clearCell(
6011   MemPage *pPage,          /* The page that contains the Cell */
6012   unsigned char *pCell,    /* First byte of the Cell */
6013   CellInfo *pInfo          /* Size information about the cell */
6014 ){
6015   BtShared *pBt = pPage->pBt;
6016   Pgno ovflPgno;
6017   int rc;
6018   int nOvfl;
6019   u32 ovflPageSize;
6020 
6021   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6022   pPage->xParseCell(pPage, pCell, pInfo);
6023   if( pInfo->nLocal==pInfo->nPayload ){
6024     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
6025   }
6026   if( pCell+pInfo->nSize-1 > pPage->aData+pPage->maskPage ){
6027     return SQLITE_CORRUPT_BKPT;  /* Cell extends past end of page */
6028   }
6029   ovflPgno = get4byte(pCell + pInfo->nSize - 4);
6030   assert( pBt->usableSize > 4 );
6031   ovflPageSize = pBt->usableSize - 4;
6032   nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
6033   assert( nOvfl>0 ||
6034     (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
6035   );
6036   while( nOvfl-- ){
6037     Pgno iNext = 0;
6038     MemPage *pOvfl = 0;
6039     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
6040       /* 0 is not a legal page number and page 1 cannot be an
6041       ** overflow page. Therefore if ovflPgno<2 or past the end of the
6042       ** file the database must be corrupt. */
6043       return SQLITE_CORRUPT_BKPT;
6044     }
6045     if( nOvfl ){
6046       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
6047       if( rc ) return rc;
6048     }
6049 
6050     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
6051      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
6052     ){
6053       /* There is no reason any cursor should have an outstanding reference
6054       ** to an overflow page belonging to a cell that is being deleted/updated.
6055       ** So if there exists more than one reference to this page, then it
6056       ** must not really be an overflow page and the database must be corrupt.
6057       ** It is helpful to detect this before calling freePage2(), as
6058       ** freePage2() may zero the page contents if secure-delete mode is
6059       ** enabled. If this 'overflow' page happens to be a page that the
6060       ** caller is iterating through or using in some other way, this
6061       ** can be problematic.
6062       */
6063       rc = SQLITE_CORRUPT_BKPT;
6064     }else{
6065       rc = freePage2(pBt, pOvfl, ovflPgno);
6066     }
6067 
6068     if( pOvfl ){
6069       sqlite3PagerUnref(pOvfl->pDbPage);
6070     }
6071     if( rc ) return rc;
6072     ovflPgno = iNext;
6073   }
6074   return SQLITE_OK;
6075 }
6076 
/*
** Create the byte sequence used to represent a cell on page pPage
** and write that byte sequence into pCell[].  Overflow pages are
** allocated and filled in as necessary.  The calling procedure
** is responsible for making sure sufficient space has been allocated
** for pCell[].
**
** Note that pCell does not necessarily need to point to the pPage->aData
** area.  pCell might point to some temporary storage.  The cell will
** be constructed in this temporary area then copied into pPage->aData
** later.
**
** The cell is laid out as: pPage->childPtrSize bytes that this routine
** skips over without writing, a varint payload size, a varint integer key
** (intKey pages only), the local part of the payload and, if the payload
** does not fit locally, a 4-byte page number of the first overflow page.
**
** On intKey pages the payload is pX->nData bytes copied from pX->pData
** followed by pX->nZero bytes of zeros.  On other pages the payload is
** the pX->nKey bytes at pX->pKey.
**
** The size of the local cell is written into *pnSize.  Return SQLITE_OK
** on success, or the error code from allocateBtreePage() or ptrmapPut()
** if an overflow page could not be set up.
*/
static int fillInCell(
  MemPage *pPage,                /* The page that contains the cell */
  unsigned char *pCell,          /* Complete text of the cell */
  const BtreePayload *pX,        /* Payload with which to construct the cell */
  int *pnSize                    /* Write cell size here */
){
  int nPayload;                  /* Payload bytes still to be written */
  const u8 *pSrc;                /* Next byte of source content to copy */
  int nSrc, n, rc;               /* nSrc: source bytes left before zero-fill */
  int spaceLeft;                 /* Room left at pPayload in current buffer */
  MemPage *pOvfl = 0;            /* Most recently allocated overflow page */
  MemPage *pToRelease = 0;       /* Overflow page currently being filled */
  unsigned char *pPrior;         /* Where to store the next overflow pgno */
  unsigned char *pPayload;       /* Where to write the next payload byte */
  BtShared *pBt = pPage->pBt;
  Pgno pgnoOvfl = 0;             /* Page number of pOvfl.  0 if none yet */
  int nHeader;                   /* Bytes of cell header written so far */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  /* pPage is not necessarily writeable since pCell might be auxiliary
  ** buffer space that is separate from the pPage buffer area */
  assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

  /* Fill in the header. */
  nHeader = pPage->childPtrSize;
  if( pPage->intKey ){
    nPayload = pX->nData + pX->nZero;
    pSrc = pX->pData;
    nSrc = pX->nData;
    assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
    nHeader += putVarint32(&pCell[nHeader], nPayload);
    nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
  }else{
    assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
    nSrc = nPayload = (int)pX->nKey;
    pSrc = pX->pKey;
    nHeader += putVarint32(&pCell[nHeader], nPayload);
  }

  /* Fill in the payload */
  if( nPayload<=pPage->maxLocal ){
    /* Everything fits locally.  The cell size is rounded up to at least
    ** 4 bytes.  Since spaceLeft==nPayload, the copy loop below never
    ** reaches the overflow branch, so pPrior is never written through. */
    n = nHeader + nPayload;
    testcase( n==3 );
    testcase( n==4 );
    if( n<4 ) n = 4;
    *pnSize = n;
    spaceLeft = nPayload;
    pPrior = pCell;
  }else{
    /* Payload spills.  Compute the number of bytes n kept locally and
    ** reserve 4 bytes right after them for the first overflow pgno. */
    int mn = pPage->minLocal;
    n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
    testcase( n==pPage->maxLocal );
    testcase( n==pPage->maxLocal+1 );
    if( n > pPage->maxLocal ) n = mn;
    spaceLeft = n;
    *pnSize = n + nHeader + 4;
    pPrior = &pCell[nHeader+n];
  }
  pPayload = &pCell[nHeader];

  /* At this point variables should be set as follows:
  **
  **   nPayload           Total payload size in bytes
  **   pPayload           Begin writing payload here
  **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
  **                      that means content must spill into overflow pages.
  **   *pnSize            Size of the local cell (not counting overflow pages)
  **   pPrior             Where to write the pgno of the first overflow page
  **
  ** Use a call to btreeParseCellPtr() to verify that the values above
  ** were computed correctly.
  */
#if SQLITE_DEBUG
  {
    CellInfo info;
    pPage->xParseCell(pPage, pCell, &info);
    assert( nHeader==(int)(info.pPayload - pCell) );
    assert( info.nKey==pX->nKey );
    assert( *pnSize == info.nSize );
    assert( spaceLeft == info.nLocal );
  }
#endif

  /* Write the payload into the local Cell and any extra into overflow pages */
  while( nPayload>0 ){
    if( spaceLeft==0 ){
      /* The current buffer is full but payload remains: allocate another
      ** overflow page and link it in through pPrior. */
#ifndef SQLITE_OMIT_AUTOVACUUM
      Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
      if( pBt->autoVacuum ){
        /* Advance pgnoOvfl past any pointer-map page and the pending-byte
        ** page.  The result is passed to allocateBtreePage() as its
        ** "nearby" argument. */
        do{
          pgnoOvfl++;
        } while(
          PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
        );
      }
#endif
      rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the database supports auto-vacuum, and the second or subsequent
      ** overflow page is being allocated, add an entry to the pointer-map
      ** for that page now.
      **
      ** If this is the first overflow page, then write a partial entry
      ** to the pointer-map. If we write nothing to this pointer-map slot,
      ** then the optimistic overflow chain processing in clearCell()
      ** may misinterpret the uninitialized values and delete the
      ** wrong pages from the database.
      */
      if( pBt->autoVacuum && rc==SQLITE_OK ){
        /* pgnoPtrmap is the previous overflow page, or 0 for the first */
        u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
        ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
        if( rc ){
          releasePage(pOvfl);
        }
      }
#endif
      if( rc ){
        releasePage(pToRelease);
        return rc;
      }

      /* If pToRelease is not zero then pPrior points into the data area
      ** of pToRelease.  Make sure pToRelease is still writeable. */
      assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

      /* If pPrior is part of the data area of pPage, then make sure pPage
      ** is still writeable */
      assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

      /* Link the new page into the chain, then make it the current
      ** destination.  Its own next-page pointer starts out as 0 and is
      ** overwritten if yet another overflow page turns out to be needed. */
      put4byte(pPrior, pgnoOvfl);
      releasePage(pToRelease);
      pToRelease = pOvfl;
      pPrior = pOvfl->aData;
      put4byte(pPrior, 0);
      pPayload = &pOvfl->aData[4];
      spaceLeft = pBt->usableSize - 4;
    }
    n = nPayload;
    if( n>spaceLeft ) n = spaceLeft;

    /* If pToRelease is not zero then pPayload points into the data area
    ** of pToRelease.  Make sure pToRelease is still writeable. */
    assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

    /* If pPayload is part of the data area of pPage, then make sure pPage
    ** is still writeable */
    assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

    if( nSrc>0 ){
      /* Copy real content, but never more than the source still holds */
      if( n>nSrc ) n = nSrc;
      assert( pSrc );
      memcpy(pPayload, pSrc, n);
    }else{
      /* Source exhausted: the rest of the payload is zero-filled */
      memset(pPayload, 0, n);
    }
    nPayload -= n;
    pPayload += n;
    pSrc += n;
    nSrc -= n;
    spaceLeft -= n;
  }
  releasePage(pToRelease);
  return SQLITE_OK;
}
6257 
6258 /*
6259 ** Remove the i-th cell from pPage.  This routine effects pPage only.
6260 ** The cell content is not freed or deallocated.  It is assumed that
6261 ** the cell content has been copied someplace else.  This routine just
6262 ** removes the reference to the cell from pPage.
6263 **
6264 ** "sz" must be the number of bytes in the cell.
6265 */
6266 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
6267   u32 pc;         /* Offset to cell content of cell being deleted */
6268   u8 *data;       /* pPage->aData */
6269   u8 *ptr;        /* Used to move bytes around within data[] */
6270   int rc;         /* The return code */
6271   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
6272 
6273   if( *pRC ) return;
6274   assert( idx>=0 && idx<pPage->nCell );
6275   assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
6276   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6277   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6278   data = pPage->aData;
6279   ptr = &pPage->aCellIdx[2*idx];
6280   pc = get2byte(ptr);
6281   hdr = pPage->hdrOffset;
6282   testcase( pc==get2byte(&data[hdr+5]) );
6283   testcase( pc+sz==pPage->pBt->usableSize );
6284   if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
6285     *pRC = SQLITE_CORRUPT_BKPT;
6286     return;
6287   }
6288   rc = freeSpace(pPage, pc, sz);
6289   if( rc ){
6290     *pRC = rc;
6291     return;
6292   }
6293   pPage->nCell--;
6294   if( pPage->nCell==0 ){
6295     memset(&data[hdr+1], 0, 4);
6296     data[hdr+7] = 0;
6297     put2byte(&data[hdr+5], pPage->pBt->usableSize);
6298     pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
6299                        - pPage->childPtrSize - 8;
6300   }else{
6301     memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
6302     put2byte(&data[hdr+3], pPage->nCell);
6303     pPage->nFree += 2;
6304   }
6305 }
6306 
6307 /*
6308 ** Insert a new cell on pPage at cell index "i".  pCell points to the
6309 ** content of the cell.
6310 **
6311 ** If the cell content will fit on the page, then put it there.  If it
6312 ** will not fit, then make a copy of the cell content into pTemp if
6313 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
6314 ** in pPage->apOvfl[] and make it point to the cell content (either
6315 ** in pTemp or the original pCell) and also record its index.
6316 ** Allocating a new entry in pPage->aCell[] implies that
6317 ** pPage->nOverflow is incremented.
6318 **
6319 ** *pRC must be SQLITE_OK when this routine is called.
6320 */
6321 static void insertCell(
6322   MemPage *pPage,   /* Page into which we are copying */
6323   int i,            /* New cell becomes the i-th cell of the page */
6324   u8 *pCell,        /* Content of the new cell */
6325   int sz,           /* Bytes of content in pCell */
6326   u8 *pTemp,        /* Temp storage space for pCell, if needed */
6327   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
6328   int *pRC          /* Read and write return code from here */
6329 ){
6330   int idx = 0;      /* Where to write new cell content in data[] */
6331   int j;            /* Loop counter */
6332   u8 *data;         /* The content of the whole page */
6333   u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
6334 
6335   assert( *pRC==SQLITE_OK );
6336   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
6337   assert( MX_CELL(pPage->pBt)<=10921 );
6338   assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
6339   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
6340   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
6341   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6342   /* The cell should normally be sized correctly.  However, when moving a
6343   ** malformed cell from a leaf page to an interior page, if the cell size
6344   ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
6345   ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
6346   ** the term after the || in the following assert(). */
6347   assert( sz==pPage->xCellSize(pPage, pCell) || (sz==8 && iChild>0) );
6348   if( pPage->nOverflow || sz+2>pPage->nFree ){
6349     if( pTemp ){
6350       memcpy(pTemp, pCell, sz);
6351       pCell = pTemp;
6352     }
6353     if( iChild ){
6354       put4byte(pCell, iChild);
6355     }
6356     j = pPage->nOverflow++;
6357     /* Comparison against ArraySize-1 since we hold back one extra slot
6358     ** as a contingency.  In other words, never need more than 3 overflow
6359     ** slots but 4 are allocated, just to be safe. */
6360     assert( j < ArraySize(pPage->apOvfl)-1 );
6361     pPage->apOvfl[j] = pCell;
6362     pPage->aiOvfl[j] = (u16)i;
6363 
6364     /* When multiple overflows occur, they are always sequential and in
6365     ** sorted order.  This invariants arise because multiple overflows can
6366     ** only occur when inserting divider cells into the parent page during
6367     ** balancing, and the dividers are adjacent and sorted.
6368     */
6369     assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
6370     assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
6371   }else{
6372     int rc = sqlite3PagerWrite(pPage->pDbPage);
6373     if( rc!=SQLITE_OK ){
6374       *pRC = rc;
6375       return;
6376     }
6377     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6378     data = pPage->aData;
6379     assert( &data[pPage->cellOffset]==pPage->aCellIdx );
6380     rc = allocateSpace(pPage, sz, &idx);
6381     if( rc ){ *pRC = rc; return; }
6382     /* The allocateSpace() routine guarantees the following properties
6383     ** if it returns successfully */
6384     assert( idx >= 0 );
6385     assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
6386     assert( idx+sz <= (int)pPage->pBt->usableSize );
6387     pPage->nFree -= (u16)(2 + sz);
6388     memcpy(&data[idx], pCell, sz);
6389     if( iChild ){
6390       put4byte(&data[idx], iChild);
6391     }
6392     pIns = pPage->aCellIdx + i*2;
6393     memmove(pIns+2, pIns, 2*(pPage->nCell - i));
6394     put2byte(pIns, idx);
6395     pPage->nCell++;
6396     /* increment the cell count */
6397     if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
6398     assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell );
6399 #ifndef SQLITE_OMIT_AUTOVACUUM
6400     if( pPage->pBt->autoVacuum ){
6401       /* The cell may contain a pointer to an overflow page. If so, write
6402       ** the entry for the overflow page into the pointer map.
6403       */
6404       ptrmapPutOvflPtr(pPage, pCell, pRC);
6405     }
6406 #endif
6407   }
6408 }
6409 
/*
** A CellArray object contains a cache of pointers and sizes for a
** consecutive sequence of cells that might be held on multiple pages.
**
** Entries of szCell[] are filled in lazily: a value of 0 means the size
** of the corresponding cell has not been computed yet.  Sizes are
** computed by calling pRef->xCellSize() on the cell.
*/
typedef struct CellArray CellArray;
struct CellArray {
  int nCell;              /* Number of cells in apCell[] */
  MemPage *pRef;          /* Reference page */
  u8 **apCell;            /* All cells being balanced */
  u16 *szCell;            /* Local size of all cells in apCell[] */
};
6421 
6422 /*
6423 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
6424 ** computed.
6425 */
6426 static void populateCellCache(CellArray *p, int idx, int N){
6427   assert( idx>=0 && idx+N<=p->nCell );
6428   while( N>0 ){
6429     assert( p->apCell[idx]!=0 );
6430     if( p->szCell[idx]==0 ){
6431       p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
6432     }else{
6433       assert( CORRUPT_DB ||
6434               p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
6435     }
6436     idx++;
6437     N--;
6438   }
6439 }
6440 
6441 /*
6442 ** Return the size of the Nth element of the cell array
6443 */
6444 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
6445   assert( N>=0 && N<p->nCell );
6446   assert( p->szCell[N]==0 );
6447   p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
6448   return p->szCell[N];
6449 }
6450 static u16 cachedCellSize(CellArray *p, int N){
6451   assert( N>=0 && N<p->nCell );
6452   if( p->szCell[N] ) return p->szCell[N];
6453   return computeCellSize(p, N);
6454 }
6455 
6456 /*
6457 ** Array apCell[] contains pointers to nCell b-tree page cells. The
6458 ** szCell[] array contains the size in bytes of each cell. This function
6459 ** replaces the current contents of page pPg with the contents of the cell
6460 ** array.
6461 **
6462 ** Some of the cells in apCell[] may currently be stored in pPg. This
6463 ** function works around problems caused by this by making a copy of any
6464 ** such cells before overwriting the page data.
6465 **
6466 ** The MemPage.nFree field is invalidated by this function. It is the
6467 ** responsibility of the caller to set it correctly.
6468 */
6469 static int rebuildPage(
6470   MemPage *pPg,                   /* Edit this page */
6471   int nCell,                      /* Final number of cells on page */
6472   u8 **apCell,                    /* Array of cells */
6473   u16 *szCell                     /* Array of cell sizes */
6474 ){
6475   const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
6476   u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
6477   const int usableSize = pPg->pBt->usableSize;
6478   u8 * const pEnd = &aData[usableSize];
6479   int i;
6480   u8 *pCellptr = pPg->aCellIdx;
6481   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6482   u8 *pData;
6483 
6484   i = get2byte(&aData[hdr+5]);
6485   memcpy(&pTmp[i], &aData[i], usableSize - i);
6486 
6487   pData = pEnd;
6488   for(i=0; i<nCell; i++){
6489     u8 *pCell = apCell[i];
6490     if( SQLITE_WITHIN(pCell,aData,pEnd) ){
6491       pCell = &pTmp[pCell - aData];
6492     }
6493     pData -= szCell[i];
6494     put2byte(pCellptr, (pData - aData));
6495     pCellptr += 2;
6496     if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
6497     memcpy(pData, pCell, szCell[i]);
6498     assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
6499     testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) );
6500   }
6501 
6502   /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
6503   pPg->nCell = nCell;
6504   pPg->nOverflow = 0;
6505 
6506   put2byte(&aData[hdr+1], 0);
6507   put2byte(&aData[hdr+3], pPg->nCell);
6508   put2byte(&aData[hdr+5], pData - aData);
6509   aData[hdr+7] = 0x00;
6510   return SQLITE_OK;
6511 }
6512 
6513 /*
6514 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
6515 ** contains the size in bytes of each such cell. This function attempts to
6516 ** add the cells stored in the array to page pPg. If it cannot (because
6517 ** the page needs to be defragmented before the cells will fit), non-zero
6518 ** is returned. Otherwise, if the cells are added successfully, zero is
6519 ** returned.
6520 **
6521 ** Argument pCellptr points to the first entry in the cell-pointer array
6522 ** (part of page pPg) to populate. After cell apCell[0] is written to the
6523 ** page body, a 16-bit offset is written to pCellptr. And so on, for each
6524 ** cell in the array. It is the responsibility of the caller to ensure
6525 ** that it is safe to overwrite this part of the cell-pointer array.
6526 **
6527 ** When this function is called, *ppData points to the start of the
6528 ** content area on page pPg. If the size of the content area is extended,
6529 ** *ppData is updated to point to the new start of the content area
6530 ** before returning.
6531 **
6532 ** Finally, argument pBegin points to the byte immediately following the
6533 ** end of the space required by this page for the cell-pointer area (for
6534 ** all cells - not just those inserted by the current call). If the content
6535 ** area must be extended to before this point in order to accomodate all
6536 ** cells in apCell[], then the cells do not fit and non-zero is returned.
6537 */
6538 static int pageInsertArray(
6539   MemPage *pPg,                   /* Page to add cells to */
6540   u8 *pBegin,                     /* End of cell-pointer array */
6541   u8 **ppData,                    /* IN/OUT: Page content -area pointer */
6542   u8 *pCellptr,                   /* Pointer to cell-pointer area */
6543   int iFirst,                     /* Index of first cell to add */
6544   int nCell,                      /* Number of cells to add to pPg */
6545   CellArray *pCArray              /* Array of cells */
6546 ){
6547   int i;
6548   u8 *aData = pPg->aData;
6549   u8 *pData = *ppData;
6550   int iEnd = iFirst + nCell;
6551   assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
6552   for(i=iFirst; i<iEnd; i++){
6553     int sz, rc;
6554     u8 *pSlot;
6555     sz = cachedCellSize(pCArray, i);
6556     if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
6557       if( (pData - pBegin)<sz ) return 1;
6558       pData -= sz;
6559       pSlot = pData;
6560     }
6561     /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
6562     ** database.  But they might for a corrupt database.  Hence use memmove()
6563     ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
6564     assert( (pSlot+sz)<=pCArray->apCell[i]
6565          || pSlot>=(pCArray->apCell[i]+sz)
6566          || CORRUPT_DB );
6567     memmove(pSlot, pCArray->apCell[i], sz);
6568     put2byte(pCellptr, (pSlot - aData));
6569     pCellptr += 2;
6570   }
6571   *ppData = pData;
6572   return 0;
6573 }
6574 
6575 /*
6576 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
6577 ** contains the size in bytes of each such cell. This function adds the
6578 ** space associated with each cell in the array that is currently stored
6579 ** within the body of pPg to the pPg free-list. The cell-pointers and other
6580 ** fields of the page are not updated.
6581 **
6582 ** This function returns the total number of cells added to the free-list.
6583 */
6584 static int pageFreeArray(
6585   MemPage *pPg,                   /* Page to edit */
6586   int iFirst,                     /* First cell to delete */
6587   int nCell,                      /* Cells to delete */
6588   CellArray *pCArray              /* Array of cells */
6589 ){
6590   u8 * const aData = pPg->aData;
6591   u8 * const pEnd = &aData[pPg->pBt->usableSize];
6592   u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
6593   int nRet = 0;
6594   int i;
6595   int iEnd = iFirst + nCell;
6596   u8 *pFree = 0;
6597   int szFree = 0;
6598 
6599   for(i=iFirst; i<iEnd; i++){
6600     u8 *pCell = pCArray->apCell[i];
6601     if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
6602       int sz;
6603       /* No need to use cachedCellSize() here.  The sizes of all cells that
6604       ** are to be freed have already been computing while deciding which
6605       ** cells need freeing */
6606       sz = pCArray->szCell[i];  assert( sz>0 );
6607       if( pFree!=(pCell + sz) ){
6608         if( pFree ){
6609           assert( pFree>aData && (pFree - aData)<65536 );
6610           freeSpace(pPg, (u16)(pFree - aData), szFree);
6611         }
6612         pFree = pCell;
6613         szFree = sz;
6614         if( pFree+sz>pEnd ) return 0;
6615       }else{
6616         pFree = pCell;
6617         szFree += sz;
6618       }
6619       nRet++;
6620     }
6621   }
6622   if( pFree ){
6623     assert( pFree>aData && (pFree - aData)<65536 );
6624     freeSpace(pPg, (u16)(pFree - aData), szFree);
6625   }
6626   return nRet;
6627 }
6628 
6629 /*
6630 ** apCell[] and szCell[] contains pointers to and sizes of all cells in the
6631 ** pages being balanced.  The current page, pPg, has pPg->nCell cells starting
6632 ** with apCell[iOld].  After balancing, this page should hold nNew cells
6633 ** starting at apCell[iNew].
6634 **
6635 ** This routine makes the necessary adjustments to pPg so that it contains
6636 ** the correct cells after being balanced.
6637 **
6638 ** The pPg->nFree field is invalid when this function returns. It is the
6639 ** responsibility of the caller to set it correctly.
6640 */
6641 static int editPage(
6642   MemPage *pPg,                   /* Edit this page */
6643   int iOld,                       /* Index of first cell currently on page */
6644   int iNew,                       /* Index of new first cell on page */
6645   int nNew,                       /* Final number of cells on page */
6646   CellArray *pCArray              /* Array of cells and sizes */
6647 ){
6648   u8 * const aData = pPg->aData;
6649   const int hdr = pPg->hdrOffset;
6650   u8 *pBegin = &pPg->aCellIdx[nNew * 2];
6651   int nCell = pPg->nCell;       /* Cells stored on pPg */
6652   u8 *pData;
6653   u8 *pCellptr;
6654   int i;
6655   int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
6656   int iNewEnd = iNew + nNew;
6657 
6658 #ifdef SQLITE_DEBUG
6659   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6660   memcpy(pTmp, aData, pPg->pBt->usableSize);
6661 #endif
6662 
6663   /* Remove cells from the start and end of the page */
6664   if( iOld<iNew ){
6665     int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
6666     memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
6667     nCell -= nShift;
6668   }
6669   if( iNewEnd < iOldEnd ){
6670     nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
6671   }
6672 
6673   pData = &aData[get2byteNotZero(&aData[hdr+5])];
6674   if( pData<pBegin ) goto editpage_fail;
6675 
6676   /* Add cells to the start of the page */
6677   if( iNew<iOld ){
6678     int nAdd = MIN(nNew,iOld-iNew);
6679     assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
6680     pCellptr = pPg->aCellIdx;
6681     memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
6682     if( pageInsertArray(
6683           pPg, pBegin, &pData, pCellptr,
6684           iNew, nAdd, pCArray
6685     ) ) goto editpage_fail;
6686     nCell += nAdd;
6687   }
6688 
6689   /* Add any overflow cells */
6690   for(i=0; i<pPg->nOverflow; i++){
6691     int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
6692     if( iCell>=0 && iCell<nNew ){
6693       pCellptr = &pPg->aCellIdx[iCell * 2];
6694       memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
6695       nCell++;
6696       if( pageInsertArray(
6697             pPg, pBegin, &pData, pCellptr,
6698             iCell+iNew, 1, pCArray
6699       ) ) goto editpage_fail;
6700     }
6701   }
6702 
6703   /* Append cells to the end of the page */
6704   pCellptr = &pPg->aCellIdx[nCell*2];
6705   if( pageInsertArray(
6706         pPg, pBegin, &pData, pCellptr,
6707         iNew+nCell, nNew-nCell, pCArray
6708   ) ) goto editpage_fail;
6709 
6710   pPg->nCell = nNew;
6711   pPg->nOverflow = 0;
6712 
6713   put2byte(&aData[hdr+3], pPg->nCell);
6714   put2byte(&aData[hdr+5], pData - aData);
6715 
6716 #ifdef SQLITE_DEBUG
6717   for(i=0; i<nNew && !CORRUPT_DB; i++){
6718     u8 *pCell = pCArray->apCell[i+iNew];
6719     int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
6720     if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
6721       pCell = &pTmp[pCell - aData];
6722     }
6723     assert( 0==memcmp(pCell, &aData[iOff],
6724             pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
6725   }
6726 #endif
6727 
6728   return SQLITE_OK;
6729  editpage_fail:
6730   /* Unable to edit this page. Rebuild it from scratch instead. */
6731   populateCellCache(pCArray, iNew, nNew);
6732   return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]);
6733 }
6734 
6735 /*
6736 ** The following parameters determine how many adjacent pages get involved
6737 ** in a balancing operation.  NN is the number of neighbors on either side
6738 ** of the page that participate in the balancing operation.  NB is the
6739 ** total number of pages that participate, including the target page and
6740 ** NN neighbors on either side.
6741 **
6742 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
6743 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
6744 ** in exchange for a larger degradation in INSERT and UPDATE performance.
6745 ** The current value of NN (1) appears to give the best results overall.
6746 */
6747 #define NN 1             /* Number of neighbors on either side of pPage */
6748 #define NB (NN*2+1)      /* Total pages involved in the balance */
6749 
6750 
6751 #ifndef SQLITE_OMIT_QUICKBALANCE
6752 /*
6753 ** This version of balance() handles the common special case where
6754 ** a new entry is being inserted on the extreme right-end of the
6755 ** tree, in other words, when the new entry will become the largest
6756 ** entry in the tree.
6757 **
6758 ** Instead of trying to balance the 3 right-most leaf pages, just add
6759 ** a new page to the right-hand side and put the one new entry in
6760 ** that page.  This leaves the right side of the tree somewhat
6761 ** unbalanced.  But odds are that we will be inserting new entries
6762 ** at the end soon afterwards so the nearly empty page will quickly
6763 ** fill up.  On average.
6764 **
6765 ** pPage is the leaf page which is the right-most page in the tree.
6766 ** pParent is its parent.  pPage must have a single overflow entry
6767 ** which is also the right-most entry on the page.
6768 **
6769 ** The pSpace buffer is used to store a temporary copy of the divider
6770 ** cell that will be inserted into pParent. Such a cell consists of a 4
6771 ** byte page number followed by a variable length integer. In other
6772 ** words, at most 13 bytes. Hence the pSpace buffer must be at
6773 ** least 13 bytes in size.
6774 */
6775 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
6776   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
6777   MemPage *pNew;                       /* Newly allocated page */
6778   int rc;                              /* Return Code */
6779   Pgno pgnoNew;                        /* Page number of pNew */
6780 
6781   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6782   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6783   assert( pPage->nOverflow==1 );
6784 
6785   /* This error condition is now caught prior to reaching this function */
6786   if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;
6787 
6788   /* Allocate a new page. This page will become the right-sibling of
6789   ** pPage. Make the parent page writable, so that the new divider cell
6790   ** may be inserted. If both these operations are successful, proceed.
6791   */
6792   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
6793 
6794   if( rc==SQLITE_OK ){
6795 
6796     u8 *pOut = &pSpace[4];
6797     u8 *pCell = pPage->apOvfl[0];
6798     u16 szCell = pPage->xCellSize(pPage, pCell);
6799     u8 *pStop;
6800 
6801     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
6802     assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
6803     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
6804     rc = rebuildPage(pNew, 1, &pCell, &szCell);
6805     if( NEVER(rc) ) return rc;
6806     pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
6807 
6808     /* If this is an auto-vacuum database, update the pointer map
6809     ** with entries for the new page, and any pointer from the
6810     ** cell on the page to an overflow page. If either of these
6811     ** operations fails, the return code is set, but the contents
6812     ** of the parent page are still manipulated by thh code below.
6813     ** That is Ok, at this point the parent page is guaranteed to
6814     ** be marked as dirty. Returning an error code will cause a
6815     ** rollback, undoing any changes made to the parent page.
6816     */
6817     if( ISAUTOVACUUM ){
6818       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
6819       if( szCell>pNew->minLocal ){
6820         ptrmapPutOvflPtr(pNew, pCell, &rc);
6821       }
6822     }
6823 
6824     /* Create a divider cell to insert into pParent. The divider cell
6825     ** consists of a 4-byte page number (the page number of pPage) and
6826     ** a variable length key value (which must be the same value as the
6827     ** largest key on pPage).
6828     **
6829     ** To find the largest key value on pPage, first find the right-most
6830     ** cell on pPage. The first two fields of this cell are the
6831     ** record-length (a variable length integer at most 32-bits in size)
6832     ** and the key value (a variable length integer, may have any value).
6833     ** The first of the while(...) loops below skips over the record-length
6834     ** field. The second while(...) loop copies the key value from the
6835     ** cell on pPage into the pSpace buffer.
6836     */
6837     pCell = findCell(pPage, pPage->nCell-1);
6838     pStop = &pCell[9];
6839     while( (*(pCell++)&0x80) && pCell<pStop );
6840     pStop = &pCell[9];
6841     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
6842 
6843     /* Insert the new divider cell into pParent. */
6844     if( rc==SQLITE_OK ){
6845       insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
6846                    0, pPage->pgno, &rc);
6847     }
6848 
6849     /* Set the right-child pointer of pParent to point to the new page. */
6850     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
6851 
6852     /* Release the reference to the new page. */
6853     releasePage(pNew);
6854   }
6855 
6856   return rc;
6857 }
6858 #endif /* SQLITE_OMIT_QUICKBALANCE */
6859 
#if 0
/*
** This function does not contribute anything to the operation of SQLite.
** It is sometimes activated temporarily while debugging code responsible
** for setting pointer-map entries.
**
** For each of the nPage pages in apPage[], assert that the pointer-map
** entry of every overflow page referenced by a cell, and of every child
** page (on non-leaf pages), names that page as the parent with the
** expected entry type.  Always returns 1.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int iPage;
  for(iPage=0; iPage<nPage; iPage++){
    MemPage *pPage = apPage[iPage];
    BtShared *pBt = pPage->pBt;
    Pgno pgnoParent;              /* Parent page read from the ptrmap */
    u8 eType;                     /* Entry type read from the ptrmap */
    int iCell;
    assert( pPage->isInit );

    for(iCell=0; iCell<pPage->nCell; iCell++){
      CellInfo info;
      u8 *z = findCell(pPage, iCell);

      pPage->xParseCell(pPage, z, &info);
      if( info.nLocal<info.nPayload ){
        /* Payload spills: check the first overflow page */
        Pgno ovfl = get4byte(&z[info.nSize-4]);
        ptrmapGet(pBt, ovfl, &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_OVERFLOW1 );
      }
      if( !pPage->leaf ){
        /* Interior cell: check its left-child page */
        Pgno child = get4byte(z);
        ptrmapGet(pBt, child, &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
      }
    }
    if( !pPage->leaf ){
      /* Interior page: check the right-most child page */
      Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
      ptrmapGet(pBt, child, &eType, &pgnoParent);
      assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
    }
  }
  return 1;
}
#endif
6901 
6902 /*
6903 ** This function is used to copy the contents of the b-tree node stored
6904 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
6905 ** the pointer-map entries for each child page are updated so that the
6906 ** parent page stored in the pointer map is page pTo. If pFrom contained
6907 ** any cells with overflow page pointers, then the corresponding pointer
6908 ** map entries are also updated so that the parent page is page pTo.
6909 **
6910 ** If pFrom is currently carrying any overflow cells (entries in the
6911 ** MemPage.apOvfl[] array), they are not copied to pTo.
6912 **
6913 ** Before returning, page pTo is reinitialized using btreeInitPage().
6914 **
6915 ** The performance of this function is not critical. It is only used by
6916 ** the balance_shallower() and balance_deeper() procedures, neither of
6917 ** which are called often under normal circumstances.
6918 */
6919 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
6920   if( (*pRC)==SQLITE_OK ){
6921     BtShared * const pBt = pFrom->pBt;
6922     u8 * const aFrom = pFrom->aData;
6923     u8 * const aTo = pTo->aData;
6924     int const iFromHdr = pFrom->hdrOffset;
6925     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
6926     int rc;
6927     int iData;
6928 
6929 
6930     assert( pFrom->isInit );
6931     assert( pFrom->nFree>=iToHdr );
6932     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
6933 
6934     /* Copy the b-tree node content from page pFrom to page pTo. */
6935     iData = get2byte(&aFrom[iFromHdr+5]);
6936     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
6937     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
6938 
6939     /* Reinitialize page pTo so that the contents of the MemPage structure
6940     ** match the new data. The initialization of pTo can actually fail under
6941     ** fairly obscure circumstances, even though it is a copy of initialized
6942     ** page pFrom.
6943     */
6944     pTo->isInit = 0;
6945     rc = btreeInitPage(pTo);
6946     if( rc!=SQLITE_OK ){
6947       *pRC = rc;
6948       return;
6949     }
6950 
6951     /* If this is an auto-vacuum database, update the pointer-map entries
6952     ** for any b-tree or overflow pages that pTo now contains the pointers to.
6953     */
6954     if( ISAUTOVACUUM ){
6955       *pRC = setChildPtrmaps(pTo);
6956     }
6957   }
6958 }
6959 
6960 /*
6961 ** This routine redistributes cells on the iParentIdx'th child of pParent
6962 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
6963 ** same amount of free space. Usually a single sibling on either side of the
6964 ** page is used in the balancing, though both siblings might come from one
6965 ** side if the page is the first or last child of its parent. If the page
6966 ** has fewer than 2 siblings (something which can only happen if the page
6967 ** is a root page or a child of a root page) then all available siblings
6968 ** participate in the balancing.
6969 **
6970 ** The number of siblings of the page might be increased or decreased by
6971 ** one or two in an effort to keep pages nearly full but not over full.
6972 **
6973 ** Note that when this routine is called, some of the cells on the page
6974 ** might not actually be stored in MemPage.aData[]. This can happen
6975 ** if the page is overfull. This routine ensures that all cells allocated
6976 ** to the page and its siblings fit into MemPage.aData[] before returning.
6977 **
6978 ** In the course of balancing the page and its siblings, cells may be
6979 ** inserted into or removed from the parent page (pParent). Doing so
6980 ** may cause the parent page to become overfull or underfull. If this
6981 ** happens, it is the responsibility of the caller to invoke the correct
6982 ** balancing routine to fix this problem (see the balance() routine).
6983 **
6984 ** If this routine fails for any reason, it might leave the database
6985 ** in a corrupted state. So if this routine fails, the database should
6986 ** be rolled back.
6987 **
6988 ** The third argument to this function, aOvflSpace, is a pointer to a
6989 ** buffer big enough to hold one page. If while inserting cells into the parent
6990 ** page (pParent) the parent page becomes overfull, this buffer is
6991 ** used to store the parent's overflow cells. Because this function inserts
6992 ** a maximum of four divider cells into the parent page, and the maximum
6993 ** size of a cell stored within an internal node is always less than 1/4
6994 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
6995 ** enough for all overflow cells.
6996 **
6997 ** If aOvflSpace is set to a null pointer, this function returns
6998 ** SQLITE_NOMEM.
6999 */
7000 static int balance_nonroot(
7001   MemPage *pParent,               /* Parent page of siblings being balanced */
7002   int iParentIdx,                 /* Index of "the page" in pParent */
7003   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
7004   int isRoot,                     /* True if pParent is a root-page */
7005   int bBulk                       /* True if this call is part of a bulk load */
7006 ){
7007   BtShared *pBt;               /* The whole database */
7008   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
7009   int nNew = 0;                /* Number of pages in apNew[] */
7010   int nOld;                    /* Number of pages in apOld[] */
7011   int i, j, k;                 /* Loop counters */
7012   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
7013   int rc = SQLITE_OK;          /* The return code */
7014   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
7015   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
7016   int usableSpace;             /* Bytes in pPage beyond the header */
7017   int pageFlags;               /* Value of pPage->aData[0] */
7018   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
7019   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
7020   int szScratch;               /* Size of scratch memory requested */
7021   MemPage *apOld[NB];          /* pPage and up to two siblings */
7022   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
7023   u8 *pRight;                  /* Location in parent of right-sibling pointer */
7024   u8 *apDiv[NB-1];             /* Divider cells in pParent */
7025   int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
7026   int cntOld[NB+2];            /* Old index in b.apCell[] */
7027   int szNew[NB+2];             /* Combined size of cells placed on i-th page */
7028   u8 *aSpace1;                 /* Space for copies of dividers cells */
7029   Pgno pgno;                   /* Temp var to store a page number in */
7030   u8 abDone[NB+2];             /* True after i'th new page is populated */
7031   Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
7032   Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
7033   u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
7034   CellArray b;                  /* Parsed information on cells being balanced */
7035 
7036   memset(abDone, 0, sizeof(abDone));
7037   b.nCell = 0;
7038   b.apCell = 0;
7039   pBt = pParent->pBt;
7040   assert( sqlite3_mutex_held(pBt->mutex) );
7041   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7042 
7043 #if 0
7044   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
7045 #endif
7046 
7047   /* At this point pParent may have at most one overflow cell. And if
7048   ** this overflow cell is present, it must be the cell with
7049   ** index iParentIdx. This scenario comes about when this function
7050   ** is called (indirectly) from sqlite3BtreeDelete().
7051   */
7052   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
7053   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
7054 
7055   if( !aOvflSpace ){
7056     return SQLITE_NOMEM_BKPT;
7057   }
7058 
7059   /* Find the sibling pages to balance. Also locate the cells in pParent
7060   ** that divide the siblings. An attempt is made to find NN siblings on
7061   ** either side of pPage. More siblings are taken from one side, however,
7062   ** if there are fewer than NN siblings on the other side. If pParent
7063   ** has NB or fewer children then all children of pParent are taken.
7064   **
7065   ** This loop also drops the divider cells from the parent page. This
7066   ** way, the remainder of the function does not have to deal with any
7067   ** overflow cells in the parent page, since if any existed they will
7068   ** have already been removed.
7069   */
7070   i = pParent->nOverflow + pParent->nCell;
7071   if( i<2 ){
7072     nxDiv = 0;
7073   }else{
7074     assert( bBulk==0 || bBulk==1 );
7075     if( iParentIdx==0 ){
7076       nxDiv = 0;
7077     }else if( iParentIdx==i ){
7078       nxDiv = i-2+bBulk;
7079     }else{
7080       nxDiv = iParentIdx-1;
7081     }
7082     i = 2-bBulk;
7083   }
7084   nOld = i+1;
7085   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
7086     pRight = &pParent->aData[pParent->hdrOffset+8];
7087   }else{
7088     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
7089   }
7090   pgno = get4byte(pRight);
7091   while( 1 ){
7092     rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
7093     if( rc ){
7094       memset(apOld, 0, (i+1)*sizeof(MemPage*));
7095       goto balance_cleanup;
7096     }
7097     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
7098     if( (i--)==0 ) break;
7099 
7100     if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
7101       apDiv[i] = pParent->apOvfl[0];
7102       pgno = get4byte(apDiv[i]);
7103       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7104       pParent->nOverflow = 0;
7105     }else{
7106       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
7107       pgno = get4byte(apDiv[i]);
7108       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7109 
7110       /* Drop the cell from the parent page. apDiv[i] still points to
7111       ** the cell within the parent, even though it has been dropped.
7112       ** This is safe because dropping a cell only overwrites the first
7113       ** four bytes of it, and this function does not need the first
7114       ** four bytes of the divider cell. So the pointer is safe to use
7115       ** later on.
7116       **
7117       ** But not if we are in secure-delete mode. In secure-delete mode,
7118       ** the dropCell() routine will overwrite the entire cell with zeroes.
7119       ** In this case, temporarily copy the cell into the aOvflSpace[]
7120       ** buffer. It will be copied out again as soon as the aSpace[] buffer
7121       ** is allocated.  */
7122       if( pBt->btsFlags & BTS_SECURE_DELETE ){
7123         int iOff;
7124 
7125         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
7126         if( (iOff+szNew[i])>(int)pBt->usableSize ){
7127           rc = SQLITE_CORRUPT_BKPT;
7128           memset(apOld, 0, (i+1)*sizeof(MemPage*));
7129           goto balance_cleanup;
7130         }else{
7131           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
7132           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
7133         }
7134       }
7135       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
7136     }
7137   }
7138 
7139   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
7140   ** alignment */
7141   nMaxCells = (nMaxCells + 3)&~3;
7142 
7143   /*
7144   ** Allocate space for memory structures
7145   */
7146   szScratch =
7147        nMaxCells*sizeof(u8*)                       /* b.apCell */
7148      + nMaxCells*sizeof(u16)                       /* b.szCell */
7149      + pBt->pageSize;                              /* aSpace1 */
7150 
7151   /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer
7152   ** that is more than 6 times the database page size. */
7153   assert( szScratch<=6*(int)pBt->pageSize );
7154   b.apCell = sqlite3ScratchMalloc( szScratch );
7155   if( b.apCell==0 ){
7156     rc = SQLITE_NOMEM_BKPT;
7157     goto balance_cleanup;
7158   }
7159   b.szCell = (u16*)&b.apCell[nMaxCells];
7160   aSpace1 = (u8*)&b.szCell[nMaxCells];
7161   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
7162 
7163   /*
7164   ** Load pointers to all cells on sibling pages and the divider cells
7165   ** into the local b.apCell[] array.  Make copies of the divider cells
7166   ** into space obtained from aSpace1[]. The divider cells have already
7167   ** been removed from pParent.
7168   **
7169   ** If the siblings are on leaf pages, then the child pointers of the
7170   ** divider cells are stripped from the cells before they are copied
7171   ** into aSpace1[].  In this way, all cells in b.apCell[] are without
7172   ** child pointers.  If siblings are not leaves, then all cell in
7173   ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
7174   ** are alike.
7175   **
7176   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
7177   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
7178   */
7179   b.pRef = apOld[0];
7180   leafCorrection = b.pRef->leaf*4;
7181   leafData = b.pRef->intKeyLeaf;
7182   for(i=0; i<nOld; i++){
7183     MemPage *pOld = apOld[i];
7184     int limit = pOld->nCell;
7185     u8 *aData = pOld->aData;
7186     u16 maskPage = pOld->maskPage;
7187     u8 *piCell = aData + pOld->cellOffset;
7188     u8 *piEnd;
7189 
7190     /* Verify that all sibling pages are of the same "type" (table-leaf,
7191     ** table-interior, index-leaf, or index-interior).
7192     */
7193     if( pOld->aData[0]!=apOld[0]->aData[0] ){
7194       rc = SQLITE_CORRUPT_BKPT;
7195       goto balance_cleanup;
7196     }
7197 
7198     /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
7199     ** contains overflow cells, include them in the b.apCell[] array
7200     ** in the correct spot.
7201     **
7202     ** Note that when there are multiple overflow cells, it is always the
7203     ** case that they are sequential and adjacent.  This invariant arises
7204     ** because multiple overflows can only occur when inserting divider
7205     ** cells into a parent on a prior balance, and divider cells are always
7206     ** adjacent and are inserted in order.  There is an assert() tagged
7207     ** with "NOTE 1" in the overflow cell insertion loop to prove this
7208     ** invariant.
7209     **
7210     ** This must be done in advance.  Once the balance starts, the cell
7211     ** offset section of the btree page will be overwritten and we will no
7212     ** longer be able to find the cells if a pointer to each cell is not saved
7213     ** first.
7214     */
7215     memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
7216     if( pOld->nOverflow>0 ){
7217       limit = pOld->aiOvfl[0];
7218       for(j=0; j<limit; j++){
7219         b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7220         piCell += 2;
7221         b.nCell++;
7222       }
7223       for(k=0; k<pOld->nOverflow; k++){
7224         assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
7225         b.apCell[b.nCell] = pOld->apOvfl[k];
7226         b.nCell++;
7227       }
7228     }
7229     piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
7230     while( piCell<piEnd ){
7231       assert( b.nCell<nMaxCells );
7232       b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7233       piCell += 2;
7234       b.nCell++;
7235     }
7236 
7237     cntOld[i] = b.nCell;
7238     if( i<nOld-1 && !leafData){
7239       u16 sz = (u16)szNew[i];
7240       u8 *pTemp;
7241       assert( b.nCell<nMaxCells );
7242       b.szCell[b.nCell] = sz;
7243       pTemp = &aSpace1[iSpace1];
7244       iSpace1 += sz;
7245       assert( sz<=pBt->maxLocal+23 );
7246       assert( iSpace1 <= (int)pBt->pageSize );
7247       memcpy(pTemp, apDiv[i], sz);
7248       b.apCell[b.nCell] = pTemp+leafCorrection;
7249       assert( leafCorrection==0 || leafCorrection==4 );
7250       b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
7251       if( !pOld->leaf ){
7252         assert( leafCorrection==0 );
7253         assert( pOld->hdrOffset==0 );
7254         /* The right pointer of the child page pOld becomes the left
7255         ** pointer of the divider cell */
7256         memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
7257       }else{
7258         assert( leafCorrection==4 );
7259         while( b.szCell[b.nCell]<4 ){
7260           /* Do not allow any cells smaller than 4 bytes. If a smaller cell
7261           ** does exist, pad it with 0x00 bytes. */
7262           assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
7263           assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
7264           aSpace1[iSpace1++] = 0x00;
7265           b.szCell[b.nCell]++;
7266         }
7267       }
7268       b.nCell++;
7269     }
7270   }
7271 
7272   /*
7273   ** Figure out the number of pages needed to hold all b.nCell cells.
7274   ** Store this number in "k".  Also compute szNew[] which is the total
7275   ** size of all cells on the i-th page and cntNew[] which is the index
7276   ** in b.apCell[] of the cell that divides page i from page i+1.
7277   ** cntNew[k] should equal b.nCell.
7278   **
7279   ** Values computed by this block:
7280   **
7281   **           k: The total number of sibling pages
7282   **    szNew[i]: Space used on the i-th sibling page.
7283   **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
7284   **              the right of the i-th sibling page.
7285   ** usableSpace: Number of bytes of space available on each sibling.
7286   **
7287   */
7288   usableSpace = pBt->usableSize - 12 + leafCorrection;
7289   for(i=0; i<nOld; i++){
7290     MemPage *p = apOld[i];
7291     szNew[i] = usableSpace - p->nFree;
7292     for(j=0; j<p->nOverflow; j++){
7293       szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
7294     }
7295     cntNew[i] = cntOld[i];
7296   }
7297   k = nOld;
7298   for(i=0; i<k; i++){
7299     int sz;
7300     while( szNew[i]>usableSpace ){
7301       if( i+1>=k ){
7302         k = i+2;
7303         if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
7304         szNew[k-1] = 0;
7305         cntNew[k-1] = b.nCell;
7306       }
7307       sz = 2 + cachedCellSize(&b, cntNew[i]-1);
7308       szNew[i] -= sz;
7309       if( !leafData ){
7310         if( cntNew[i]<b.nCell ){
7311           sz = 2 + cachedCellSize(&b, cntNew[i]);
7312         }else{
7313           sz = 0;
7314         }
7315       }
7316       szNew[i+1] += sz;
7317       cntNew[i]--;
7318     }
7319     while( cntNew[i]<b.nCell ){
7320       sz = 2 + cachedCellSize(&b, cntNew[i]);
7321       if( szNew[i]+sz>usableSpace ) break;
7322       szNew[i] += sz;
7323       cntNew[i]++;
7324       if( !leafData ){
7325         if( cntNew[i]<b.nCell ){
7326           sz = 2 + cachedCellSize(&b, cntNew[i]);
7327         }else{
7328           sz = 0;
7329         }
7330       }
7331       szNew[i+1] -= sz;
7332     }
7333     if( cntNew[i]>=b.nCell ){
7334       k = i+1;
7335     }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
7336       rc = SQLITE_CORRUPT_BKPT;
7337       goto balance_cleanup;
7338     }
7339   }
7340 
7341   /*
7342   ** The packing computed by the previous block is biased toward the siblings
7343   ** on the left side (siblings with smaller keys). The left siblings are
7344   ** always nearly full, while the right-most sibling might be nearly empty.
7345   ** The next block of code attempts to adjust the packing of siblings to
7346   ** get a better balance.
7347   **
7348   ** This adjustment is more than an optimization.  The packing above might
7349   ** be so out of balance as to be illegal.  For example, the right-most
7350   ** sibling might be completely empty.  This adjustment is not optional.
7351   */
7352   for(i=k-1; i>0; i--){
7353     int szRight = szNew[i];  /* Size of sibling on the right */
7354     int szLeft = szNew[i-1]; /* Size of sibling on the left */
7355     int r;              /* Index of right-most cell in left sibling */
7356     int d;              /* Index of first cell to the left of right sibling */
7357 
7358     r = cntNew[i-1] - 1;
7359     d = r + 1 - leafData;
7360     (void)cachedCellSize(&b, d);
7361     do{
7362       assert( d<nMaxCells );
7363       assert( r<nMaxCells );
7364       (void)cachedCellSize(&b, r);
7365       if( szRight!=0
7366        && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
7367         break;
7368       }
7369       szRight += b.szCell[d] + 2;
7370       szLeft -= b.szCell[r] + 2;
7371       cntNew[i-1] = r;
7372       r--;
7373       d--;
7374     }while( r>=0 );
7375     szNew[i] = szRight;
7376     szNew[i-1] = szLeft;
7377     if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
7378       rc = SQLITE_CORRUPT_BKPT;
7379       goto balance_cleanup;
7380     }
7381   }
7382 
7383   /* Sanity check:  For a non-corrupt database file one of the following
7384   ** must be true:
7385   **    (1) We found one or more cells (cntNew[0])>0), or
7386   **    (2) pPage is a virtual root page.  A virtual root page is when
7387   **        the real root page is page 1 and we are the only child of
7388   **        that page.
7389   */
7390   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
7391   TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
7392     apOld[0]->pgno, apOld[0]->nCell,
7393     nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
7394     nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
7395   ));
7396 
7397   /*
7398   ** Allocate k new pages.  Reuse old pages where possible.
7399   */
7400   pageFlags = apOld[0]->aData[0];
7401   for(i=0; i<k; i++){
7402     MemPage *pNew;
7403     if( i<nOld ){
7404       pNew = apNew[i] = apOld[i];
7405       apOld[i] = 0;
7406       rc = sqlite3PagerWrite(pNew->pDbPage);
7407       nNew++;
7408       if( rc ) goto balance_cleanup;
7409     }else{
7410       assert( i>0 );
7411       rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
7412       if( rc ) goto balance_cleanup;
7413       zeroPage(pNew, pageFlags);
7414       apNew[i] = pNew;
7415       nNew++;
7416       cntOld[i] = b.nCell;
7417 
7418       /* Set the pointer-map entry for the new sibling page. */
7419       if( ISAUTOVACUUM ){
7420         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
7421         if( rc!=SQLITE_OK ){
7422           goto balance_cleanup;
7423         }
7424       }
7425     }
7426   }
7427 
7428   /*
7429   ** Reassign page numbers so that the new pages are in ascending order.
7430   ** This helps to keep entries in the disk file in order so that a scan
7431   ** of the table is closer to a linear scan through the file. That in turn
7432   ** helps the operating system to deliver pages from the disk more rapidly.
7433   **
7434   ** An O(n^2) insertion sort algorithm is used, but since n is never more
7435   ** than (NB+2) (a small constant), that should not be a problem.
7436   **
7437   ** When NB==3, this one optimization makes the database about 25% faster
7438   ** for large insertions and deletions.
7439   */
7440   for(i=0; i<nNew; i++){
7441     aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
7442     aPgFlags[i] = apNew[i]->pDbPage->flags;
7443     for(j=0; j<i; j++){
7444       if( aPgno[j]==aPgno[i] ){
7445         /* This branch is taken if the set of sibling pages somehow contains
7446         ** duplicate entries. This can happen if the database is corrupt.
7447         ** It would be simpler to detect this as part of the loop below, but
7448         ** we do the detection here in order to avoid populating the pager
7449         ** cache with two separate objects associated with the same
7450         ** page number.  */
7451         assert( CORRUPT_DB );
7452         rc = SQLITE_CORRUPT_BKPT;
7453         goto balance_cleanup;
7454       }
7455     }
7456   }
7457   for(i=0; i<nNew; i++){
7458     int iBest = 0;                /* aPgno[] index of page number to use */
7459     for(j=1; j<nNew; j++){
7460       if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
7461     }
7462     pgno = aPgOrder[iBest];
7463     aPgOrder[iBest] = 0xffffffff;
7464     if( iBest!=i ){
7465       if( iBest>i ){
7466         sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
7467       }
7468       sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
7469       apNew[i]->pgno = pgno;
7470     }
7471   }
7472 
7473   TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
7474          "%d(%d nc=%d) %d(%d nc=%d)\n",
7475     apNew[0]->pgno, szNew[0], cntNew[0],
7476     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
7477     nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
7478     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
7479     nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
7480     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
7481     nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
7482     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
7483     nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
7484   ));
7485 
7486   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7487   put4byte(pRight, apNew[nNew-1]->pgno);
7488 
7489   /* If the sibling pages are not leaves, ensure that the right-child pointer
7490   ** of the right-most new sibling page is set to the value that was
7491   ** originally in the same field of the right-most old sibling page. */
7492   if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
7493     MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
7494     memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
7495   }
7496 
7497   /* Make any required updates to pointer map entries associated with
7498   ** cells stored on sibling pages following the balance operation. Pointer
7499   ** map entries associated with divider cells are set by the insertCell()
7500   ** routine. The associated pointer map entries are:
7501   **
7502   **   a) if the cell contains a reference to an overflow chain, the
7503   **      entry associated with the first page in the overflow chain, and
7504   **
7505   **   b) if the sibling pages are not leaves, the child page associated
7506   **      with the cell.
7507   **
7508   ** If the sibling pages are not leaves, then the pointer map entry
7509   ** associated with the right-child of each sibling may also need to be
7510   ** updated. This happens below, after the sibling pages have been
7511   ** populated, not here.
7512   */
7513   if( ISAUTOVACUUM ){
7514     MemPage *pNew = apNew[0];
7515     u8 *aOld = pNew->aData;
7516     int cntOldNext = pNew->nCell + pNew->nOverflow;
7517     int usableSize = pBt->usableSize;
7518     int iNew = 0;
7519     int iOld = 0;
7520 
7521     for(i=0; i<b.nCell; i++){
7522       u8 *pCell = b.apCell[i];
7523       if( i==cntOldNext ){
7524         MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
7525         cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
7526         aOld = pOld->aData;
7527       }
7528       if( i==cntNew[iNew] ){
7529         pNew = apNew[++iNew];
7530         if( !leafData ) continue;
7531       }
7532 
7533       /* Cell pCell is destined for new sibling page pNew. Originally, it
7534       ** was either part of sibling page iOld (possibly an overflow cell),
7535       ** or else the divider cell to the left of sibling page iOld. So,
7536       ** if sibling page iOld had the same page number as pNew, and if
7537       ** pCell really was a part of sibling page iOld (not a divider or
7538       ** overflow cell), we can skip updating the pointer map entries.  */
7539       if( iOld>=nNew
7540        || pNew->pgno!=aPgno[iOld]
7541        || !SQLITE_WITHIN(pCell,aOld,&aOld[usableSize])
7542       ){
7543         if( !leafCorrection ){
7544           ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
7545         }
7546         if( cachedCellSize(&b,i)>pNew->minLocal ){
7547           ptrmapPutOvflPtr(pNew, pCell, &rc);
7548         }
7549         if( rc ) goto balance_cleanup;
7550       }
7551     }
7552   }
7553 
7554   /* Insert new divider cells into pParent. */
7555   for(i=0; i<nNew-1; i++){
7556     u8 *pCell;
7557     u8 *pTemp;
7558     int sz;
7559     MemPage *pNew = apNew[i];
7560     j = cntNew[i];
7561 
7562     assert( j<nMaxCells );
7563     assert( b.apCell[j]!=0 );
7564     pCell = b.apCell[j];
7565     sz = b.szCell[j] + leafCorrection;
7566     pTemp = &aOvflSpace[iOvflSpace];
7567     if( !pNew->leaf ){
7568       memcpy(&pNew->aData[8], pCell, 4);
7569     }else if( leafData ){
7570       /* If the tree is a leaf-data tree, and the siblings are leaves,
7571       ** then there is no divider cell in b.apCell[]. Instead, the divider
7572       ** cell consists of the integer key for the right-most cell of
7573       ** the sibling-page assembled above only.
7574       */
7575       CellInfo info;
7576       j--;
7577       pNew->xParseCell(pNew, b.apCell[j], &info);
7578       pCell = pTemp;
7579       sz = 4 + putVarint(&pCell[4], info.nKey);
7580       pTemp = 0;
7581     }else{
7582       pCell -= 4;
7583       /* Obscure case for non-leaf-data trees: If the cell at pCell was
7584       ** previously stored on a leaf node, and its reported size was 4
7585       ** bytes, then it may actually be smaller than this
7586       ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
7587       ** any cell). But it is important to pass the correct size to
7588       ** insertCell(), so reparse the cell now.
7589       **
7590       ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
7591       ** and WITHOUT ROWID tables with exactly one column which is the
7592       ** primary key.
7593       */
7594       if( b.szCell[j]==4 ){
7595         assert(leafCorrection==4);
7596         sz = pParent->xCellSize(pParent, pCell);
7597       }
7598     }
7599     iOvflSpace += sz;
7600     assert( sz<=pBt->maxLocal+23 );
7601     assert( iOvflSpace <= (int)pBt->pageSize );
7602     insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
7603     if( rc!=SQLITE_OK ) goto balance_cleanup;
7604     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7605   }
7606 
7607   /* Now update the actual sibling pages. The order in which they are updated
7608   ** is important, as this code needs to avoid disrupting any page from which
7609   ** cells may still need to be read. In practice, this means:
7610   **
7611   **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
7612   **      then it is not safe to update page apNew[iPg] until after
7613   **      the left-hand sibling apNew[iPg-1] has been updated.
7614   **
7615   **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
7616   **      then it is not safe to update page apNew[iPg] until after
7617   **      the right-hand sibling apNew[iPg+1] has been updated.
7618   **
7619   ** If neither of the above apply, the page is safe to update.
7620   **
7621   ** The iPg value in the following loop starts at nNew-1 goes down
7622   ** to 0, then back up to nNew-1 again, thus making two passes over
7623   ** the pages.  On the initial downward pass, only condition (1) above
7624   ** needs to be tested because (2) will always be true from the previous
7625   ** step.  On the upward pass, both conditions are always true, so the
7626   ** upwards pass simply processes pages that were missed on the downward
7627   ** pass.
7628   */
7629   for(i=1-nNew; i<nNew; i++){
7630     int iPg = i<0 ? -i : i;
7631     assert( iPg>=0 && iPg<nNew );
7632     if( abDone[iPg] ) continue;         /* Skip pages already processed */
7633     if( i>=0                            /* On the upwards pass, or... */
7634      || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
7635     ){
7636       int iNew;
7637       int iOld;
7638       int nNewCell;
7639 
7640       /* Verify condition (1):  If cells are moving left, update iPg
7641       ** only after iPg-1 has already been updated. */
7642       assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
7643 
7644       /* Verify condition (2):  If cells are moving right, update iPg
7645       ** only after iPg+1 has already been updated. */
7646       assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
7647 
7648       if( iPg==0 ){
7649         iNew = iOld = 0;
7650         nNewCell = cntNew[0];
7651       }else{
7652         iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
7653         iNew = cntNew[iPg-1] + !leafData;
7654         nNewCell = cntNew[iPg] - iNew;
7655       }
7656 
7657       rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
7658       if( rc ) goto balance_cleanup;
7659       abDone[iPg]++;
7660       apNew[iPg]->nFree = usableSpace-szNew[iPg];
7661       assert( apNew[iPg]->nOverflow==0 );
7662       assert( apNew[iPg]->nCell==nNewCell );
7663     }
7664   }
7665 
7666   /* All pages have been processed exactly once */
7667   assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
7668 
7669   assert( nOld>0 );
7670   assert( nNew>0 );
7671 
7672   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
7673     /* The root page of the b-tree now contains no cells. The only sibling
7674     ** page is the right-child of the parent. Copy the contents of the
7675     ** child page into the parent, decreasing the overall height of the
7676     ** b-tree structure by one. This is described as the "balance-shallower"
7677     ** sub-algorithm in some documentation.
7678     **
7679     ** If this is an auto-vacuum database, the call to copyNodeContent()
7680     ** sets all pointer-map entries corresponding to database image pages
7681     ** for which the pointer is stored within the content being copied.
7682     **
7683     ** It is critical that the child page be defragmented before being
7684     ** copied into the parent, because if the parent is page 1 then it will
7685     ** be smaller than the child due to the database header, and so all the
7686     ** free space needs to be up front.
7687     */
7688     assert( nNew==1 || CORRUPT_DB );
7689     rc = defragmentPage(apNew[0]);
7690     testcase( rc!=SQLITE_OK );
7691     assert( apNew[0]->nFree ==
7692         (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
7693       || rc!=SQLITE_OK
7694     );
7695     copyNodeContent(apNew[0], pParent, &rc);
7696     freePage(apNew[0], &rc);
7697   }else if( ISAUTOVACUUM && !leafCorrection ){
7698     /* Fix the pointer map entries associated with the right-child of each
7699     ** sibling page. All other pointer map entries have already been taken
7700     ** care of.  */
7701     for(i=0; i<nNew; i++){
7702       u32 key = get4byte(&apNew[i]->aData[8]);
7703       ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
7704     }
7705   }
7706 
7707   assert( pParent->isInit );
7708   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
7709           nOld, nNew, b.nCell));
7710 
7711   /* Free any old pages that were not reused as new pages.
7712   */
7713   for(i=nNew; i<nOld; i++){
7714     freePage(apOld[i], &rc);
7715   }
7716 
7717 #if 0
7718   if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
7719     /* The ptrmapCheckPages() contains assert() statements that verify that
7720     ** all pointer map pages are set correctly. This is helpful while
7721     ** debugging. This is usually disabled because a corrupt database may
7722     ** cause an assert() statement to fail.  */
7723     ptrmapCheckPages(apNew, nNew);
7724     ptrmapCheckPages(&pParent, 1);
7725   }
7726 #endif
7727 
7728   /*
7729   ** Cleanup before returning.
7730   */
7731 balance_cleanup:
7732   sqlite3ScratchFree(b.apCell);
7733   for(i=0; i<nOld; i++){
7734     releasePage(apOld[i]);
7735   }
7736   for(i=0; i<nNew; i++){
7737     releasePage(apNew[i]);
7738   }
7739 
7740   return rc;
7741 }
7742 
7743 
7744 /*
7745 ** This function is called when the root page of a b-tree structure is
7746 ** overfull (has one or more overflow pages).
7747 **
7748 ** A new child page is allocated and the contents of the current root
7749 ** page, including overflow cells, are copied into the child. The root
7750 ** page is then overwritten to make it an empty page with the right-child
7751 ** pointer pointing to the new page.
7752 **
7753 ** Before returning, all pointer-map entries corresponding to pages
7754 ** that the new child-page now contains pointers to are updated. The
7755 ** entry corresponding to the new right-child pointer of the root
7756 ** page is also updated.
7757 **
7758 ** If successful, *ppChild is set to contain a reference to the child
7759 ** page and SQLITE_OK is returned. In this case the caller is required
7760 ** to call releasePage() on *ppChild exactly once. If an error occurs,
7761 ** an error code is returned and *ppChild is set to 0.
7762 */
7763 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
7764   int rc;                        /* Return value from subprocedures */
7765   MemPage *pChild = 0;           /* Pointer to a new child page */
7766   Pgno pgnoChild = 0;            /* Page number of the new child page */
7767   BtShared *pBt = pRoot->pBt;    /* The BTree */
7768 
7769   assert( pRoot->nOverflow>0 );
7770   assert( sqlite3_mutex_held(pBt->mutex) );
7771 
7772   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
7773   ** page that will become the new right-child of pPage. Copy the contents
7774   ** of the node stored on pRoot into the new child page.
7775   */
7776   rc = sqlite3PagerWrite(pRoot->pDbPage);
7777   if( rc==SQLITE_OK ){
7778     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
7779     copyNodeContent(pRoot, pChild, &rc);
7780     if( ISAUTOVACUUM ){
7781       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
7782     }
7783   }
7784   if( rc ){
7785     *ppChild = 0;
7786     releasePage(pChild);
7787     return rc;
7788   }
7789   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
7790   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
7791   assert( pChild->nCell==pRoot->nCell );
7792 
7793   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
7794 
7795   /* Copy the overflow cells from pRoot to pChild */
7796   memcpy(pChild->aiOvfl, pRoot->aiOvfl,
7797          pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
7798   memcpy(pChild->apOvfl, pRoot->apOvfl,
7799          pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
7800   pChild->nOverflow = pRoot->nOverflow;
7801 
7802   /* Zero the contents of pRoot. Then install pChild as the right-child. */
7803   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
7804   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
7805 
7806   *ppChild = pChild;
7807   return SQLITE_OK;
7808 }
7809 
7810 /*
7811 ** The page that pCur currently points to has just been modified in
7812 ** some way. This function figures out if this modification means the
7813 ** tree needs to be balanced, and if so calls the appropriate balancing
7814 ** routine. Balancing routines are:
7815 **
7816 **   balance_quick()
7817 **   balance_deeper()
7818 **   balance_nonroot()
7819 */
7820 static int balance(BtCursor *pCur){
7821   int rc = SQLITE_OK;
7822   const int nMin = pCur->pBt->usableSize * 2 / 3;
7823   u8 aBalanceQuickSpace[13];
7824   u8 *pFree = 0;
7825 
7826   VVA_ONLY( int balance_quick_called = 0 );
7827   VVA_ONLY( int balance_deeper_called = 0 );
7828 
7829   do {
7830     int iPage = pCur->iPage;
7831     MemPage *pPage = pCur->apPage[iPage];
7832 
7833     if( iPage==0 ){
7834       if( pPage->nOverflow ){
7835         /* The root page of the b-tree is overfull. In this case call the
7836         ** balance_deeper() function to create a new child for the root-page
7837         ** and copy the current contents of the root-page to it. The
7838         ** next iteration of the do-loop will balance the child page.
7839         */
7840         assert( balance_deeper_called==0 );
7841         VVA_ONLY( balance_deeper_called++ );
7842         rc = balance_deeper(pPage, &pCur->apPage[1]);
7843         if( rc==SQLITE_OK ){
7844           pCur->iPage = 1;
7845           pCur->aiIdx[0] = 0;
7846           pCur->aiIdx[1] = 0;
7847           assert( pCur->apPage[1]->nOverflow );
7848         }
7849       }else{
7850         break;
7851       }
7852     }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
7853       break;
7854     }else{
7855       MemPage * const pParent = pCur->apPage[iPage-1];
7856       int const iIdx = pCur->aiIdx[iPage-1];
7857 
7858       rc = sqlite3PagerWrite(pParent->pDbPage);
7859       if( rc==SQLITE_OK ){
7860 #ifndef SQLITE_OMIT_QUICKBALANCE
7861         if( pPage->intKeyLeaf
7862          && pPage->nOverflow==1
7863          && pPage->aiOvfl[0]==pPage->nCell
7864          && pParent->pgno!=1
7865          && pParent->nCell==iIdx
7866         ){
7867           /* Call balance_quick() to create a new sibling of pPage on which
7868           ** to store the overflow cell. balance_quick() inserts a new cell
7869           ** into pParent, which may cause pParent overflow. If this
7870           ** happens, the next iteration of the do-loop will balance pParent
7871           ** use either balance_nonroot() or balance_deeper(). Until this
7872           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
7873           ** buffer.
7874           **
7875           ** The purpose of the following assert() is to check that only a
7876           ** single call to balance_quick() is made for each call to this
7877           ** function. If this were not verified, a subtle bug involving reuse
7878           ** of the aBalanceQuickSpace[] might sneak in.
7879           */
7880           assert( balance_quick_called==0 );
7881           VVA_ONLY( balance_quick_called++ );
7882           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
7883         }else
7884 #endif
7885         {
7886           /* In this case, call balance_nonroot() to redistribute cells
7887           ** between pPage and up to 2 of its sibling pages. This involves
7888           ** modifying the contents of pParent, which may cause pParent to
7889           ** become overfull or underfull. The next iteration of the do-loop
7890           ** will balance the parent page to correct this.
7891           **
7892           ** If the parent page becomes overfull, the overflow cell or cells
7893           ** are stored in the pSpace buffer allocated immediately below.
7894           ** A subsequent iteration of the do-loop will deal with this by
7895           ** calling balance_nonroot() (balance_deeper() may be called first,
7896           ** but it doesn't deal with overflow cells - just moves them to a
7897           ** different page). Once this subsequent call to balance_nonroot()
7898           ** has completed, it is safe to release the pSpace buffer used by
7899           ** the previous call, as the overflow cell data will have been
7900           ** copied either into the body of a database page or into the new
7901           ** pSpace buffer passed to the latter call to balance_nonroot().
7902           */
7903           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
7904           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
7905                                pCur->hints&BTREE_BULKLOAD);
7906           if( pFree ){
7907             /* If pFree is not NULL, it points to the pSpace buffer used
7908             ** by a previous call to balance_nonroot(). Its contents are
7909             ** now stored either on real database pages or within the
7910             ** new pSpace buffer, so it may be safely freed here. */
7911             sqlite3PageFree(pFree);
7912           }
7913 
7914           /* The pSpace buffer will be freed after the next call to
7915           ** balance_nonroot(), or just before this function returns, whichever
7916           ** comes first. */
7917           pFree = pSpace;
7918         }
7919       }
7920 
7921       pPage->nOverflow = 0;
7922 
7923       /* The next iteration of the do-loop balances the parent page. */
7924       releasePage(pPage);
7925       pCur->iPage--;
7926       assert( pCur->iPage>=0 );
7927     }
7928   }while( rc==SQLITE_OK );
7929 
7930   if( pFree ){
7931     sqlite3PageFree(pFree);
7932   }
7933   return rc;
7934 }
7935 
7936 
7937 /*
7938 ** Insert a new record into the BTree.  The content of the new record
7939 ** is described by the pX object.  The pCur cursor is used only to
7940 ** define what table the record should be inserted into, and is left
7941 ** pointing at a random location.
7942 **
7943 ** For a table btree (used for rowid tables), only the pX.nKey value of
7944 ** the key is used. The pX.pKey value must be NULL.  The pX.nKey is the
7945 ** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
7946 ** hold the content of the row.
7947 **
7948 ** For an index btree (used for indexes and WITHOUT ROWID tables), the
7949 ** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The
7950 ** pX.pData,nData,nZero fields must be zero.
7951 **
7952 ** If the seekResult parameter is non-zero, then a successful call to
7953 ** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
7954 ** been performed.  In other words, if seekResult!=0 then the cursor
7955 ** is currently pointing to a cell that will be adjacent to the cell
7956 ** to be inserted.  If seekResult<0 then pCur points to a cell that is
7957 ** smaller then (pKey,nKey).  If seekResult>0 then pCur points to a cell
7958 ** that is larger than (pKey,nKey).
7959 **
7960 ** If seekResult==0, that means pCur is pointing at some unknown location.
7961 ** In that case, this routine must seek the cursor to the correct insertion
7962 ** point for (pKey,nKey) before doing the insertion.  For index btrees,
7963 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
7964 ** key values and pX->aMem can be used instead of pX->pKey to avoid having
7965 ** to decode the key.
7966 */
7967 int sqlite3BtreeInsert(
7968   BtCursor *pCur,                /* Insert data into the table of this cursor */
7969   const BtreePayload *pX,        /* Content of the row to be inserted */
7970   int flags,                     /* True if this is likely an append */
7971   int seekResult                 /* Result of prior MovetoUnpacked() call */
7972 ){
7973   int rc;
7974   int loc = seekResult;          /* -1: before desired location  +1: after */
7975   int szNew = 0;
7976   int idx;
7977   MemPage *pPage;
7978   Btree *p = pCur->pBtree;
7979   BtShared *pBt = p->pBt;
7980   unsigned char *oldCell;
7981   unsigned char *newCell = 0;
7982 
7983   assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND))==flags );
7984 
7985   if( pCur->eState==CURSOR_FAULT ){
7986     assert( pCur->skipNext!=SQLITE_OK );
7987     return pCur->skipNext;
7988   }
7989 
7990   assert( cursorOwnsBtShared(pCur) );
7991   assert( (pCur->curFlags & BTCF_WriteFlag)!=0
7992               && pBt->inTransaction==TRANS_WRITE
7993               && (pBt->btsFlags & BTS_READ_ONLY)==0 );
7994   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
7995 
7996   /* Assert that the caller has been consistent. If this cursor was opened
7997   ** expecting an index b-tree, then the caller should be inserting blob
7998   ** keys with no associated data. If the cursor was opened expecting an
7999   ** intkey table, the caller should be inserting integer keys with a
8000   ** blob of associated data.  */
8001   assert( (pX->pKey==0)==(pCur->pKeyInfo==0) );
8002 
8003   /* Save the positions of any other cursors open on this table.
8004   **
8005   ** In some cases, the call to btreeMoveto() below is a no-op. For
8006   ** example, when inserting data into a table with auto-generated integer
8007   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
8008   ** integer key to use. It then calls this function to actually insert the
8009   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
8010   ** that the cursor is already where it needs to be and returns without
8011   ** doing any work. To avoid thwarting these optimizations, it is important
8012   ** not to clear the cursor here.
8013   */
8014   if( pCur->curFlags & BTCF_Multiple ){
8015     rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
8016     if( rc ) return rc;
8017   }
8018 
8019   if( pCur->pKeyInfo==0 ){
8020     assert( pX->pKey==0 );
8021     /* If this is an insert into a table b-tree, invalidate any incrblob
8022     ** cursors open on the row being replaced */
8023     invalidateIncrblobCursors(p, pX->nKey, 0);
8024 
8025     /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
8026     ** to a row with the same key as the new entry being inserted.  */
8027     assert( (flags & BTREE_SAVEPOSITION)==0 ||
8028             ((pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey) );
8029 
8030     /* If the cursor is currently on the last row and we are appending a
8031     ** new row onto the end, set the "loc" to avoid an unnecessary
8032     ** btreeMoveto() call */
8033     if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
8034       loc = 0;
8035     }else if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey>0
8036                && pCur->info.nKey==pX->nKey-1 ){
8037       loc = -1;
8038     }else if( loc==0 ){
8039       rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc);
8040       if( rc ) return rc;
8041     }
8042   }else if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
8043     if( pX->nMem ){
8044       UnpackedRecord r;
8045       r.pKeyInfo = pCur->pKeyInfo;
8046       r.aMem = pX->aMem;
8047       r.nField = pX->nMem;
8048       r.default_rc = 0;
8049       r.errCode = 0;
8050       r.r1 = 0;
8051       r.r2 = 0;
8052       r.eqSeen = 0;
8053       rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc);
8054     }else{
8055       rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc);
8056     }
8057     if( rc ) return rc;
8058   }
8059   assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
8060 
8061   pPage = pCur->apPage[pCur->iPage];
8062   assert( pPage->intKey || pX->nKey>=0 );
8063   assert( pPage->leaf || !pPage->intKey );
8064 
8065   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
8066           pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
8067           loc==0 ? "overwrite" : "new entry"));
8068   assert( pPage->isInit );
8069   newCell = pBt->pTmpSpace;
8070   assert( newCell!=0 );
8071   rc = fillInCell(pPage, newCell, pX, &szNew);
8072   if( rc ) goto end_insert;
8073   assert( szNew==pPage->xCellSize(pPage, newCell) );
8074   assert( szNew <= MX_CELL_SIZE(pBt) );
8075   idx = pCur->aiIdx[pCur->iPage];
8076   if( loc==0 ){
8077     CellInfo info;
8078     assert( idx<pPage->nCell );
8079     rc = sqlite3PagerWrite(pPage->pDbPage);
8080     if( rc ){
8081       goto end_insert;
8082     }
8083     oldCell = findCell(pPage, idx);
8084     if( !pPage->leaf ){
8085       memcpy(newCell, oldCell, 4);
8086     }
8087     rc = clearCell(pPage, oldCell, &info);
8088     if( info.nSize==szNew && info.nLocal==info.nPayload ){
8089       /* Overwrite the old cell with the new if they are the same size.
8090       ** We could also try to do this if the old cell is smaller, then add
8091       ** the leftover space to the free list.  But experiments show that
8092       ** doing that is no faster then skipping this optimization and just
8093       ** calling dropCell() and insertCell(). */
8094       assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
8095       if( oldCell+szNew > pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
8096       memcpy(oldCell, newCell, szNew);
8097       return SQLITE_OK;
8098     }
8099     dropCell(pPage, idx, info.nSize, &rc);
8100     if( rc ) goto end_insert;
8101   }else if( loc<0 && pPage->nCell>0 ){
8102     assert( pPage->leaf );
8103     idx = ++pCur->aiIdx[pCur->iPage];
8104   }else{
8105     assert( pPage->leaf );
8106   }
8107   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
8108   assert( pPage->nOverflow==0 || rc==SQLITE_OK );
8109   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
8110 
8111   /* If no error has occurred and pPage has an overflow cell, call balance()
8112   ** to redistribute the cells within the tree. Since balance() may move
8113   ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
8114   ** variables.
8115   **
8116   ** Previous versions of SQLite called moveToRoot() to move the cursor
8117   ** back to the root page as balance() used to invalidate the contents
8118   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
8119   ** set the cursor state to "invalid". This makes common insert operations
8120   ** slightly faster.
8121   **
8122   ** There is a subtle but important optimization here too. When inserting
8123   ** multiple records into an intkey b-tree using a single cursor (as can
8124   ** happen while processing an "INSERT INTO ... SELECT" statement), it
8125   ** is advantageous to leave the cursor pointing to the last entry in
8126   ** the b-tree if possible. If the cursor is left pointing to the last
8127   ** entry in the table, and the next row inserted has an integer key
8128   ** larger than the largest existing key, it is possible to insert the
8129   ** row without seeking the cursor. This can be a big performance boost.
8130   */
8131   pCur->info.nSize = 0;
8132   if( pPage->nOverflow ){
8133     assert( rc==SQLITE_OK );
8134     pCur->curFlags &= ~(BTCF_ValidNKey);
8135     rc = balance(pCur);
8136 
8137     /* Must make sure nOverflow is reset to zero even if the balance()
8138     ** fails. Internal data structure corruption will result otherwise.
8139     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
8140     ** from trying to save the current position of the cursor.  */
8141     pCur->apPage[pCur->iPage]->nOverflow = 0;
8142     pCur->eState = CURSOR_INVALID;
8143     if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
8144       rc = moveToRoot(pCur);
8145       if( pCur->pKeyInfo ){
8146         assert( pCur->pKey==0 );
8147         pCur->pKey = sqlite3Malloc( pX->nKey );
8148         if( pCur->pKey==0 ){
8149           rc = SQLITE_NOMEM;
8150         }else{
8151           memcpy(pCur->pKey, pX->pKey, pX->nKey);
8152         }
8153       }
8154       pCur->eState = CURSOR_REQUIRESEEK;
8155       pCur->nKey = pX->nKey;
8156     }
8157   }
8158   assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
8159 
8160 end_insert:
8161   return rc;
8162 }
8163 
/*
** Delete the entry that the cursor is pointing to.
**
** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
** the cursor is left pointing at an arbitrary location after the delete.
** But if that bit is set, then the cursor is left in a state such that
** the next call to BtreeNext() or BtreePrev() moves it to the same row
** as it would have been on if the call to BtreeDelete() had been omitted.
**
** The BTREE_AUXDELETE bit of flags indicates that this is one of several
** deletes associated with a single table entry and its indexes.  Only one
** of those deletes is considered the "primary" delete.  The primary delete
** occurs on a cursor that is not a BTREE_FORDELETE cursor.  All but one
** delete operation on non-FORDELETE cursors is tagged with the AUXDELETE
** flag.  The BTREE_AUXDELETE bit is a hint that is not used by this
** implementation, but which might be used by alternative storage engines.
**
** Preconditions (asserted): the cursor is a write cursor in CURSOR_VALID
** state that points at an existing cell, a write transaction is open, and
** flags contains no bits other than BTREE_SAVEPOSITION and BTREE_AUXDELETE.
**
** Returns SQLITE_OK on success, or the first error code reported by one
** of the helper routines invoked below.
*/
int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
  Btree *p = pCur->pBtree;
  BtShared *pBt = p->pBt;
  int rc;                              /* Return code */
  MemPage *pPage;                      /* Page to delete cell from */
  unsigned char *pCell;                /* Pointer to cell to delete */
  int iCellIdx;                        /* Index of cell to delete */
  int iCellDepth;                      /* Depth of node containing pCell */
  CellInfo info;                       /* Size of the cell being deleted */
  int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
  u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */

  assert( cursorOwnsBtShared(pCur) );
  assert( pBt->inTransaction==TRANS_WRITE );
  assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  assert( pCur->curFlags & BTCF_WriteFlag );
  assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
  assert( !hasReadConflicts(p, pCur->pgnoRoot) );
  assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  assert( pCur->eState==CURSOR_VALID );
  assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );

  /* Remember where the doomed cell lives.  The cursor itself may be moved
  ** to a different page further down (see the sqlite3BtreePrevious() call
  ** below), so everything that follows refers to the cell through these
  ** local copies rather than through pCur. */
  iCellDepth = pCur->iPage;
  iCellIdx = pCur->aiIdx[iCellDepth];
  pPage = pCur->apPage[iCellDepth];
  pCell = findCell(pPage, iCellIdx);

  /* If the bPreserve flag is set to true, then the cursor position must
  ** be preserved following this delete operation. If the current delete
  ** will cause a b-tree rebalance, then this is done by saving the cursor
  ** key and leaving the cursor in CURSOR_REQUIRESEEK state before
  ** returning.
  **
  ** Or, if the current delete will not cause a rebalance, then the cursor
  ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
  ** before or after the deleted entry. In this case set bSkipnext to true.
  **
  ** The test below treats the delete as requiring a rebalance when the
  ** cell is on an interior page, or when the free space on the page after
  ** removing the cell (plus 2 bytes, presumably the cell-pointer slot -
  ** TODO confirm) would exceed two thirds of the usable page size.  */
  if( bPreserve ){
    if( !pPage->leaf
     || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
    ){
      /* A b-tree rebalance will be required after deleting this entry.
      ** Save the cursor key.  */
      rc = saveCursorKey(pCur);
      if( rc ) return rc;
    }else{
      bSkipnext = 1;
    }
  }

  /* If the page containing the entry to delete is not a leaf page, move
  ** the cursor to the largest entry in the tree that is smaller than
  ** the entry being deleted. This cell will replace the cell being deleted
  ** from the internal node. The 'previous' entry is used for this instead
  ** of the 'next' entry, as the previous entry is always a part of the
  ** sub-tree headed by the child page of the cell being deleted. This makes
  ** balancing the tree following the delete operation easier.  */
  if( !pPage->leaf ){
    int notUsed = 0;
    rc = sqlite3BtreePrevious(pCur, &notUsed);
    if( rc ) return rc;
  }

  /* Save the positions of any other cursors open on this table before
  ** making any modifications.  */
  if( pCur->curFlags & BTCF_Multiple ){
    rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
    if( rc ) return rc;
  }

  /* If this is a delete operation to remove a row from a table b-tree,
  ** invalidate any incrblob cursors open on the row being deleted.  */
  if( pCur->pKeyInfo==0 ){
    invalidateIncrblobCursors(p, pCur->info.nKey, 0);
  }

  /* Make the page containing the entry to be deleted writable. Then free any
  ** overflow pages associated with the entry and finally remove the cell
  ** itself from within the page.  The result of clearCell() is handed to
  ** dropCell() through &rc, and rc is checked once after both calls.  */
  rc = sqlite3PagerWrite(pPage->pDbPage);
  if( rc ) return rc;
  rc = clearCell(pPage, pCell, &info);
  dropCell(pPage, iCellIdx, info.nSize, &rc);
  if( rc ) return rc;

  /* If the cell deleted was not located on a leaf page, then the cursor
  ** is currently pointing to the largest entry in the sub-tree headed
  ** by the child-page of the cell that was just deleted from an internal
  ** node. The cell from the leaf node needs to be moved to the internal
  ** node to replace the deleted cell.  */
  if( !pPage->leaf ){
    MemPage *pLeaf = pCur->apPage[pCur->iPage];
    int nCell;
    Pgno n = pCur->apPage[iCellDepth+1]->pgno;
    unsigned char *pTmp;

    pCell = findCell(pLeaf, pLeaf->nCell-1);
    /* The insertCell() call below reads from pCell-4, so the cell must not
    ** begin within the first four bytes of the leaf page image. */
    if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
    nCell = pLeaf->xCellSize(pLeaf, pCell);
    assert( MX_CELL_SIZE(pBt) >= nCell );
    pTmp = pBt->pTmpSpace;
    assert( pTmp!=0 );
    rc = sqlite3PagerWrite(pLeaf->pDbPage);
    if( rc==SQLITE_OK ){
      /* The replacement cell is passed as nCell+4 bytes starting 4 bytes
      ** before the leaf cell; pTmp and the child page number n are handed
      ** to insertCell() along with it. */
      insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
    }
    dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
    if( rc ) return rc;
  }

  /* Balance the tree. If the entry deleted was located on a leaf page,
  ** then the cursor still points to that page. In this case the first
  ** call to balance() repairs the tree, and the if(...) condition is
  ** never true.
  **
  ** Otherwise, if the entry deleted was on an internal node page, then
  ** pCur is pointing to the leaf page from which a cell was removed to
  ** replace the cell deleted from the internal node. This is slightly
  ** tricky as the leaf node may be underfull, and the internal node may
  ** be either under or overfull. In this case run the balancing algorithm
  ** on the leaf node first. If the balance proceeds far enough up the
  ** tree that we can be sure that any problem in the internal node has
  ** been corrected, so be it. Otherwise, after balancing the leaf node,
  ** walk the cursor up the tree to the internal node and balance it as
  ** well.  */
  rc = balance(pCur);
  if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
    while( pCur->iPage>iCellDepth ){
      releasePage(pCur->apPage[pCur->iPage--]);
    }
    rc = balance(pCur);
  }

  if( rc==SQLITE_OK ){
    if( bSkipnext ){
      /* No rebalance was expected, so the cursor is still on pPage.  Leave
      ** it in CURSOR_SKIPNEXT state.  If the deleted cell was the last one
      ** on the page, point the cursor at the new last cell and set
      ** skipNext to -1; otherwise the cell that followed the deleted one
      ** now occupies slot iCellIdx and skipNext is set to +1. */
      assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
      assert( pPage==pCur->apPage[pCur->iPage] || CORRUPT_DB );
      assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
      pCur->eState = CURSOR_SKIPNEXT;
      if( iCellIdx>=pPage->nCell ){
        pCur->skipNext = -1;
        pCur->aiIdx[iCellDepth] = pPage->nCell-1;
      }else{
        pCur->skipNext = 1;
      }
    }else{
      /* Either the position need not be preserved, or the key was saved
      ** by saveCursorKey() above.  Reset the cursor to the root and, when
      ** preserving, flag it so the saved key is used to re-seek later. */
      rc = moveToRoot(pCur);
      if( bPreserve ){
        pCur->eState = CURSOR_REQUIRESEEK;
      }
    }
  }
  return rc;
}
8334 
/*
** Create a new BTree table.  Write into *piTable the page
** number for the root page of the new table.
**
** The type of table is determined by the flags parameter.  Only the
** following values of flags are currently in use.  Other values for
** flags might not work:
**
**     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
**     BTREE_ZERODATA                  Used for SQL indices
**
** This implementation only examines the BTREE_INTKEY bit of the flags:
** if it is set the new root page is initialized as an intkey leaf,
** otherwise as a zero-data (index) leaf.
**
** The caller must hold the b-tree mutex and have a write transaction
** open (both are asserted).  Returns SQLITE_OK on success or an error
** code, in which case *piTable is not written.
*/
static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
  BtShared *pBt = p->pBt;
  MemPage *pRoot;        /* The new root page */
  Pgno pgnoRoot;         /* Page number of the new root page */
  int rc;
  int ptfFlags;          /* Page-type flags for the root page of new table */

  assert( sqlite3BtreeHoldsMutex(p) );
  assert( pBt->inTransaction==TRANS_WRITE );
  assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

#ifdef SQLITE_OMIT_AUTOVACUUM
  /* Without auto-vacuum support any free page will do as the root. */
  rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
  if( rc ){
    return rc;
  }
#else
  if( pBt->autoVacuum ){
    Pgno pgnoMove;      /* Move a page here to make room for the root-page */
    MemPage *pPageMove; /* The page to move to. */

    /* Creating a new table may require moving an existing database page
    ** to make room for the new table's root page. In case this page turns
    ** out to be an overflow page, delete all overflow page-map caches
    ** held by open cursors.
    */
    invalidateAllOverflowCache(pBt);

    /* Read the value of meta[3] from the database to determine where the
    ** root page of the new table should go. meta[3] is the largest root-page
    ** created so far, so the new root-page is (meta[3]+1).
    */
    sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
    pgnoRoot++;

    /* The new root-page may not be allocated on a pointer-map page, or the
    ** PENDING_BYTE page.
    */
    while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
        pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
      pgnoRoot++;
    }
    assert( pgnoRoot>=3 || CORRUPT_DB );
    testcase( pgnoRoot<3 );

    /* Allocate a page. The page that currently resides at pgnoRoot will
    ** be moved to the allocated page (unless the allocated page happens
    ** to reside at pgnoRoot).
    */
    rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
    if( rc!=SQLITE_OK ){
      return rc;
    }

    if( pgnoMove!=pgnoRoot ){
      /* pgnoRoot is the page that will be used for the root-page of
      ** the new table (assuming an error did not occur). But we were
      ** allocated pgnoMove. If required (i.e. if it was not allocated
      ** by extending the file), the current page at position pgnoMove
      ** is already journaled.
      */
      u8 eType = 0;
      Pgno iPtrPage = 0;

      /* Save the positions of any open cursors. This is required in
      ** case they are holding a reference to an xFetch reference
      ** corresponding to page pgnoRoot.  The reference to pPageMove is
      ** dropped whether or not saveAllCursors() succeeded.  */
      rc = saveAllCursors(pBt, 0, 0);
      releasePage(pPageMove);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      /* Move the page currently at pgnoRoot to pgnoMove. */
      rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      /* The page being displaced must not itself be a root page or a
      ** free page according to the pointer-map; treat either as
      ** database corruption. */
      rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
      if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
        rc = SQLITE_CORRUPT_BKPT;
      }
      if( rc!=SQLITE_OK ){
        releasePage(pRoot);
        return rc;
      }
      assert( eType!=PTRMAP_ROOTPAGE );
      assert( eType!=PTRMAP_FREEPAGE );
      rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
      releasePage(pRoot);

      /* Obtain the page at pgnoRoot */
      if( rc!=SQLITE_OK ){
        return rc;
      }
      rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      rc = sqlite3PagerWrite(pRoot->pDbPage);
      if( rc!=SQLITE_OK ){
        releasePage(pRoot);
        return rc;
      }
    }else{
      /* The allocator returned exactly the page that was asked for. */
      pRoot = pPageMove;
    }

    /* Update the pointer-map and meta-data with the new root-page number. */
    ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
    if( rc ){
      releasePage(pRoot);
      return rc;
    }

    /* When the new root page was allocated, page 1 was made writable in
    ** order either to increase the database filesize, or to decrement the
    ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
    **
    ** NOTE(review): the literal index 4 is presumably the same header slot
    ** as BTREE_LARGEST_ROOT_PAGE read above - confirm against btree.h.
    */
    assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
    rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
    if( NEVER(rc) ){
      releasePage(pRoot);
      return rc;
    }

  }else{
    /* Not an auto-vacuum database: any free page will do as the root. */
    rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
    if( rc ) return rc;
  }
#endif
  /* Initialize the new root as an empty leaf page of the requested kind,
  ** then drop the page reference held by this routine. */
  assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
  if( createTabFlags & BTREE_INTKEY ){
    ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
  }else{
    ptfFlags = PTF_ZERODATA | PTF_LEAF;
  }
  zeroPage(pRoot, ptfFlags);
  sqlite3PagerUnref(pRoot->pDbPage);
  assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
  *piTable = (int)pgnoRoot;
  return SQLITE_OK;
}
8489 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
8490   int rc;
8491   sqlite3BtreeEnter(p);
8492   rc = btreeCreateTable(p, piTable, flags);
8493   sqlite3BtreeLeave(p);
8494   return rc;
8495 }
8496 
8497 /*
8498 ** Erase the given database page and all its children.  Return
8499 ** the page to the freelist.
8500 */
8501 static int clearDatabasePage(
8502   BtShared *pBt,           /* The BTree that contains the table */
8503   Pgno pgno,               /* Page number to clear */
8504   int freePageFlag,        /* Deallocate page if true */
8505   int *pnChange            /* Add number of Cells freed to this counter */
8506 ){
8507   MemPage *pPage;
8508   int rc;
8509   unsigned char *pCell;
8510   int i;
8511   int hdr;
8512   CellInfo info;
8513 
8514   assert( sqlite3_mutex_held(pBt->mutex) );
8515   if( pgno>btreePagecount(pBt) ){
8516     return SQLITE_CORRUPT_BKPT;
8517   }
8518   rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
8519   if( rc ) return rc;
8520   if( pPage->bBusy ){
8521     rc = SQLITE_CORRUPT_BKPT;
8522     goto cleardatabasepage_out;
8523   }
8524   pPage->bBusy = 1;
8525   hdr = pPage->hdrOffset;
8526   for(i=0; i<pPage->nCell; i++){
8527     pCell = findCell(pPage, i);
8528     if( !pPage->leaf ){
8529       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
8530       if( rc ) goto cleardatabasepage_out;
8531     }
8532     rc = clearCell(pPage, pCell, &info);
8533     if( rc ) goto cleardatabasepage_out;
8534   }
8535   if( !pPage->leaf ){
8536     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
8537     if( rc ) goto cleardatabasepage_out;
8538   }else if( pnChange ){
8539     assert( pPage->intKey || CORRUPT_DB );
8540     testcase( !pPage->intKey );
8541     *pnChange += pPage->nCell;
8542   }
8543   if( freePageFlag ){
8544     freePage(pPage, &rc);
8545   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
8546     zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
8547   }
8548 
8549 cleardatabasepage_out:
8550   pPage->bBusy = 0;
8551   releasePage(pPage);
8552   return rc;
8553 }
8554 
8555 /*
8556 ** Delete all information from a single table in the database.  iTable is
8557 ** the page number of the root of the table.  After this routine returns,
8558 ** the root page is empty, but still exists.
8559 **
8560 ** This routine will fail with SQLITE_LOCKED if there are any open
8561 ** read cursors on the table.  Open write cursors are moved to the
8562 ** root of the table.
8563 **
8564 ** If pnChange is not NULL, then table iTable must be an intkey table. The
8565 ** integer value pointed to by pnChange is incremented by the number of
8566 ** entries in the table.
8567 */
8568 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
8569   int rc;
8570   BtShared *pBt = p->pBt;
8571   sqlite3BtreeEnter(p);
8572   assert( p->inTrans==TRANS_WRITE );
8573 
8574   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
8575 
8576   if( SQLITE_OK==rc ){
8577     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
8578     ** is the root of a table b-tree - if it is not, the following call is
8579     ** a no-op).  */
8580     invalidateIncrblobCursors(p, 0, 1);
8581     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
8582   }
8583   sqlite3BtreeLeave(p);
8584   return rc;
8585 }
8586 
8587 /*
8588 ** Delete all information from the single table that pCur is open on.
8589 **
8590 ** This routine only work for pCur on an ephemeral table.
8591 */
8592 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
8593   return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
8594 }
8595 
/*
** Erase all information in a table and add the root of the table to
** the freelist.  Except, the root of the principal table (the one on
** page 1) is never added to the freelist.
**
** NOTE(review): earlier documentation stated that this routine fails with
** SQLITE_LOCKED if there are any open cursors on the table.  No such check
** is visible in this routine itself - confirm whether a callee enforces it.
**
** If AUTOVACUUM is enabled and the page at iTable is not the last
** root page in the database file, then the last root page
** in the database file is moved into the slot formerly occupied by
** iTable and that last slot formerly occupied by the last root page
** is added to the freelist instead of iTable.  In this way, all
** root pages are kept at the beginning of the database file, which
** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
** page number that used to be the last root page in the file before
** the move.  If no page gets moved, *piMoved is set to 0.
** The last root page is recorded in meta[3] and the value of
** meta[3] is updated by this procedure.
**
** The caller must hold the b-tree mutex, have a write transaction open,
** and pass iTable>=2 (all asserted).  Returns SQLITE_OK or an error code.
** *piMoved is only written once the table content has been cleared
** successfully.
*/
static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
  int rc;
  MemPage *pPage = 0;            /* Root page of the table being dropped */
  BtShared *pBt = p->pBt;

  assert( sqlite3BtreeHoldsMutex(p) );
  assert( p->inTrans==TRANS_WRITE );
  assert( iTable>=2 );

  /* Take a reference to the root page, then empty the table.  The
  ** reference is released on every path below. */
  rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
  if( rc ) return rc;
  rc = sqlite3BtreeClearTable(p, iTable, 0);
  if( rc ){
    releasePage(pPage);
    return rc;
  }

  *piMoved = 0;

#ifdef SQLITE_OMIT_AUTOVACUUM
  freePage(pPage, &rc);
  releasePage(pPage);
#else
  if( pBt->autoVacuum ){
    Pgno maxRootPgno;            /* Largest root page number in the file */
    sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);

    if( iTable==maxRootPgno ){
      /* If the table being dropped is the table with the largest root-page
      ** number in the database, put the root page on the free list.
      */
      freePage(pPage, &rc);
      releasePage(pPage);
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }else{
      /* The table being dropped does not have the largest root-page
      ** number in the database. So move the page that does into the
      ** gap left by the deleted root-page.
      */
      MemPage *pMove;
      releasePage(pPage);
      rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
      releasePage(pMove);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      /* Fetch page maxRootPgno again and put it on the free list.  The
      ** result of btreeGetPage() is passed to freePage() through &rc and
      ** checked only after the page has been released.  pMove was reset
      ** to 0 beforehand, so a failed fetch releases a NULL page. */
      pMove = 0;
      rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
      freePage(pMove, &rc);
      releasePage(pMove);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      *piMoved = maxRootPgno;
    }

    /* Set the new 'max-root-page' value in the database header. This
    ** is the old value less one, less one more for as long as the result
    ** is a pointer-map page or the PENDING_BYTE_PAGE.
    */
    maxRootPgno--;
    while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
           || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
      maxRootPgno--;
    }
    assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );

    /* NOTE(review): the literal index 4 is presumably the same header slot
    ** as BTREE_LARGEST_ROOT_PAGE read above - confirm against btree.h. */
    rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
  }else{
    /* Not an auto-vacuum database: simply free the root page. */
    freePage(pPage, &rc);
    releasePage(pPage);
  }
#endif
  return rc;
}
8698 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
8699   int rc;
8700   sqlite3BtreeEnter(p);
8701   rc = btreeDropTable(p, iTable, piMoved);
8702   sqlite3BtreeLeave(p);
8703   return rc;
8704 }
8705 
8706 
8707 /*
8708 ** This function may only be called if the b-tree connection already
8709 ** has a read or write transaction open on the database.
8710 **
8711 ** Read the meta-information out of a database file.  Meta[0]
8712 ** is the number of free pages currently in the database.  Meta[1]
8713 ** through meta[15] are available for use by higher layers.  Meta[0]
8714 ** is read-only, the others are read/write.
8715 **
8716 ** The schema layer numbers meta values differently.  At the schema
8717 ** layer (and the SetCookie and ReadCookie opcodes) the number of
8718 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
8719 **
8720 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
8721 ** of reading the value out of the header, it instead loads the "DataVersion"
8722 ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
8723 ** database file.  It is a number computed by the pager.  But its access
8724 ** pattern is the same as header meta values, and so it is convenient to
8725 ** read it from this routine.
8726 */
8727 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
8728   BtShared *pBt = p->pBt;
8729 
8730   sqlite3BtreeEnter(p);
8731   assert( p->inTrans>TRANS_NONE );
8732   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
8733   assert( pBt->pPage1 );
8734   assert( idx>=0 && idx<=15 );
8735 
8736   if( idx==BTREE_DATA_VERSION ){
8737     *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
8738   }else{
8739     *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
8740   }
8741 
8742   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
8743   ** database, mark the database as read-only.  */
8744 #ifdef SQLITE_OMIT_AUTOVACUUM
8745   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
8746     pBt->btsFlags |= BTS_READ_ONLY;
8747   }
8748 #endif
8749 
8750   sqlite3BtreeLeave(p);
8751 }
8752 
8753 /*
8754 ** Write meta-information back into the database.  Meta[0] is
8755 ** read-only and may not be written.
8756 */
8757 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
8758   BtShared *pBt = p->pBt;
8759   unsigned char *pP1;
8760   int rc;
8761   assert( idx>=1 && idx<=15 );
8762   sqlite3BtreeEnter(p);
8763   assert( p->inTrans==TRANS_WRITE );
8764   assert( pBt->pPage1!=0 );
8765   pP1 = pBt->pPage1->aData;
8766   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
8767   if( rc==SQLITE_OK ){
8768     put4byte(&pP1[36 + idx*4], iMeta);
8769 #ifndef SQLITE_OMIT_AUTOVACUUM
8770     if( idx==BTREE_INCR_VACUUM ){
8771       assert( pBt->autoVacuum || iMeta==0 );
8772       assert( iMeta==0 || iMeta==1 );
8773       pBt->incrVacuum = (u8)iMeta;
8774     }
8775 #endif
8776   }
8777   sqlite3BtreeLeave(p);
8778   return rc;
8779 }
8780 
8781 #ifndef SQLITE_OMIT_BTREECOUNT
8782 /*
8783 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
8784 ** number of entries in the b-tree and write the result to *pnEntry.
8785 **
8786 ** SQLITE_OK is returned if the operation is successfully executed.
8787 ** Otherwise, if an error is encountered (i.e. an IO error or database
8788 ** corruption) an SQLite error code is returned.
8789 */
8790 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
8791   i64 nEntry = 0;                      /* Value to return in *pnEntry */
8792   int rc;                              /* Return code */
8793 
8794   if( pCur->pgnoRoot==0 ){
8795     *pnEntry = 0;
8796     return SQLITE_OK;
8797   }
8798   rc = moveToRoot(pCur);
8799 
8800   /* Unless an error occurs, the following loop runs one iteration for each
8801   ** page in the B-Tree structure (not including overflow pages).
8802   */
8803   while( rc==SQLITE_OK ){
8804     int iIdx;                          /* Index of child node in parent */
8805     MemPage *pPage;                    /* Current page of the b-tree */
8806 
8807     /* If this is a leaf page or the tree is not an int-key tree, then
8808     ** this page contains countable entries. Increment the entry counter
8809     ** accordingly.
8810     */
8811     pPage = pCur->apPage[pCur->iPage];
8812     if( pPage->leaf || !pPage->intKey ){
8813       nEntry += pPage->nCell;
8814     }
8815 
8816     /* pPage is a leaf node. This loop navigates the cursor so that it
8817     ** points to the first interior cell that it points to the parent of
8818     ** the next page in the tree that has not yet been visited. The
8819     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
8820     ** of the page, or to the number of cells in the page if the next page
8821     ** to visit is the right-child of its parent.
8822     **
8823     ** If all pages in the tree have been visited, return SQLITE_OK to the
8824     ** caller.
8825     */
8826     if( pPage->leaf ){
8827       do {
8828         if( pCur->iPage==0 ){
8829           /* All pages of the b-tree have been visited. Return successfully. */
8830           *pnEntry = nEntry;
8831           return moveToRoot(pCur);
8832         }
8833         moveToParent(pCur);
8834       }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
8835 
8836       pCur->aiIdx[pCur->iPage]++;
8837       pPage = pCur->apPage[pCur->iPage];
8838     }
8839 
8840     /* Descend to the child node of the cell that the cursor currently
8841     ** points at. This is the right-child if (iIdx==pPage->nCell).
8842     */
8843     iIdx = pCur->aiIdx[pCur->iPage];
8844     if( iIdx==pPage->nCell ){
8845       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
8846     }else{
8847       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
8848     }
8849   }
8850 
8851   /* An error has occurred. Return an error code. */
8852   return rc;
8853 }
8854 #endif
8855 
8856 /*
8857 ** Return the pager associated with a BTree.  This routine is used for
8858 ** testing and debugging only.
8859 */
8860 Pager *sqlite3BtreePager(Btree *p){
8861   return p->pBt->pPager;
8862 }
8863 
8864 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
8865 /*
8866 ** Append a message to the error message string.
8867 */
8868 static void checkAppendMsg(
8869   IntegrityCk *pCheck,
8870   const char *zFormat,
8871   ...
8872 ){
8873   va_list ap;
8874   if( !pCheck->mxErr ) return;
8875   pCheck->mxErr--;
8876   pCheck->nErr++;
8877   va_start(ap, zFormat);
8878   if( pCheck->errMsg.nChar ){
8879     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
8880   }
8881   if( pCheck->zPfx ){
8882     sqlite3XPrintf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
8883   }
8884   sqlite3VXPrintf(&pCheck->errMsg, zFormat, ap);
8885   va_end(ap);
8886   if( pCheck->errMsg.accError==STRACCUM_NOMEM ){
8887     pCheck->mallocFailed = 1;
8888   }
8889 }
8890 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
8891 
8892 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
8893 
8894 /*
8895 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
8896 ** corresponds to page iPg is already set.
8897 */
8898 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
8899   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
8900   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
8901 }
8902 
8903 /*
8904 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
8905 */
8906 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
8907   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
8908   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
8909 }
8910 
8911 
8912 /*
8913 ** Add 1 to the reference count for page iPage.  If this is the second
8914 ** reference to the page, add an error message to pCheck->zErrMsg.
8915 ** Return 1 if there are 2 or more references to the page and 0 if
8916 ** if this is the first reference to the page.
8917 **
8918 ** Also check that the page number is in bounds.
8919 */
8920 static int checkRef(IntegrityCk *pCheck, Pgno iPage){
8921   if( iPage==0 ) return 1;
8922   if( iPage>pCheck->nPage ){
8923     checkAppendMsg(pCheck, "invalid page number %d", iPage);
8924     return 1;
8925   }
8926   if( getPageReferenced(pCheck, iPage) ){
8927     checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
8928     return 1;
8929   }
8930   setPageReferenced(pCheck, iPage);
8931   return 0;
8932 }
8933 
8934 #ifndef SQLITE_OMIT_AUTOVACUUM
8935 /*
8936 ** Check that the entry in the pointer-map for page iChild maps to
8937 ** page iParent, pointer type ptrType. If not, append an error message
8938 ** to pCheck.
8939 */
8940 static void checkPtrmap(
8941   IntegrityCk *pCheck,   /* Integrity check context */
8942   Pgno iChild,           /* Child page number */
8943   u8 eType,              /* Expected pointer map type */
8944   Pgno iParent           /* Expected pointer map parent page number */
8945 ){
8946   int rc;
8947   u8 ePtrmapType;
8948   Pgno iPtrmapParent;
8949 
8950   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
8951   if( rc!=SQLITE_OK ){
8952     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
8953     checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
8954     return;
8955   }
8956 
8957   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
8958     checkAppendMsg(pCheck,
8959       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
8960       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
8961   }
8962 }
8963 #endif
8964 
8965 /*
8966 ** Check the integrity of the freelist or of an overflow page list.
8967 ** Verify that the number of pages on the list is N.
8968 */
8969 static void checkList(
8970   IntegrityCk *pCheck,  /* Integrity checking context */
8971   int isFreeList,       /* True for a freelist.  False for overflow page list */
8972   int iPage,            /* Page number for first page in the list */
8973   int N                 /* Expected number of pages in the list */
8974 ){
8975   int i;
8976   int expected = N;
8977   int iFirst = iPage;
8978   while( N-- > 0 && pCheck->mxErr ){
8979     DbPage *pOvflPage;
8980     unsigned char *pOvflData;
8981     if( iPage<1 ){
8982       checkAppendMsg(pCheck,
8983          "%d of %d pages missing from overflow list starting at %d",
8984           N+1, expected, iFirst);
8985       break;
8986     }
8987     if( checkRef(pCheck, iPage) ) break;
8988     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
8989       checkAppendMsg(pCheck, "failed to get page %d", iPage);
8990       break;
8991     }
8992     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
8993     if( isFreeList ){
8994       int n = get4byte(&pOvflData[4]);
8995 #ifndef SQLITE_OMIT_AUTOVACUUM
8996       if( pCheck->pBt->autoVacuum ){
8997         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
8998       }
8999 #endif
9000       if( n>(int)pCheck->pBt->usableSize/4-2 ){
9001         checkAppendMsg(pCheck,
9002            "freelist leaf count too big on page %d", iPage);
9003         N--;
9004       }else{
9005         for(i=0; i<n; i++){
9006           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
9007 #ifndef SQLITE_OMIT_AUTOVACUUM
9008           if( pCheck->pBt->autoVacuum ){
9009             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
9010           }
9011 #endif
9012           checkRef(pCheck, iFreePage);
9013         }
9014         N -= n;
9015       }
9016     }
9017 #ifndef SQLITE_OMIT_AUTOVACUUM
9018     else{
9019       /* If this database supports auto-vacuum and iPage is not the last
9020       ** page in this overflow list, check that the pointer-map entry for
9021       ** the following page matches iPage.
9022       */
9023       if( pCheck->pBt->autoVacuum && N>0 ){
9024         i = get4byte(pOvflData);
9025         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
9026       }
9027     }
9028 #endif
9029     iPage = get4byte(pOvflData);
9030     sqlite3PagerUnref(pOvflPage);
9031 
9032     if( isFreeList && N<(iPage!=0) ){
9033       checkAppendMsg(pCheck, "free-page count in header is too small");
9034     }
9035   }
9036 }
9037 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9038 
/*
** A min-heap of unsigned 32-bit integers stored in a flat array.
**
** aHeap[0] holds the number of elements currently on the heap.  The
** root element is aHeap[1].  The two daughters of aHeap[N] are
** aHeap[N*2] and aHeap[N*2+1].
**
** Heap property:  no node is greater than either of its daughters.
** Consequently the root aHeap[1] is always the smallest value currently
** on the heap.
**
** btreeHeapInsert() adds a value while preserving the heap property.
** btreeHeapPull() removes the root (the minimum) and then rearranges
** the remaining nodes so that the heap property holds again.
**
** The heap is used for cell overlap and coverage testing.  Each entry
** encodes the span of a cell or freeblock on a btree page: the upper
** 16 bits are the index of the first byte of the range and the lower
** 16 bits are the index of the last byte of the range.
*/
static void btreeHeapInsert(u32 *aHeap, u32 x){
  u32 iNode = ++aHeap[0];     /* Slot of the new element; moves upward */

  aHeap[iNode] = x;
  for(;;){
    u32 iParent = iNode/2;
    u32 tmp;
    if( iParent==0 || aHeap[iParent]<=aHeap[iNode] ) break;
    /* Parent is larger: swap and continue from the parent's slot */
    tmp = aHeap[iParent];
    aHeap[iParent] = aHeap[iNode];
    aHeap[iNode] = tmp;
    iNode = iParent;
  }
}
/*
** Remove the smallest element from the min-heap aHeap[] and store it
** in *pOut.  aHeap[0] is the element count and aHeap[1] is the root.
**
** Returns 1 if an element was removed, or 0 if the heap was empty, in
** which case *pOut is left untouched.
**
** The slot vacated at the end of the heap is filled with 0xffffffff.
** That value acts as a sentinel so that the sift-down loop may read
** aHeap[j+1] even when j is the last live slot.
*/
static int btreeHeapPull(u32 *aHeap, u32 *pOut){
  u32 nElem = aHeap[0];       /* Element count before the removal */
  u32 iNode;                  /* Node being sifted down */
  u32 iKid;                   /* Smaller daughter of iNode */

  if( nElem==0 ) return 0;

  /* Move the last element to the root and shrink the heap by one */
  *pOut = aHeap[1];
  aHeap[1] = aHeap[nElem];
  aHeap[nElem] = 0xffffffff;
  aHeap[0]--;

  /* Sift the new root down until neither daughter is smaller */
  for(iNode=1; (iKid = iNode*2)<=aHeap[0]; iNode=iKid){
    u32 tmp;
    if( aHeap[iKid]>aHeap[iKid+1] ) iKid++;
    if( aHeap[iNode]<aHeap[iKid] ) break;
    tmp = aHeap[iNode];
    aHeap[iNode] = aHeap[iKid];
    aHeap[iKid] = tmp;
  }
  return 1;
}
9089 
9090 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9091 /*
9092 ** Do various sanity checks on a single page of a tree.  Return
9093 ** the tree depth.  Root pages return 0.  Parents of root pages
9094 ** return 1, and so forth.
9095 **
9096 ** These checks are done:
9097 **
9098 **      1.  Make sure that cells and freeblocks do not overlap
9099 **          but combine to completely cover the page.
9100 **      2.  Make sure integer cell keys are in order.
9101 **      3.  Check the integrity of overflow pages.
9102 **      4.  Recursively call checkTreePage on all children.
9103 **      5.  Verify that the depth of all children is the same.
9104 */
9105 static int checkTreePage(
9106   IntegrityCk *pCheck,  /* Context for the sanity check */
9107   int iPage,            /* Page number of the page to check */
9108   i64 *piMinKey,        /* Write minimum integer primary key here */
9109   i64 maxKey            /* Error if integer primary key greater than this */
9110 ){
9111   MemPage *pPage = 0;      /* The page being analyzed */
9112   int i;                   /* Loop counter */
9113   int rc;                  /* Result code from subroutine call */
9114   int depth = -1, d2;      /* Depth of a subtree */
9115   int pgno;                /* Page number */
9116   int nFrag;               /* Number of fragmented bytes on the page */
9117   int hdr;                 /* Offset to the page header */
9118   int cellStart;           /* Offset to the start of the cell pointer array */
9119   int nCell;               /* Number of cells */
9120   int doCoverageCheck = 1; /* True if cell coverage checking should be done */
9121   int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
9122                            ** False if IPK must be strictly less than maxKey */
9123   u8 *data;                /* Page content */
9124   u8 *pCell;               /* Cell content */
9125   u8 *pCellIdx;            /* Next element of the cell pointer array */
9126   BtShared *pBt;           /* The BtShared object that owns pPage */
9127   u32 pc;                  /* Address of a cell */
9128   u32 usableSize;          /* Usable size of the page */
9129   u32 contentOffset;       /* Offset to the start of the cell content area */
9130   u32 *heap = 0;           /* Min-heap used for checking cell coverage */
9131   u32 x, prev = 0;         /* Next and previous entry on the min-heap */
9132   const char *saved_zPfx = pCheck->zPfx;
9133   int saved_v1 = pCheck->v1;
9134   int saved_v2 = pCheck->v2;
9135   u8 savedIsInit = 0;
9136 
9137   /* Check that the page exists
9138   */
9139   pBt = pCheck->pBt;
9140   usableSize = pBt->usableSize;
9141   if( iPage==0 ) return 0;
9142   if( checkRef(pCheck, iPage) ) return 0;
9143   pCheck->zPfx = "Page %d: ";
9144   pCheck->v1 = iPage;
9145   if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
9146     checkAppendMsg(pCheck,
9147        "unable to get the page. error code=%d", rc);
9148     goto end_of_check;
9149   }
9150 
9151   /* Clear MemPage.isInit to make sure the corruption detection code in
9152   ** btreeInitPage() is executed.  */
9153   savedIsInit = pPage->isInit;
9154   pPage->isInit = 0;
9155   if( (rc = btreeInitPage(pPage))!=0 ){
9156     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
9157     checkAppendMsg(pCheck,
9158                    "btreeInitPage() returns error code %d", rc);
9159     goto end_of_check;
9160   }
9161   data = pPage->aData;
9162   hdr = pPage->hdrOffset;
9163 
9164   /* Set up for cell analysis */
9165   pCheck->zPfx = "On tree page %d cell %d: ";
9166   contentOffset = get2byteNotZero(&data[hdr+5]);
9167   assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
9168 
9169   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
9170   ** number of cells on the page. */
9171   nCell = get2byte(&data[hdr+3]);
9172   assert( pPage->nCell==nCell );
9173 
9174   /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
9175   ** immediately follows the b-tree page header. */
9176   cellStart = hdr + 12 - 4*pPage->leaf;
9177   assert( pPage->aCellIdx==&data[cellStart] );
9178   pCellIdx = &data[cellStart + 2*(nCell-1)];
9179 
9180   if( !pPage->leaf ){
9181     /* Analyze the right-child page of internal pages */
9182     pgno = get4byte(&data[hdr+8]);
9183 #ifndef SQLITE_OMIT_AUTOVACUUM
9184     if( pBt->autoVacuum ){
9185       pCheck->zPfx = "On page %d at right child: ";
9186       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
9187     }
9188 #endif
9189     depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
9190     keyCanBeEqual = 0;
9191   }else{
9192     /* For leaf pages, the coverage check will occur in the same loop
9193     ** as the other cell checks, so initialize the heap.  */
9194     heap = pCheck->heap;
9195     heap[0] = 0;
9196   }
9197 
9198   /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
9199   ** integer offsets to the cell contents. */
9200   for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
9201     CellInfo info;
9202 
9203     /* Check cell size */
9204     pCheck->v2 = i;
9205     assert( pCellIdx==&data[cellStart + i*2] );
9206     pc = get2byteAligned(pCellIdx);
9207     pCellIdx -= 2;
9208     if( pc<contentOffset || pc>usableSize-4 ){
9209       checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
9210                              pc, contentOffset, usableSize-4);
9211       doCoverageCheck = 0;
9212       continue;
9213     }
9214     pCell = &data[pc];
9215     pPage->xParseCell(pPage, pCell, &info);
9216     if( pc+info.nSize>usableSize ){
9217       checkAppendMsg(pCheck, "Extends off end of page");
9218       doCoverageCheck = 0;
9219       continue;
9220     }
9221 
9222     /* Check for integer primary key out of range */
9223     if( pPage->intKey ){
9224       if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
9225         checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
9226       }
9227       maxKey = info.nKey;
9228     }
9229 
9230     /* Check the content overflow list */
9231     if( info.nPayload>info.nLocal ){
9232       int nPage;       /* Number of pages on the overflow chain */
9233       Pgno pgnoOvfl;   /* First page of the overflow chain */
9234       assert( pc + info.nSize - 4 <= usableSize );
9235       nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
9236       pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
9237 #ifndef SQLITE_OMIT_AUTOVACUUM
9238       if( pBt->autoVacuum ){
9239         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
9240       }
9241 #endif
9242       checkList(pCheck, 0, pgnoOvfl, nPage);
9243     }
9244 
9245     if( !pPage->leaf ){
9246       /* Check sanity of left child page for internal pages */
9247       pgno = get4byte(pCell);
9248 #ifndef SQLITE_OMIT_AUTOVACUUM
9249       if( pBt->autoVacuum ){
9250         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
9251       }
9252 #endif
9253       d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
9254       keyCanBeEqual = 0;
9255       if( d2!=depth ){
9256         checkAppendMsg(pCheck, "Child page depth differs");
9257         depth = d2;
9258       }
9259     }else{
9260       /* Populate the coverage-checking heap for leaf pages */
9261       btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
9262     }
9263   }
9264   *piMinKey = maxKey;
9265 
9266   /* Check for complete coverage of the page
9267   */
9268   pCheck->zPfx = 0;
9269   if( doCoverageCheck && pCheck->mxErr>0 ){
9270     /* For leaf pages, the min-heap has already been initialized and the
9271     ** cells have already been inserted.  But for internal pages, that has
9272     ** not yet been done, so do it now */
9273     if( !pPage->leaf ){
9274       heap = pCheck->heap;
9275       heap[0] = 0;
9276       for(i=nCell-1; i>=0; i--){
9277         u32 size;
9278         pc = get2byteAligned(&data[cellStart+i*2]);
9279         size = pPage->xCellSize(pPage, &data[pc]);
9280         btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
9281       }
9282     }
9283     /* Add the freeblocks to the min-heap
9284     **
9285     ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
9286     ** is the offset of the first freeblock, or zero if there are no
9287     ** freeblocks on the page.
9288     */
9289     i = get2byte(&data[hdr+1]);
9290     while( i>0 ){
9291       int size, j;
9292       assert( (u32)i<=usableSize-4 );     /* Enforced by btreeInitPage() */
9293       size = get2byte(&data[i+2]);
9294       assert( (u32)(i+size)<=usableSize );  /* Enforced by btreeInitPage() */
9295       btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
9296       /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
9297       ** big-endian integer which is the offset in the b-tree page of the next
9298       ** freeblock in the chain, or zero if the freeblock is the last on the
9299       ** chain. */
9300       j = get2byte(&data[i]);
9301       /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
9302       ** increasing offset. */
9303       assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
9304       assert( (u32)j<=usableSize-4 );   /* Enforced by btreeInitPage() */
9305       i = j;
9306     }
9307     /* Analyze the min-heap looking for overlap between cells and/or
9308     ** freeblocks, and counting the number of untracked bytes in nFrag.
9309     **
9310     ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
9311     ** There is an implied first entry the covers the page header, the cell
9312     ** pointer index, and the gap between the cell pointer index and the start
9313     ** of cell content.
9314     **
9315     ** The loop below pulls entries from the min-heap in order and compares
9316     ** the start_address against the previous end_address.  If there is an
9317     ** overlap, that means bytes are used multiple times.  If there is a gap,
9318     ** that gap is added to the fragmentation count.
9319     */
9320     nFrag = 0;
9321     prev = contentOffset - 1;   /* Implied first min-heap entry */
9322     while( btreeHeapPull(heap,&x) ){
9323       if( (prev&0xffff)>=(x>>16) ){
9324         checkAppendMsg(pCheck,
9325           "Multiple uses for byte %u of page %d", x>>16, iPage);
9326         break;
9327       }else{
9328         nFrag += (x>>16) - (prev&0xffff) - 1;
9329         prev = x;
9330       }
9331     }
9332     nFrag += usableSize - (prev&0xffff) - 1;
9333     /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
9334     ** is stored in the fifth field of the b-tree page header.
9335     ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
9336     ** number of fragmented free bytes within the cell content area.
9337     */
9338     if( heap[0]==0 && nFrag!=data[hdr+7] ){
9339       checkAppendMsg(pCheck,
9340           "Fragmentation of %d bytes reported as %d on page %d",
9341           nFrag, data[hdr+7], iPage);
9342     }
9343   }
9344 
9345 end_of_check:
9346   if( !doCoverageCheck ) pPage->isInit = savedIsInit;
9347   releasePage(pPage);
9348   pCheck->zPfx = saved_zPfx;
9349   pCheck->v1 = saved_v1;
9350   pCheck->v2 = saved_v2;
9351   return depth+1;
9352 }
9353 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9354 
9355 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9356 /*
9357 ** This routine does a complete check of the given BTree file.  aRoot[] is
9358 ** an array of pages numbers were each page number is the root page of
9359 ** a table.  nRoot is the number of entries in aRoot.
9360 **
9361 ** A read-only or read-write transaction must be opened before calling
9362 ** this function.
9363 **
9364 ** Write the number of error seen in *pnErr.  Except for some memory
9365 ** allocation errors,  an error message held in memory obtained from
9366 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
9367 ** returned.  If a memory allocation error occurs, NULL is returned.
9368 */
9369 char *sqlite3BtreeIntegrityCheck(
9370   Btree *p,     /* The btree to be checked */
9371   int *aRoot,   /* An array of root pages numbers for individual trees */
9372   int nRoot,    /* Number of entries in aRoot[] */
9373   int mxErr,    /* Stop reporting errors after this many */
9374   int *pnErr    /* Write number of errors seen to this variable */
9375 ){
9376   Pgno i;
9377   IntegrityCk sCheck;
9378   BtShared *pBt = p->pBt;
9379   int savedDbFlags = pBt->db->flags;
9380   char zErr[100];
9381   VVA_ONLY( int nRef );
9382 
9383   sqlite3BtreeEnter(p);
9384   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
9385   VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
9386   assert( nRef>=0 );
9387   sCheck.pBt = pBt;
9388   sCheck.pPager = pBt->pPager;
9389   sCheck.nPage = btreePagecount(sCheck.pBt);
9390   sCheck.mxErr = mxErr;
9391   sCheck.nErr = 0;
9392   sCheck.mallocFailed = 0;
9393   sCheck.zPfx = 0;
9394   sCheck.v1 = 0;
9395   sCheck.v2 = 0;
9396   sCheck.aPgRef = 0;
9397   sCheck.heap = 0;
9398   sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
9399   sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
9400   if( sCheck.nPage==0 ){
9401     goto integrity_ck_cleanup;
9402   }
9403 
9404   sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
9405   if( !sCheck.aPgRef ){
9406     sCheck.mallocFailed = 1;
9407     goto integrity_ck_cleanup;
9408   }
9409   sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
9410   if( sCheck.heap==0 ){
9411     sCheck.mallocFailed = 1;
9412     goto integrity_ck_cleanup;
9413   }
9414 
9415   i = PENDING_BYTE_PAGE(pBt);
9416   if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
9417 
9418   /* Check the integrity of the freelist
9419   */
9420   sCheck.zPfx = "Main freelist: ";
9421   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
9422             get4byte(&pBt->pPage1->aData[36]));
9423   sCheck.zPfx = 0;
9424 
9425   /* Check all the tables.
9426   */
9427   testcase( pBt->db->flags & SQLITE_CellSizeCk );
9428   pBt->db->flags &= ~SQLITE_CellSizeCk;
9429   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
9430     i64 notUsed;
9431     if( aRoot[i]==0 ) continue;
9432 #ifndef SQLITE_OMIT_AUTOVACUUM
9433     if( pBt->autoVacuum && aRoot[i]>1 ){
9434       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
9435     }
9436 #endif
9437     checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
9438   }
9439   pBt->db->flags = savedDbFlags;
9440 
9441   /* Make sure every page in the file is referenced
9442   */
9443   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
9444 #ifdef SQLITE_OMIT_AUTOVACUUM
9445     if( getPageReferenced(&sCheck, i)==0 ){
9446       checkAppendMsg(&sCheck, "Page %d is never used", i);
9447     }
9448 #else
9449     /* If the database supports auto-vacuum, make sure no tables contain
9450     ** references to pointer-map pages.
9451     */
9452     if( getPageReferenced(&sCheck, i)==0 &&
9453        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
9454       checkAppendMsg(&sCheck, "Page %d is never used", i);
9455     }
9456     if( getPageReferenced(&sCheck, i)!=0 &&
9457        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
9458       checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
9459     }
9460 #endif
9461   }
9462 
9463   /* Clean  up and report errors.
9464   */
9465 integrity_ck_cleanup:
9466   sqlite3PageFree(sCheck.heap);
9467   sqlite3_free(sCheck.aPgRef);
9468   if( sCheck.mallocFailed ){
9469     sqlite3StrAccumReset(&sCheck.errMsg);
9470     sCheck.nErr++;
9471   }
9472   *pnErr = sCheck.nErr;
9473   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
9474   /* Make sure this analysis did not leave any unref() pages. */
9475   assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
9476   sqlite3BtreeLeave(p);
9477   return sqlite3StrAccumFinish(&sCheck.errMsg);
9478 }
9479 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9480 
9481 /*
9482 ** Return the full pathname of the underlying database file.  Return
9483 ** an empty string if the database is in-memory or a TEMP database.
9484 **
9485 ** The pager filename is invariant as long as the pager is
9486 ** open so it is safe to access without the BtShared mutex.
9487 */
9488 const char *sqlite3BtreeGetFilename(Btree *p){
9489   assert( p->pBt->pPager!=0 );
9490   return sqlite3PagerFilename(p->pBt->pPager, 1);
9491 }
9492 
9493 /*
9494 ** Return the pathname of the journal file for this database. The return
9495 ** value of this routine is the same regardless of whether the journal file
9496 ** has been created or not.
9497 **
9498 ** The pager journal filename is invariant as long as the pager is
9499 ** open so it is safe to access without the BtShared mutex.
9500 */
9501 const char *sqlite3BtreeGetJournalname(Btree *p){
9502   assert( p->pBt->pPager!=0 );
9503   return sqlite3PagerJournalname(p->pBt->pPager);
9504 }
9505 
9506 /*
9507 ** Return non-zero if a transaction is active.
9508 */
9509 int sqlite3BtreeIsInTrans(Btree *p){
9510   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
9511   return (p && (p->inTrans==TRANS_WRITE));
9512 }
9513 
9514 #ifndef SQLITE_OMIT_WAL
9515 /*
9516 ** Run a checkpoint on the Btree passed as the first argument.
9517 **
9518 ** Return SQLITE_LOCKED if this or any other connection has an open
9519 ** transaction on the shared-cache the argument Btree is connected to.
9520 **
9521 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
9522 */
9523 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
9524   int rc = SQLITE_OK;
9525   if( p ){
9526     BtShared *pBt = p->pBt;
9527     sqlite3BtreeEnter(p);
9528     if( pBt->inTransaction!=TRANS_NONE ){
9529       rc = SQLITE_LOCKED;
9530     }else{
9531       rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
9532     }
9533     sqlite3BtreeLeave(p);
9534   }
9535   return rc;
9536 }
9537 #endif
9538 
9539 /*
9540 ** Return non-zero if a read (or write) transaction is active.
9541 */
9542 int sqlite3BtreeIsInReadTrans(Btree *p){
9543   assert( p );
9544   assert( sqlite3_mutex_held(p->db->mutex) );
9545   return p->inTrans!=TRANS_NONE;
9546 }
9547 
9548 int sqlite3BtreeIsInBackup(Btree *p){
9549   assert( p );
9550   assert( sqlite3_mutex_held(p->db->mutex) );
9551   return p->nBackup!=0;
9552 }
9553 
9554 /*
9555 ** This function returns a pointer to a blob of memory associated with
9556 ** a single shared-btree. The memory is used by client code for its own
9557 ** purposes (for example, to store a high-level schema associated with
9558 ** the shared-btree). The btree layer manages reference counting issues.
9559 **
9560 ** The first time this is called on a shared-btree, nBytes bytes of memory
9561 ** are allocated, zeroed, and returned to the caller. For each subsequent
9562 ** call the nBytes parameter is ignored and a pointer to the same blob
9563 ** of memory returned.
9564 **
9565 ** If the nBytes parameter is 0 and the blob of memory has not yet been
9566 ** allocated, a null pointer is returned. If the blob has already been
9567 ** allocated, it is returned as normal.
9568 **
9569 ** Just before the shared-btree is closed, the function passed as the
9570 ** xFree argument when the memory allocation was made is invoked on the
9571 ** blob of allocated memory. The xFree function should not call sqlite3_free()
9572 ** on the memory, the btree layer does that.
9573 */
9574 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
9575   BtShared *pBt = p->pBt;
9576   sqlite3BtreeEnter(p);
9577   if( !pBt->pSchema && nBytes ){
9578     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
9579     pBt->xFreeSchema = xFree;
9580   }
9581   sqlite3BtreeLeave(p);
9582   return pBt->pSchema;
9583 }
9584 
9585 /*
9586 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
9587 ** btree as the argument handle holds an exclusive lock on the
9588 ** sqlite_master table. Otherwise SQLITE_OK.
9589 */
9590 int sqlite3BtreeSchemaLocked(Btree *p){
9591   int rc;
9592   assert( sqlite3_mutex_held(p->db->mutex) );
9593   sqlite3BtreeEnter(p);
9594   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
9595   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
9596   sqlite3BtreeLeave(p);
9597   return rc;
9598 }
9599 
9600 
9601 #ifndef SQLITE_OMIT_SHARED_CACHE
9602 /*
9603 ** Obtain a lock on the table whose root page is iTab.  The
9604 ** lock is a write lock if isWritelock is true or a read lock
9605 ** if it is false.
9606 */
9607 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
9608   int rc = SQLITE_OK;
9609   assert( p->inTrans!=TRANS_NONE );
9610   if( p->sharable ){
9611     u8 lockType = READ_LOCK + isWriteLock;
9612     assert( READ_LOCK+1==WRITE_LOCK );
9613     assert( isWriteLock==0 || isWriteLock==1 );
9614 
9615     sqlite3BtreeEnter(p);
9616     rc = querySharedCacheTableLock(p, iTab, lockType);
9617     if( rc==SQLITE_OK ){
9618       rc = setSharedCacheTableLock(p, iTab, lockType);
9619     }
9620     sqlite3BtreeLeave(p);
9621   }
9622   return rc;
9623 }
9624 #endif
9625 
9626 #ifndef SQLITE_OMIT_INCRBLOB
9627 /*
9628 ** Argument pCsr must be a cursor opened for writing on an
9629 ** INTKEY table currently pointing at a valid table entry.
9630 ** This function modifies the data stored as part of that entry.
9631 **
9632 ** Only the data content may only be modified, it is not possible to
9633 ** change the length of the data stored. If this function is called with
9634 ** parameters that attempt to write past the end of the existing data,
9635 ** no modifications are made and SQLITE_CORRUPT is returned.
9636 */
9637 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
9638   int rc;
9639   assert( cursorOwnsBtShared(pCsr) );
9640   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
9641   assert( pCsr->curFlags & BTCF_Incrblob );
9642 
9643   rc = restoreCursorPosition(pCsr);
9644   if( rc!=SQLITE_OK ){
9645     return rc;
9646   }
9647   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
9648   if( pCsr->eState!=CURSOR_VALID ){
9649     return SQLITE_ABORT;
9650   }
9651 
9652   /* Save the positions of all other cursors open on this table. This is
9653   ** required in case any of them are holding references to an xFetch
9654   ** version of the b-tree page modified by the accessPayload call below.
9655   **
9656   ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
9657   ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
9658   ** saveAllCursors can only return SQLITE_OK.
9659   */
9660   VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
9661   assert( rc==SQLITE_OK );
9662 
9663   /* Check some assumptions:
9664   **   (a) the cursor is open for writing,
9665   **   (b) there is a read/write transaction open,
9666   **   (c) the connection holds a write-lock on the table (if required),
9667   **   (d) there are no conflicting read-locks, and
9668   **   (e) the cursor points at a valid row of an intKey table.
9669   */
9670   if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
9671     return SQLITE_READONLY;
9672   }
9673   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
9674               && pCsr->pBt->inTransaction==TRANS_WRITE );
9675   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
9676   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
9677   assert( pCsr->apPage[pCsr->iPage]->intKey );
9678 
9679   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
9680 }
9681 
9682 /*
9683 ** Mark this cursor as an incremental blob cursor.
9684 */
9685 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
9686   pCur->curFlags |= BTCF_Incrblob;
9687   pCur->pBtree->hasIncrblobCur = 1;
9688 }
9689 #endif
9690 
9691 /*
9692 ** Set both the "read version" (single byte at byte offset 18) and
9693 ** "write version" (single byte at byte offset 19) fields in the database
9694 ** header to iVersion.
9695 */
9696 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
9697   BtShared *pBt = pBtree->pBt;
9698   int rc;                         /* Return code */
9699 
9700   assert( iVersion==1 || iVersion==2 );
9701 
9702   /* If setting the version fields to 1, do not automatically open the
9703   ** WAL connection, even if the version fields are currently set to 2.
9704   */
9705   pBt->btsFlags &= ~BTS_NO_WAL;
9706   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
9707 
9708   rc = sqlite3BtreeBeginTrans(pBtree, 0);
9709   if( rc==SQLITE_OK ){
9710     u8 *aData = pBt->pPage1->aData;
9711     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
9712       rc = sqlite3BtreeBeginTrans(pBtree, 2);
9713       if( rc==SQLITE_OK ){
9714         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
9715         if( rc==SQLITE_OK ){
9716           aData[18] = (u8)iVersion;
9717           aData[19] = (u8)iVersion;
9718         }
9719       }
9720     }
9721   }
9722 
9723   pBt->btsFlags &= ~BTS_NO_WAL;
9724   return rc;
9725 }
9726 
9727 /*
9728 ** Return true if the cursor has a hint specified.  This routine is
9729 ** only used from within assert() statements
9730 */
9731 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
9732   return (pCsr->hints & mask)!=0;
9733 }
9734 
9735 /*
9736 ** Return true if the given Btree is read-only.
9737 */
9738 int sqlite3BtreeIsReadonly(Btree *p){
9739   return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
9740 }
9741 
9742 /*
9743 ** Return the size of the header added to each page by this module.
9744 */
9745 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
9746 
9747 #if !defined(SQLITE_OMIT_SHARED_CACHE)
9748 /*
9749 ** Return true if the Btree passed as the only argument is sharable.
9750 */
9751 int sqlite3BtreeSharable(Btree *p){
9752   return p->sharable;
9753 }
9754 
9755 /*
9756 ** Return the number of connections to the BtShared object accessed by
9757 ** the Btree handle passed as the only argument. For private caches
9758 ** this is always 1. For shared caches it may be 1 or greater.
9759 */
9760 int sqlite3BtreeConnectionCount(Btree *p){
9761   testcase( p->sharable );
9762   return p->pBt->nRef;
9763 }
9764 #endif
9765