xref: /sqlite-3.40.0/src/btree.c (revision 9ec82ff2)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements an external (disk-based) database using BTrees.
13 ** See the header comment on "btreeInt.h" for additional information.
14 ** Including a description of file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
18 /*
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
21 */
22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
24 /*
25 ** Set this global variable to 1 to enable tracing using the TRACE
26 ** macro.
27 */
28 #if 0
29 int sqlite3BtreeTrace=1;  /* True to enable tracing */
30 # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
31 #else
32 # define TRACE(X)
33 #endif
34 
35 /*
36 ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
37 ** But if the value is zero, make it 65536.
38 **
39 ** This routine is used to extract the "offset to cell content area" value
40 ** from the header of a btree page.  If the page size is 65536 and the page
41 ** is empty, the offset should be 65536, but the 2-byte value stores zero.
42 ** This routine makes the necessary adjustment to 65536.
43 */
44 #define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)
45 
46 /*
47 ** Values passed as the 5th argument to allocateBtreePage()
48 */
49 #define BTALLOC_ANY   0           /* Allocate any page */
50 #define BTALLOC_EXACT 1           /* Allocate exact page if possible */
51 #define BTALLOC_LE    2           /* Allocate any page <= the parameter */
52 
53 /*
54 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not
55 ** defined, or 0 if it is. For example:
56 **
57 **   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
58 */
59 #ifndef SQLITE_OMIT_AUTOVACUUM
60 #define IfNotOmitAV(expr) (expr)
61 #else
62 #define IfNotOmitAV(expr) 0
63 #endif
64 
65 #ifndef SQLITE_OMIT_SHARED_CACHE
66 /*
67 ** A list of BtShared objects that are eligible for participation
68 ** in shared cache.  This variable has file scope during normal builds,
69 ** but the test harness needs to access it so we make it global for
70 ** test builds.
71 **
72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
73 */
74 #ifdef SQLITE_TEST
75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
76 #else
77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
78 #endif
79 #endif /* SQLITE_OMIT_SHARED_CACHE */
80 
81 #ifndef SQLITE_OMIT_SHARED_CACHE
82 /*
83 ** Enable or disable the shared pager and schema features.
84 **
85 ** This routine has no effect on existing database connections.
86 ** The shared cache setting affects only future calls to
87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
88 */
89 int sqlite3_enable_shared_cache(int enable){
90   sqlite3GlobalConfig.sharedCacheEnabled = enable;
91   return SQLITE_OK;
92 }
93 #endif
94 
95 
96 
97 #ifdef SQLITE_OMIT_SHARED_CACHE
98   /*
99   ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
100   ** and clearAllSharedCacheTableLocks()
101   ** manipulate entries in the BtShared.pLock linked list used to store
102   ** shared-cache table level locks. If the library is compiled with the
103   ** shared-cache feature disabled, then there is only ever one user
104   ** of each BtShared structure and so this locking is not necessary.
105   ** So define the lock related functions as no-ops.
106   */
107   #define querySharedCacheTableLock(a,b,c) SQLITE_OK
108   #define setSharedCacheTableLock(a,b,c) SQLITE_OK
109   #define clearAllSharedCacheTableLocks(a)
110   #define downgradeAllSharedCacheTableLocks(a)
111   #define hasSharedCacheTableLock(a,b,c,d) 1
112   #define hasReadConflicts(a, b) 0
113 #endif
114 
115 #ifndef SQLITE_OMIT_SHARED_CACHE
116 
117 #ifdef SQLITE_DEBUG
118 /*
119 **** This function is only used as part of an assert() statement. ***
120 **
121 ** Check to see if pBtree holds the required locks to read or write to the
122 ** table with root page iRoot.   Return 1 if it does and 0 if not.
123 **
124 ** For example, when writing to a table with root-page iRoot via
125 ** Btree connection pBtree:
126 **
127 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
128 **
129 ** When writing to an index that resides in a sharable database, the
130 ** caller should have first obtained a lock specifying the root page of
131 ** the corresponding table. This makes things a bit more complicated,
132 ** as this module treats each table as a separate structure. To determine
133 ** the table corresponding to the index being written, this
134 ** function has to search through the database schema.
135 **
136 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
137 ** hold a write-lock on the schema table (root page 1). This is also
138 ** acceptable.
139 */
140 static int hasSharedCacheTableLock(
141   Btree *pBtree,         /* Handle that must hold lock */
142   Pgno iRoot,            /* Root page of b-tree */
143   int isIndex,           /* True if iRoot is the root of an index b-tree */
144   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
145 ){
146   Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
147   Pgno iTab = 0;
148   BtLock *pLock;
149 
150   /* If this database is not shareable, or if the client is reading
151   ** and has the read-uncommitted flag set, then no lock is required.
152   ** Return true immediately.
153   */
154   if( (pBtree->sharable==0)
155    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
156   ){
157     return 1;
158   }
159 
160   /* If the client is reading  or writing an index and the schema is
161   ** not loaded, then it is too difficult to actually check to see if
162   ** the correct locks are held.  So do not bother - just return true.
163   ** This case does not come up very often anyhow.
164   */
165   if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
166     return 1;
167   }
168 
169   /* Figure out the root-page that the lock should be held on. For table
170   ** b-trees, this is just the root page of the b-tree being read or
171   ** written. For index b-trees, it is the root page of the associated
172   ** table.  */
173   if( isIndex ){
174     HashElem *p;
175     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
176       Index *pIdx = (Index *)sqliteHashData(p);
177       if( pIdx->tnum==(int)iRoot ){
178         if( iTab ){
179           /* Two or more indexes share the same root page.  There must
180           ** be imposter tables.  So just return true.  The assert is not
181           ** useful in that case. */
182           return 1;
183         }
184         iTab = pIdx->pTable->tnum;
185       }
186     }
187   }else{
188     iTab = iRoot;
189   }
190 
191   /* Search for the required lock. Either a write-lock on root-page iTab, a
192   ** write-lock on the schema table, or (if the client is reading) a
193   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
194   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
195     if( pLock->pBtree==pBtree
196      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
197      && pLock->eLock>=eLockType
198     ){
199       return 1;
200     }
201   }
202 
203   /* Failed to find the required lock. */
204   return 0;
205 }
206 #endif /* SQLITE_DEBUG */
207 
208 #ifdef SQLITE_DEBUG
209 /*
210 **** This function may be used as part of assert() statements only. ****
211 **
212 ** Return true if it would be illegal for pBtree to write into the
213 ** table or index rooted at iRoot because other shared connections are
214 ** simultaneously reading that same table or index.
215 **
216 ** It is illegal for pBtree to write if some other Btree object that
217 ** shares the same BtShared object is currently reading or writing
218 ** the iRoot table.  Except, if the other Btree object has the
219 ** read-uncommitted flag set, then it is OK for the other object to
220 ** have a read cursor.
221 **
222 ** For example, before writing to any part of the table or index
223 ** rooted at page iRoot, one should call:
224 **
225 **    assert( !hasReadConflicts(pBtree, iRoot) );
226 */
227 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
228   BtCursor *p;
229   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
230     if( p->pgnoRoot==iRoot
231      && p->pBtree!=pBtree
232      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
233     ){
234       return 1;
235     }
236   }
237   return 0;
238 }
239 #endif    /* #ifdef SQLITE_DEBUG */
240 
241 /*
242 ** Query to see if Btree handle p may obtain a lock of type eLock
243 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
244 ** SQLITE_OK if the lock may be obtained (by calling
245 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
246 */
247 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
248   BtShared *pBt = p->pBt;
249   BtLock *pIter;
250 
251   assert( sqlite3BtreeHoldsMutex(p) );
252   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
253   assert( p->db!=0 );
254   assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
255 
256   /* If requesting a write-lock, then the Btree must have an open write
257   ** transaction on this file. And, obviously, for this to be so there
258   ** must be an open write transaction on the file itself.
259   */
260   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
261   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
262 
263   /* This routine is a no-op if the shared-cache is not enabled */
264   if( !p->sharable ){
265     return SQLITE_OK;
266   }
267 
268   /* If some other connection is holding an exclusive lock, the
269   ** requested lock may not be obtained.
270   */
271   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
272     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
273     return SQLITE_LOCKED_SHAREDCACHE;
274   }
275 
276   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
277     /* The condition (pIter->eLock!=eLock) in the following if(...)
278     ** statement is a simplification of:
279     **
280     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
281     **
282     ** since we know that if eLock==WRITE_LOCK, then no other connection
283     ** may hold a WRITE_LOCK on any table in this file (since there can
284     ** only be a single writer).
285     */
286     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
287     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
288     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
289       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
290       if( eLock==WRITE_LOCK ){
291         assert( p==pBt->pWriter );
292         pBt->btsFlags |= BTS_PENDING;
293       }
294       return SQLITE_LOCKED_SHAREDCACHE;
295     }
296   }
297   return SQLITE_OK;
298 }
299 #endif /* !SQLITE_OMIT_SHARED_CACHE */
300 
301 #ifndef SQLITE_OMIT_SHARED_CACHE
302 /*
303 ** Add a lock on the table with root-page iTable to the shared-btree used
304 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
305 ** WRITE_LOCK.
306 **
307 ** This function assumes the following:
308 **
309 **   (a) The specified Btree object p is connected to a sharable
310 **       database (one with the BtShared.sharable flag set), and
311 **
312 **   (b) No other Btree objects hold a lock that conflicts
313 **       with the requested lock (i.e. querySharedCacheTableLock() has
314 **       already been called and returned SQLITE_OK).
315 **
316 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
317 ** is returned if a malloc attempt fails.
318 */
319 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
320   BtShared *pBt = p->pBt;
321   BtLock *pLock = 0;
322   BtLock *pIter;
323 
324   assert( sqlite3BtreeHoldsMutex(p) );
325   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
326   assert( p->db!=0 );
327 
328   /* A connection with the read-uncommitted flag set will never try to
329   ** obtain a read-lock using this function. The only read-lock obtained
330   ** by a connection in read-uncommitted mode is on the sqlite_master
331   ** table, and that lock is obtained in BtreeBeginTrans().  */
332   assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
333 
334   /* This function should only be called on a sharable b-tree after it
335   ** has been determined that no other b-tree holds a conflicting lock.  */
336   assert( p->sharable );
337   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
338 
339   /* First search the list for an existing lock on this table. */
340   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
341     if( pIter->iTable==iTable && pIter->pBtree==p ){
342       pLock = pIter;
343       break;
344     }
345   }
346 
347   /* If the above search did not find a BtLock struct associating Btree p
348   ** with table iTable, allocate one and link it into the list.
349   */
350   if( !pLock ){
351     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
352     if( !pLock ){
353       return SQLITE_NOMEM_BKPT;
354     }
355     pLock->iTable = iTable;
356     pLock->pBtree = p;
357     pLock->pNext = pBt->pLock;
358     pBt->pLock = pLock;
359   }
360 
361   /* Set the BtLock.eLock variable to the maximum of the current lock
362   ** and the requested lock. This means if a write-lock was already held
363   ** and a read-lock requested, we don't incorrectly downgrade the lock.
364   */
365   assert( WRITE_LOCK>READ_LOCK );
366   if( eLock>pLock->eLock ){
367     pLock->eLock = eLock;
368   }
369 
370   return SQLITE_OK;
371 }
372 #endif /* !SQLITE_OMIT_SHARED_CACHE */
373 
374 #ifndef SQLITE_OMIT_SHARED_CACHE
375 /*
376 ** Release all the table locks (locks obtained via calls to
377 ** the setSharedCacheTableLock() procedure) held by Btree object p.
378 **
379 ** This function assumes that Btree p has an open read or write
380 ** transaction. If it does not, then the BTS_PENDING flag
381 ** may be incorrectly cleared.
382 */
383 static void clearAllSharedCacheTableLocks(Btree *p){
384   BtShared *pBt = p->pBt;
385   BtLock **ppIter = &pBt->pLock;
386 
387   assert( sqlite3BtreeHoldsMutex(p) );
388   assert( p->sharable || 0==*ppIter );
389   assert( p->inTrans>0 );
390 
391   while( *ppIter ){
392     BtLock *pLock = *ppIter;
393     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
394     assert( pLock->pBtree->inTrans>=pLock->eLock );
395     if( pLock->pBtree==p ){
396       *ppIter = pLock->pNext;
397       assert( pLock->iTable!=1 || pLock==&p->lock );
398       if( pLock->iTable!=1 ){
399         sqlite3_free(pLock);
400       }
401     }else{
402       ppIter = &pLock->pNext;
403     }
404   }
405 
406   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
407   if( pBt->pWriter==p ){
408     pBt->pWriter = 0;
409     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
410   }else if( pBt->nTransaction==2 ){
411     /* This function is called when Btree p is concluding its
412     ** transaction. If there currently exists a writer, and p is not
413     ** that writer, then the number of locks held by connections other
414     ** than the writer must be about to drop to zero. In this case
415     ** set the BTS_PENDING flag to 0.
416     **
417     ** If there is not currently a writer, then BTS_PENDING must
418     ** be zero already. So this next line is harmless in that case.
419     */
420     pBt->btsFlags &= ~BTS_PENDING;
421   }
422 }
423 
424 /*
425 ** This function changes all write-locks held by Btree p into read-locks.
426 */
427 static void downgradeAllSharedCacheTableLocks(Btree *p){
428   BtShared *pBt = p->pBt;
429   if( pBt->pWriter==p ){
430     BtLock *pLock;
431     pBt->pWriter = 0;
432     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
433     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
434       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
435       pLock->eLock = READ_LOCK;
436     }
437   }
438 }
439 
440 #endif /* SQLITE_OMIT_SHARED_CACHE */
441 
442 static void releasePage(MemPage *pPage);  /* Forward reference */
443 
444 /*
445 ***** This routine is used inside of assert() only ****
446 **
447 ** Verify that the cursor holds the mutex on its BtShared
448 */
449 #ifdef SQLITE_DEBUG
450 static int cursorHoldsMutex(BtCursor *p){
451   return sqlite3_mutex_held(p->pBt->mutex);
452 }
453 
454 /* Verify that the cursor and the BtShared agree about what is the current
455 ** database connection. This is important in shared-cache mode. If the database
456 ** connection pointers get out-of-sync, it is possible for routines like
457 ** btreeInitPage() to reference a stale connection pointer that references
458 ** a connection that has already closed.  This routine is used inside assert()
459 ** statements only and for the purpose of double-checking that the btree code
460 ** does keep the database connection pointers up-to-date.
461 */
462 static int cursorOwnsBtShared(BtCursor *p){
463   assert( cursorHoldsMutex(p) );
464   return (p->pBtree->db==p->pBt->db);
465 }
466 #endif
467 
468 /*
469 ** Invalidate the overflow page-list cache of the cursor passed as the
470 ** only argument.
471 */
/* Clears the BTCF_ValidOvfl bit in the cursor's curFlags and evaluates to
** the updated flag value.  The argument is parenthesized so that any
** pointer-valued expression (for example "p+1") may be passed safely. */
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
473 
474 /*
475 ** Invalidate the overflow page-list cache for all cursors opened
476 ** on the shared btree structure pBt.
477 */
478 static void invalidateAllOverflowCache(BtShared *pBt){
479   BtCursor *p;
480   assert( sqlite3_mutex_held(pBt->mutex) );
481   for(p=pBt->pCursor; p; p=p->pNext){
482     invalidateOverflowCache(p);
483   }
484 }
485 
486 #ifndef SQLITE_OMIT_INCRBLOB
487 /*
488 ** This function is called before modifying the contents of a table
489 ** to invalidate any incrblob cursors that are open on the
490 ** row or one of the rows being modified.
491 **
492 ** If argument isClearTable is true, then the entire contents of the
493 ** table is about to be deleted. In this case invalidate all incrblob
494 ** cursors open on any row within the table with root-page pgnoRoot.
495 **
496 ** Otherwise, if argument isClearTable is false, then the row with
497 ** rowid iRow is being replaced or deleted. In this case invalidate
498 ** only those incrblob cursors open on that specific row.
499 */
500 static void invalidateIncrblobCursors(
501   Btree *pBtree,          /* The database file to check */
502   i64 iRow,               /* The rowid that might be changing */
503   int isClearTable        /* True if all rows are being deleted */
504 ){
505   BtCursor *p;
506   if( pBtree->hasIncrblobCur==0 ) return;
507   assert( sqlite3BtreeHoldsMutex(pBtree) );
508   pBtree->hasIncrblobCur = 0;
509   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
510     if( (p->curFlags & BTCF_Incrblob)!=0 ){
511       pBtree->hasIncrblobCur = 1;
512       if( isClearTable || p->info.nKey==iRow ){
513         p->eState = CURSOR_INVALID;
514       }
515     }
516   }
517 }
518 
519 #else
520   /* Stub function when INCRBLOB is omitted */
521   #define invalidateIncrblobCursors(x,y,z)
522 #endif /* SQLITE_OMIT_INCRBLOB */
523 
524 /*
525 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
526 ** when a page that previously contained data becomes a free-list leaf
527 ** page.
528 **
529 ** The BtShared.pHasContent bitvec exists to work around an obscure
530 ** bug caused by the interaction of two useful IO optimizations surrounding
531 ** free-list leaf pages:
532 **
533 **   1) When all data is deleted from a page and the page becomes
534 **      a free-list leaf page, the page is not written to the database
535 **      (as free-list leaf pages contain no meaningful data). Sometimes
536 **      such a page is not even journalled (as it will not be modified,
537 **      why bother journalling it?).
538 **
539 **   2) When a free-list leaf page is reused, its content is not read
540 **      from the database or written to the journal file (why should it
541 **      be, if it is not at all meaningful?).
542 **
543 ** By themselves, these optimizations work fine and provide a handy
544 ** performance boost to bulk delete or insert operations. However, if
545 ** a page is moved to the free-list and then reused within the same
546 ** transaction, a problem comes up. If the page is not journalled when
547 ** it is moved to the free-list and it is also not journalled when it
548 ** is extracted from the free-list and reused, then the original data
549 ** may be lost. In the event of a rollback, it may not be possible
550 ** to restore the database to its original configuration.
551 **
552 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
553 ** moved to become a free-list leaf page, the corresponding bit is
554 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
555 ** optimization 2 above is omitted if the corresponding bit is already
556 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
557 ** at the end of every transaction.
558 */
559 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
560   int rc = SQLITE_OK;
561   if( !pBt->pHasContent ){
562     assert( pgno<=pBt->nPage );
563     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
564     if( !pBt->pHasContent ){
565       rc = SQLITE_NOMEM_BKPT;
566     }
567   }
568   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
569     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
570   }
571   return rc;
572 }
573 
574 /*
575 ** Query the BtShared.pHasContent vector.
576 **
577 ** This function is called when a free-list leaf page is removed from the
578 ** free-list for reuse. It returns false if it is safe to retrieve the
579 ** page from the pager layer with the 'no-content' flag set. True otherwise.
580 */
581 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
582   Bitvec *p = pBt->pHasContent;
583   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
584 }
585 
586 /*
587 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
588 ** invoked at the conclusion of each write-transaction.
589 */
590 static void btreeClearHasContent(BtShared *pBt){
591   sqlite3BitvecDestroy(pBt->pHasContent);
592   pBt->pHasContent = 0;
593 }
594 
595 /*
596 ** Release all of the apPage[] pages for a cursor.
597 */
598 static void btreeReleaseAllCursorPages(BtCursor *pCur){
599   int i;
600   for(i=0; i<=pCur->iPage; i++){
601     releasePage(pCur->apPage[i]);
602     pCur->apPage[i] = 0;
603   }
604   pCur->iPage = -1;
605 }
606 
607 /*
608 ** The cursor passed as the only argument must point to a valid entry
609 ** when this function is called (i.e. have eState==CURSOR_VALID). This
610 ** function saves the current cursor key in variables pCur->nKey and
611 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error
612 ** code otherwise.
613 **
614 ** If the cursor is open on an intkey table, then the integer key
615 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
616 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is
617 ** set to point to a malloced buffer pCur->nKey bytes in size containing
618 ** the key.
619 */
620 static int saveCursorKey(BtCursor *pCur){
621   int rc = SQLITE_OK;
622   assert( CURSOR_VALID==pCur->eState );
623   assert( 0==pCur->pKey );
624   assert( cursorHoldsMutex(pCur) );
625 
626   if( pCur->curIntKey ){
627     /* Only the rowid is required for a table btree */
628     pCur->nKey = sqlite3BtreeIntegerKey(pCur);
629   }else{
630     /* For an index btree, save the complete key content */
631     void *pKey;
632     pCur->nKey = sqlite3BtreePayloadSize(pCur);
633     pKey = sqlite3Malloc( pCur->nKey );
634     if( pKey ){
635       rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
636       if( rc==SQLITE_OK ){
637         pCur->pKey = pKey;
638       }else{
639         sqlite3_free(pKey);
640       }
641     }else{
642       rc = SQLITE_NOMEM_BKPT;
643     }
644   }
645   assert( !pCur->curIntKey || !pCur->pKey );
646   return rc;
647 }
648 
649 /*
650 ** Save the current cursor position in the variables BtCursor.nKey
651 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
652 **
653 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
654 ** prior to calling this routine.
655 */
656 static int saveCursorPosition(BtCursor *pCur){
657   int rc;
658 
659   assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
660   assert( 0==pCur->pKey );
661   assert( cursorHoldsMutex(pCur) );
662 
663   if( pCur->eState==CURSOR_SKIPNEXT ){
664     pCur->eState = CURSOR_VALID;
665   }else{
666     pCur->skipNext = 0;
667   }
668 
669   rc = saveCursorKey(pCur);
670   if( rc==SQLITE_OK ){
671     btreeReleaseAllCursorPages(pCur);
672     pCur->eState = CURSOR_REQUIRESEEK;
673   }
674 
675   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
676   return rc;
677 }
678 
679 /* Forward reference */
680 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
681 
682 /*
683 ** Save the positions of all cursors (except pExcept) that are open on
684 ** the table with root-page iRoot.  "Saving the cursor position" means that
685 ** the location in the btree is remembered in such a way that it can be
686 ** moved back to the same spot after the btree has been modified.  This
687 ** routine is called just before cursor pExcept is used to modify the
688 ** table, for example in BtreeDelete() or BtreeInsert().
689 **
690 ** If there are two or more cursors on the same btree, then all such
691 ** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
692 ** routine enforces that rule.  This routine only needs to be called in
693 ** the uncommon case when pExcept has the BTCF_Multiple flag set.
694 **
695 ** If pExcept!=NULL and if no other cursors are found on the same root-page,
696 ** then the BTCF_Multiple flag on pExcept is cleared, to avoid another
697 ** pointless call to this routine.
698 **
699 ** Implementation note:  This routine merely checks to see if any cursors
700 ** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
701 ** event that cursors are in need of being saved.
702 */
703 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
704   BtCursor *p;
705   assert( sqlite3_mutex_held(pBt->mutex) );
706   assert( pExcept==0 || pExcept->pBt==pBt );
707   for(p=pBt->pCursor; p; p=p->pNext){
708     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
709   }
710   if( p ) return saveCursorsOnList(p, iRoot, pExcept);
711   if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
712   return SQLITE_OK;
713 }
714 
715 /* This helper routine to saveAllCursors does the actual work of saving
716 ** the cursors if and when a cursor is found that actually requires saving.
717 ** The common case is that no cursors need to be saved, so this routine is
718 ** broken out from its caller to avoid unnecessary stack pointer movement.
719 */
720 static int SQLITE_NOINLINE saveCursorsOnList(
721   BtCursor *p,         /* The first cursor that needs saving */
722   Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
723   BtCursor *pExcept    /* Do not save this cursor */
724 ){
725   do{
726     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
727       if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
728         int rc = saveCursorPosition(p);
729         if( SQLITE_OK!=rc ){
730           return rc;
731         }
732       }else{
733         testcase( p->iPage>0 );
734         btreeReleaseAllCursorPages(p);
735       }
736     }
737     p = p->pNext;
738   }while( p );
739   return SQLITE_OK;
740 }
741 
742 /*
743 ** Clear the current cursor position.
744 */
745 void sqlite3BtreeClearCursor(BtCursor *pCur){
746   assert( cursorHoldsMutex(pCur) );
747   sqlite3_free(pCur->pKey);
748   pCur->pKey = 0;
749   pCur->eState = CURSOR_INVALID;
750 }
751 
752 /*
753 ** In this version of BtreeMoveto, pKey is a packed index record
754 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
755 ** record and then call BtreeMovetoUnpacked() to do the work.
756 */
757 static int btreeMoveto(
758   BtCursor *pCur,     /* Cursor open on the btree to be searched */
759   const void *pKey,   /* Packed key if the btree is an index */
760   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
761   int bias,           /* Bias search to the high end */
762   int *pRes           /* Write search results here */
763 ){
764   int rc;                    /* Status code */
765   UnpackedRecord *pIdxKey;   /* Unpacked index key */
766 
767   if( pKey ){
768     assert( nKey==(i64)(int)nKey );
769     pIdxKey = sqlite3VdbeAllocUnpackedRecord(pCur->pKeyInfo);
770     if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
771     sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
772     if( pIdxKey->nField==0 ){
773       rc = SQLITE_CORRUPT_BKPT;
774       goto moveto_done;
775     }
776   }else{
777     pIdxKey = 0;
778   }
779   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
780 moveto_done:
781   if( pIdxKey ){
782     sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
783   }
784   return rc;
785 }
786 
787 /*
788 ** Restore the cursor to the position it was in (or as close to as possible)
789 ** when saveCursorPosition() was called. Note that this call deletes the
790 ** saved position info stored by saveCursorPosition(), so there can be
791 ** at most one effective restoreCursorPosition() call after each
792 ** saveCursorPosition().
793 */
794 static int btreeRestoreCursorPosition(BtCursor *pCur){
795   int rc;
796   int skipNext;
797   assert( cursorOwnsBtShared(pCur) );
798   assert( pCur->eState>=CURSOR_REQUIRESEEK );
799   if( pCur->eState==CURSOR_FAULT ){
800     return pCur->skipNext;
801   }
802   pCur->eState = CURSOR_INVALID;
803   rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
804   if( rc==SQLITE_OK ){
805     sqlite3_free(pCur->pKey);
806     pCur->pKey = 0;
807     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
808     pCur->skipNext |= skipNext;
809     if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
810       pCur->eState = CURSOR_SKIPNEXT;
811     }
812   }
813   return rc;
814 }
815 
/* Restore the cursor position only when the cursor state is
** CURSOR_REQUIRESEEK or greater; otherwise evaluate to SQLITE_OK without
** calling anything.  The argument is parenthesized so that any
** pointer-valued expression may be passed.  Note that it can be evaluated
** twice, so it must not have side effects. */
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
820 
821 /*
822 ** Determine whether or not a cursor has moved from the position where
823 ** it was last placed, or has been invalidated for any other reason.
824 ** Cursors can move when the row they are pointing at is deleted out
825 ** from under them, for example.  Cursor might also move if a btree
826 ** is rebalanced.
827 **
828 ** Calling this routine with a NULL cursor pointer returns false.
829 **
830 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
831 ** back to where it ought to be if this routine returns true.
832 */
833 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
834   return pCur->eState!=CURSOR_VALID;
835 }
836 
837 /*
838 ** This routine restores a cursor back to its original position after it
839 ** has been moved by some outside activity (such as a btree rebalance or
840 ** a row having been deleted out from under the cursor).
841 **
842 ** On success, the *pDifferentRow parameter is false if the cursor is left
843 ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
844 ** was pointing to has been deleted, forcing the cursor to point to some
845 ** nearby row.
846 **
847 ** This routine should only be called for a cursor that just returned
848 ** TRUE from sqlite3BtreeCursorHasMoved().
849 */
850 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
851   int rc;
852 
853   assert( pCur!=0 );
854   assert( pCur->eState!=CURSOR_VALID );
855   rc = restoreCursorPosition(pCur);
856   if( rc ){
857     *pDifferentRow = 1;
858     return rc;
859   }
860   if( pCur->eState!=CURSOR_VALID ){
861     *pDifferentRow = 1;
862   }else{
863     assert( pCur->skipNext==0 );
864     *pDifferentRow = 0;
865   }
866   return SQLITE_OK;
867 }
868 
869 #ifdef SQLITE_ENABLE_CURSOR_HINTS
870 /*
871 ** Provide hints to the cursor.  The particular hint given (and the type
872 ** and number of the varargs parameters) is determined by the eHintType
873 ** parameter.  See the definitions of the BTREE_HINT_* macros for details.
874 */
875 void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
876   /* Used only by system that substitute their own storage engine */
877 }
878 #endif
879 
880 /*
881 ** Provide flag hints to the cursor.
882 */
883 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
884   assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
885   pCur->hints = x;
886 }
887 
888 
889 #ifndef SQLITE_OMIT_AUTOVACUUM
890 /*
891 ** Given a page number of a regular database page, return the page
892 ** number for the pointer-map page that contains the entry for the
893 ** input page number.
894 **
895 ** Return 0 (not a valid page) for pgno==1 since there is
896 ** no pointer map associated with page 1.  The integrity_check logic
897 ** requires that ptrmapPageno(*,1)!=1.
898 */
899 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
900   int nPagesPerMapPage;
901   Pgno iPtrMap, ret;
902   assert( sqlite3_mutex_held(pBt->mutex) );
903   if( pgno<2 ) return 0;
904   nPagesPerMapPage = (pBt->usableSize/5)+1;
905   iPtrMap = (pgno-2)/nPagesPerMapPage;
906   ret = (iPtrMap*nPagesPerMapPage) + 2;
907   if( ret==PENDING_BYTE_PAGE(pBt) ){
908     ret++;
909   }
910   return ret;
911 }
912 
913 /*
914 ** Write an entry into the pointer map.
915 **
916 ** This routine updates the pointer map entry for page number 'key'
917 ** so that it maps to type 'eType' and parent page number 'pgno'.
918 **
919 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
920 ** a no-op.  If an error occurs, the appropriate error code is written
921 ** into *pRC.
922 */
923 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
924   DbPage *pDbPage;  /* The pointer map page */
925   u8 *pPtrmap;      /* The pointer map data */
926   Pgno iPtrmap;     /* The pointer map page number */
927   int offset;       /* Offset in pointer map page */
928   int rc;           /* Return code from subfunctions */
929 
930   if( *pRC ) return;
931 
932   assert( sqlite3_mutex_held(pBt->mutex) );
933   /* The master-journal page number must never be used as a pointer map page */
934   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
935 
936   assert( pBt->autoVacuum );
937   if( key==0 ){
938     *pRC = SQLITE_CORRUPT_BKPT;
939     return;
940   }
941   iPtrmap = PTRMAP_PAGENO(pBt, key);
942   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
943   if( rc!=SQLITE_OK ){
944     *pRC = rc;
945     return;
946   }
947   offset = PTRMAP_PTROFFSET(iPtrmap, key);
948   if( offset<0 ){
949     *pRC = SQLITE_CORRUPT_BKPT;
950     goto ptrmap_exit;
951   }
952   assert( offset <= (int)pBt->usableSize-5 );
953   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
954 
955   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
956     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
957     *pRC= rc = sqlite3PagerWrite(pDbPage);
958     if( rc==SQLITE_OK ){
959       pPtrmap[offset] = eType;
960       put4byte(&pPtrmap[offset+1], parent);
961     }
962   }
963 
964 ptrmap_exit:
965   sqlite3PagerUnref(pDbPage);
966 }
967 
968 /*
969 ** Read an entry from the pointer map.
970 **
971 ** This routine retrieves the pointer map entry for page 'key', writing
972 ** the type and parent page number to *pEType and *pPgno respectively.
973 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
974 */
975 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
976   DbPage *pDbPage;   /* The pointer map page */
977   int iPtrmap;       /* Pointer map page index */
978   u8 *pPtrmap;       /* Pointer map page data */
979   int offset;        /* Offset of entry in pointer map */
980   int rc;
981 
982   assert( sqlite3_mutex_held(pBt->mutex) );
983 
984   iPtrmap = PTRMAP_PAGENO(pBt, key);
985   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
986   if( rc!=0 ){
987     return rc;
988   }
989   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
990 
991   offset = PTRMAP_PTROFFSET(iPtrmap, key);
992   if( offset<0 ){
993     sqlite3PagerUnref(pDbPage);
994     return SQLITE_CORRUPT_BKPT;
995   }
996   assert( offset <= (int)pBt->usableSize-5 );
997   assert( pEType!=0 );
998   *pEType = pPtrmap[offset];
999   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
1000 
1001   sqlite3PagerUnref(pDbPage);
1002   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
1003   return SQLITE_OK;
1004 }
1005 
1006 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
1007   #define ptrmapPut(w,x,y,z,rc)
1008   #define ptrmapGet(w,x,y,z) SQLITE_OK
1009   #define ptrmapPutOvflPtr(x, y, rc)
1010 #endif
1011 
/*
** Given a btree page P and a cell index I (0 means the first cell on
** the page, 1 means the second cell, and so forth) findCell() returns a
** pointer to the cell content.  The 2-byte entry at P->aCellIdx[2*I] is
** read, masked with P->maskPage, and used as an offset into P->aData.
**
** findCellPastPtr() does the same except that the offset is applied to
** P->aDataOfst, which skips past the initial 4-byte child pointer found
** on interior pages, if there is one.
**
** These macros work only for pages that do not contain overflow cells.
** P is evaluated more than once, so it must not have side effects.
*/
#define findCell(P,I) \
  (&(P)->aData[get2byteAligned(&(P)->aCellIdx[2*(I)]) & (P)->maskPage])
#define findCellPastPtr(P,I) \
  (&(P)->aDataOfst[get2byteAligned(&(P)->aCellIdx[2*(I)]) & (P)->maskPage])
1026 
1027 
1028 /*
1029 ** This is common tail processing for btreeParseCellPtr() and
1030 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
1031 ** on a single B-tree page.  Make necessary adjustments to the CellInfo
1032 ** structure.
1033 */
1034 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
1035   MemPage *pPage,         /* Page containing the cell */
1036   u8 *pCell,              /* Pointer to the cell text. */
1037   CellInfo *pInfo         /* Fill in this structure */
1038 ){
1039   /* If the payload will not fit completely on the local page, we have
1040   ** to decide how much to store locally and how much to spill onto
1041   ** overflow pages.  The strategy is to minimize the amount of unused
1042   ** space on overflow pages while keeping the amount of local storage
1043   ** in between minLocal and maxLocal.
1044   **
1045   ** Warning:  changing the way overflow payload is distributed in any
1046   ** way will result in an incompatible file format.
1047   */
1048   int minLocal;  /* Minimum amount of payload held locally */
1049   int maxLocal;  /* Maximum amount of payload held locally */
1050   int surplus;   /* Overflow payload available for local storage */
1051 
1052   minLocal = pPage->minLocal;
1053   maxLocal = pPage->maxLocal;
1054   surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
1055   testcase( surplus==maxLocal );
1056   testcase( surplus==maxLocal+1 );
1057   if( surplus <= maxLocal ){
1058     pInfo->nLocal = (u16)surplus;
1059   }else{
1060     pInfo->nLocal = (u16)minLocal;
1061   }
1062   pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
1063 }
1064 
1065 /*
1066 ** The following routines are implementations of the MemPage.xParseCell()
1067 ** method.
1068 **
1069 ** Parse a cell content block and fill in the CellInfo structure.
1070 **
1071 ** btreeParseCellPtr()        =>   table btree leaf nodes
1072 ** btreeParseCellNoPayload()  =>   table btree internal nodes
1073 ** btreeParseCellPtrIndex()   =>   index btree nodes
1074 **
1075 ** There is also a wrapper function btreeParseCell() that works for
1076 ** all MemPage types and that references the cell by index rather than
1077 ** by pointer.
1078 */
1079 static void btreeParseCellPtrNoPayload(
1080   MemPage *pPage,         /* Page containing the cell */
1081   u8 *pCell,              /* Pointer to the cell text. */
1082   CellInfo *pInfo         /* Fill in this structure */
1083 ){
1084   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1085   assert( pPage->leaf==0 );
1086   assert( pPage->childPtrSize==4 );
1087 #ifndef SQLITE_DEBUG
1088   UNUSED_PARAMETER(pPage);
1089 #endif
1090   pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
1091   pInfo->nPayload = 0;
1092   pInfo->nLocal = 0;
1093   pInfo->pPayload = 0;
1094   return;
1095 }
1096 static void btreeParseCellPtr(
1097   MemPage *pPage,         /* Page containing the cell */
1098   u8 *pCell,              /* Pointer to the cell text. */
1099   CellInfo *pInfo         /* Fill in this structure */
1100 ){
1101   u8 *pIter;              /* For scanning through pCell */
1102   u32 nPayload;           /* Number of bytes of cell payload */
1103   u64 iKey;               /* Extracted Key value */
1104 
1105   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1106   assert( pPage->leaf==0 || pPage->leaf==1 );
1107   assert( pPage->intKeyLeaf );
1108   assert( pPage->childPtrSize==0 );
1109   pIter = pCell;
1110 
1111   /* The next block of code is equivalent to:
1112   **
1113   **     pIter += getVarint32(pIter, nPayload);
1114   **
1115   ** The code is inlined to avoid a function call.
1116   */
1117   nPayload = *pIter;
1118   if( nPayload>=0x80 ){
1119     u8 *pEnd = &pIter[8];
1120     nPayload &= 0x7f;
1121     do{
1122       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1123     }while( (*pIter)>=0x80 && pIter<pEnd );
1124   }
1125   pIter++;
1126 
1127   /* The next block of code is equivalent to:
1128   **
1129   **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
1130   **
1131   ** The code is inlined to avoid a function call.
1132   */
1133   iKey = *pIter;
1134   if( iKey>=0x80 ){
1135     u8 *pEnd = &pIter[7];
1136     iKey &= 0x7f;
1137     while(1){
1138       iKey = (iKey<<7) | (*++pIter & 0x7f);
1139       if( (*pIter)<0x80 ) break;
1140       if( pIter>=pEnd ){
1141         iKey = (iKey<<8) | *++pIter;
1142         break;
1143       }
1144     }
1145   }
1146   pIter++;
1147 
1148   pInfo->nKey = *(i64*)&iKey;
1149   pInfo->nPayload = nPayload;
1150   pInfo->pPayload = pIter;
1151   testcase( nPayload==pPage->maxLocal );
1152   testcase( nPayload==pPage->maxLocal+1 );
1153   if( nPayload<=pPage->maxLocal ){
1154     /* This is the (easy) common case where the entire payload fits
1155     ** on the local page.  No overflow is required.
1156     */
1157     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1158     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1159     pInfo->nLocal = (u16)nPayload;
1160   }else{
1161     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1162   }
1163 }
1164 static void btreeParseCellPtrIndex(
1165   MemPage *pPage,         /* Page containing the cell */
1166   u8 *pCell,              /* Pointer to the cell text. */
1167   CellInfo *pInfo         /* Fill in this structure */
1168 ){
1169   u8 *pIter;              /* For scanning through pCell */
1170   u32 nPayload;           /* Number of bytes of cell payload */
1171 
1172   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1173   assert( pPage->leaf==0 || pPage->leaf==1 );
1174   assert( pPage->intKeyLeaf==0 );
1175   pIter = pCell + pPage->childPtrSize;
1176   nPayload = *pIter;
1177   if( nPayload>=0x80 ){
1178     u8 *pEnd = &pIter[8];
1179     nPayload &= 0x7f;
1180     do{
1181       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1182     }while( *(pIter)>=0x80 && pIter<pEnd );
1183   }
1184   pIter++;
1185   pInfo->nKey = nPayload;
1186   pInfo->nPayload = nPayload;
1187   pInfo->pPayload = pIter;
1188   testcase( nPayload==pPage->maxLocal );
1189   testcase( nPayload==pPage->maxLocal+1 );
1190   if( nPayload<=pPage->maxLocal ){
1191     /* This is the (easy) common case where the entire payload fits
1192     ** on the local page.  No overflow is required.
1193     */
1194     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1195     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1196     pInfo->nLocal = (u16)nPayload;
1197   }else{
1198     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1199   }
1200 }
1201 static void btreeParseCell(
1202   MemPage *pPage,         /* Page containing the cell */
1203   int iCell,              /* The cell index.  First cell is 0 */
1204   CellInfo *pInfo         /* Fill in this structure */
1205 ){
1206   pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
1207 }
1208 
1209 /*
1210 ** The following routines are implementations of the MemPage.xCellSize
1211 ** method.
1212 **
1213 ** Compute the total number of bytes that a Cell needs in the cell
1214 ** data area of the btree-page.  The return number includes the cell
1215 ** data header and the local payload, but not any overflow page or
1216 ** the space used by the cell pointer.
1217 **
1218 ** cellSizePtrNoPayload()    =>   table internal nodes
1219 ** cellSizePtr()             =>   all index nodes & table leaf nodes
1220 */
1221 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
1222   u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
1223   u8 *pEnd;                                /* End mark for a varint */
1224   u32 nSize;                               /* Size value to return */
1225 
1226 #ifdef SQLITE_DEBUG
1227   /* The value returned by this function should always be the same as
1228   ** the (CellInfo.nSize) value found by doing a full parse of the
1229   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1230   ** this function verifies that this invariant is not violated. */
1231   CellInfo debuginfo;
1232   pPage->xParseCell(pPage, pCell, &debuginfo);
1233 #endif
1234 
1235   nSize = *pIter;
1236   if( nSize>=0x80 ){
1237     pEnd = &pIter[8];
1238     nSize &= 0x7f;
1239     do{
1240       nSize = (nSize<<7) | (*++pIter & 0x7f);
1241     }while( *(pIter)>=0x80 && pIter<pEnd );
1242   }
1243   pIter++;
1244   if( pPage->intKey ){
1245     /* pIter now points at the 64-bit integer key value, a variable length
1246     ** integer. The following block moves pIter to point at the first byte
1247     ** past the end of the key value. */
1248     pEnd = &pIter[9];
1249     while( (*pIter++)&0x80 && pIter<pEnd );
1250   }
1251   testcase( nSize==pPage->maxLocal );
1252   testcase( nSize==pPage->maxLocal+1 );
1253   if( nSize<=pPage->maxLocal ){
1254     nSize += (u32)(pIter - pCell);
1255     if( nSize<4 ) nSize = 4;
1256   }else{
1257     int minLocal = pPage->minLocal;
1258     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1259     testcase( nSize==pPage->maxLocal );
1260     testcase( nSize==pPage->maxLocal+1 );
1261     if( nSize>pPage->maxLocal ){
1262       nSize = minLocal;
1263     }
1264     nSize += 4 + (u16)(pIter - pCell);
1265   }
1266   assert( nSize==debuginfo.nSize || CORRUPT_DB );
1267   return (u16)nSize;
1268 }
1269 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
1270   u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
1271   u8 *pEnd;              /* End mark for a varint */
1272 
1273 #ifdef SQLITE_DEBUG
1274   /* The value returned by this function should always be the same as
1275   ** the (CellInfo.nSize) value found by doing a full parse of the
1276   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1277   ** this function verifies that this invariant is not violated. */
1278   CellInfo debuginfo;
1279   pPage->xParseCell(pPage, pCell, &debuginfo);
1280 #else
1281   UNUSED_PARAMETER(pPage);
1282 #endif
1283 
1284   assert( pPage->childPtrSize==4 );
1285   pEnd = pIter + 9;
1286   while( (*pIter++)&0x80 && pIter<pEnd );
1287   assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
1288   return (u16)(pIter - pCell);
1289 }
1290 
1291 
1292 #ifdef SQLITE_DEBUG
1293 /* This variation on cellSizePtr() is used inside of assert() statements
1294 ** only. */
1295 static u16 cellSize(MemPage *pPage, int iCell){
1296   return pPage->xCellSize(pPage, findCell(pPage, iCell));
1297 }
1298 #endif
1299 
1300 #ifndef SQLITE_OMIT_AUTOVACUUM
1301 /*
1302 ** If the cell pCell, part of page pPage contains a pointer
1303 ** to an overflow page, insert an entry into the pointer-map
1304 ** for the overflow page.
1305 */
1306 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1307   CellInfo info;
1308   if( *pRC ) return;
1309   assert( pCell!=0 );
1310   pPage->xParseCell(pPage, pCell, &info);
1311   if( info.nLocal<info.nPayload ){
1312     Pgno ovfl = get4byte(&pCell[info.nSize-4]);
1313     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1314   }
1315 }
1316 #endif
1317 
1318 
1319 /*
1320 ** Defragment the page given.  All Cells are moved to the
1321 ** end of the page and all free space is collected into one
1322 ** big FreeBlk that occurs in between the header and cell
1323 ** pointer array and the cell content area.
1324 **
1325 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
1326 ** b-tree page so that there are no freeblocks or fragment bytes, all
1327 ** unused bytes are contained in the unallocated space region, and all
1328 ** cells are packed tightly at the end of the page.
1329 */
1330 static int defragmentPage(MemPage *pPage){
1331   int i;                     /* Loop counter */
1332   int pc;                    /* Address of the i-th cell */
1333   int hdr;                   /* Offset to the page header */
1334   int size;                  /* Size of a cell */
1335   int usableSize;            /* Number of usable bytes on a page */
1336   int cellOffset;            /* Offset to the cell pointer array */
1337   int cbrk;                  /* Offset to the cell content area */
1338   int nCell;                 /* Number of cells on the page */
1339   unsigned char *data;       /* The page data */
1340   unsigned char *temp;       /* Temp area for cell content */
1341   unsigned char *src;        /* Source of content */
1342   int iCellFirst;            /* First allowable cell index */
1343   int iCellLast;             /* Last possible cell index */
1344 
1345 
1346   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1347   assert( pPage->pBt!=0 );
1348   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1349   assert( pPage->nOverflow==0 );
1350   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1351   temp = 0;
1352   src = data = pPage->aData;
1353   hdr = pPage->hdrOffset;
1354   cellOffset = pPage->cellOffset;
1355   nCell = pPage->nCell;
1356   assert( nCell==get2byte(&data[hdr+3]) );
1357   usableSize = pPage->pBt->usableSize;
1358   cbrk = usableSize;
1359   iCellFirst = cellOffset + 2*nCell;
1360   iCellLast = usableSize - 4;
1361   for(i=0; i<nCell; i++){
1362     u8 *pAddr;     /* The i-th cell pointer */
1363     pAddr = &data[cellOffset + i*2];
1364     pc = get2byte(pAddr);
1365     testcase( pc==iCellFirst );
1366     testcase( pc==iCellLast );
1367     /* These conditions have already been verified in btreeInitPage()
1368     ** if PRAGMA cell_size_check=ON.
1369     */
1370     if( pc<iCellFirst || pc>iCellLast ){
1371       return SQLITE_CORRUPT_BKPT;
1372     }
1373     assert( pc>=iCellFirst && pc<=iCellLast );
1374     size = pPage->xCellSize(pPage, &src[pc]);
1375     cbrk -= size;
1376     if( cbrk<iCellFirst || pc+size>usableSize ){
1377       return SQLITE_CORRUPT_BKPT;
1378     }
1379     assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
1380     testcase( cbrk+size==usableSize );
1381     testcase( pc+size==usableSize );
1382     put2byte(pAddr, cbrk);
1383     if( temp==0 ){
1384       int x;
1385       if( cbrk==pc ) continue;
1386       temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1387       x = get2byte(&data[hdr+5]);
1388       memcpy(&temp[x], &data[x], (cbrk+size) - x);
1389       src = temp;
1390     }
1391     memcpy(&data[cbrk], &src[pc], size);
1392   }
1393   assert( cbrk>=iCellFirst );
1394   put2byte(&data[hdr+5], cbrk);
1395   data[hdr+1] = 0;
1396   data[hdr+2] = 0;
1397   data[hdr+7] = 0;
1398   memset(&data[iCellFirst], 0, cbrk-iCellFirst);
1399   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1400   if( cbrk-iCellFirst!=pPage->nFree ){
1401     return SQLITE_CORRUPT_BKPT;
1402   }
1403   return SQLITE_OK;
1404 }
1405 
1406 /*
1407 ** Search the free-list on page pPg for space to store a cell nByte bytes in
1408 ** size. If one can be found, return a pointer to the space and remove it
1409 ** from the free-list.
1410 **
1411 ** If no suitable space can be found on the free-list, return NULL.
1412 **
1413 ** This function may detect corruption within pPg.  If corruption is
1414 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
1415 **
1416 ** Slots on the free list that are between 1 and 3 bytes larger than nByte
1417 ** will be ignored if adding the extra space to the fragmentation count
1418 ** causes the fragmentation count to exceed 60.
1419 */
1420 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
1421   const int hdr = pPg->hdrOffset;
1422   u8 * const aData = pPg->aData;
1423   int iAddr = hdr + 1;
1424   int pc = get2byte(&aData[iAddr]);
1425   int x;
1426   int usableSize = pPg->pBt->usableSize;
1427 
1428   assert( pc>0 );
1429   do{
1430     int size;            /* Size of the free slot */
1431     /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
1432     ** increasing offset. */
1433     if( pc>usableSize-4 || pc<iAddr+4 ){
1434       *pRc = SQLITE_CORRUPT_BKPT;
1435       return 0;
1436     }
1437     /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
1438     ** freeblock form a big-endian integer which is the size of the freeblock
1439     ** in bytes, including the 4-byte header. */
1440     size = get2byte(&aData[pc+2]);
1441     if( (x = size - nByte)>=0 ){
1442       testcase( x==4 );
1443       testcase( x==3 );
1444       if( pc < pPg->cellOffset+2*pPg->nCell || size+pc > usableSize ){
1445         *pRc = SQLITE_CORRUPT_BKPT;
1446         return 0;
1447       }else if( x<4 ){
1448         /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
1449         ** number of bytes in fragments may not exceed 60. */
1450         if( aData[hdr+7]>57 ) return 0;
1451 
1452         /* Remove the slot from the free-list. Update the number of
1453         ** fragmented bytes within the page. */
1454         memcpy(&aData[iAddr], &aData[pc], 2);
1455         aData[hdr+7] += (u8)x;
1456       }else{
1457         /* The slot remains on the free-list. Reduce its size to account
1458          ** for the portion used by the new allocation. */
1459         put2byte(&aData[pc+2], x);
1460       }
1461       return &aData[pc + x];
1462     }
1463     iAddr = pc;
1464     pc = get2byte(&aData[pc]);
1465   }while( pc );
1466 
1467   return 0;
1468 }
1469 
1470 /*
1471 ** Allocate nByte bytes of space from within the B-Tree page passed
1472 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1473 ** of the first byte of allocated space. Return either SQLITE_OK or
1474 ** an error code (usually SQLITE_CORRUPT).
1475 **
1476 ** The caller guarantees that there is sufficient space to make the
1477 ** allocation.  This routine might need to defragment in order to bring
1478 ** all the space together, however.  This routine will avoid using
1479 ** the first two bytes past the cell pointer area since presumably this
1480 ** allocation is being made in order to insert a new cell, so we will
1481 ** also end up needing a new cell pointer.
1482 */
1483 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1484   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
1485   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
1486   int top;                             /* First byte of cell content area */
1487   int rc = SQLITE_OK;                  /* Integer return code */
1488   int gap;        /* First byte of gap between cell pointers and cell content */
1489 
1490   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1491   assert( pPage->pBt );
1492   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1493   assert( nByte>=0 );  /* Minimum cell size is 4 */
1494   assert( pPage->nFree>=nByte );
1495   assert( pPage->nOverflow==0 );
1496   assert( nByte < (int)(pPage->pBt->usableSize-8) );
1497 
1498   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1499   gap = pPage->cellOffset + 2*pPage->nCell;
1500   assert( gap<=65536 );
1501   /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
1502   ** and the reserved space is zero (the usual value for reserved space)
1503   ** then the cell content offset of an empty page wants to be 65536.
1504   ** However, that integer is too large to be stored in a 2-byte unsigned
1505   ** integer, so a value of 0 is used in its place. */
1506   top = get2byte(&data[hdr+5]);
1507   assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */
1508   if( gap>top ){
1509     if( top==0 && pPage->pBt->usableSize==65536 ){
1510       top = 65536;
1511     }else{
1512       return SQLITE_CORRUPT_BKPT;
1513     }
1514   }
1515 
1516   /* If there is enough space between gap and top for one more cell pointer
1517   ** array entry offset, and if the freelist is not empty, then search the
1518   ** freelist looking for a free slot big enough to satisfy the request.
1519   */
1520   testcase( gap+2==top );
1521   testcase( gap+1==top );
1522   testcase( gap==top );
1523   if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
1524     u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
1525     if( pSpace ){
1526       assert( pSpace>=data && (pSpace - data)<65536 );
1527       *pIdx = (int)(pSpace - data);
1528       return SQLITE_OK;
1529     }else if( rc ){
1530       return rc;
1531     }
1532   }
1533 
1534   /* The request could not be fulfilled using a freelist slot.  Check
1535   ** to see if defragmentation is necessary.
1536   */
1537   testcase( gap+2+nByte==top );
1538   if( gap+2+nByte>top ){
1539     assert( pPage->nCell>0 || CORRUPT_DB );
1540     rc = defragmentPage(pPage);
1541     if( rc ) return rc;
1542     top = get2byteNotZero(&data[hdr+5]);
1543     assert( gap+nByte<=top );
1544   }
1545 
1546 
1547   /* Allocate memory from the gap in between the cell pointer array
1548   ** and the cell content area.  The btreeInitPage() call has already
1549   ** validated the freelist.  Given that the freelist is valid, there
1550   ** is no way that the allocation can extend off the end of the page.
1551   ** The assert() below verifies the previous sentence.
1552   */
1553   top -= nByte;
1554   put2byte(&data[hdr+5], top);
1555   assert( top+nByte <= (int)pPage->pBt->usableSize );
1556   *pIdx = top;
1557   return SQLITE_OK;
1558 }
1559 
1560 /*
1561 ** Return a section of the pPage->aData to the freelist.
1562 ** The first byte of the new free block is pPage->aData[iStart]
1563 ** and the size of the block is iSize bytes.
1564 **
1565 ** Adjacent freeblocks are coalesced.
1566 **
1567 ** Note that even though the freeblock list was checked by btreeInitPage(),
1568 ** that routine will not detect overlap between cells or freeblocks.  Nor
1569 ** does it detect cells or freeblocks that encrouch into the reserved bytes
1570 ** at the end of the page.  So do additional corruption checks inside this
1571 ** routine and return SQLITE_CORRUPT if any problems are found.
1572 */
1573 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
1574   u16 iPtr;                             /* Address of ptr to next freeblock */
1575   u16 iFreeBlk;                         /* Address of the next freeblock */
1576   u8 hdr;                               /* Page header size.  0 or 100 */
1577   u8 nFrag = 0;                         /* Reduction in fragmentation */
1578   u16 iOrigSize = iSize;                /* Original value of iSize */
1579   u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */
1580   u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
1581   unsigned char *data = pPage->aData;   /* Page content */
1582 
1583   assert( pPage->pBt!=0 );
1584   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1585   assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
1586   assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
1587   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1588   assert( iSize>=4 );   /* Minimum cell size is 4 */
1589   assert( iStart<=iLast );
1590 
1591   /* Overwrite deleted information with zeros when the secure_delete
1592   ** option is enabled */
1593   if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
1594     memset(&data[iStart], 0, iSize);
1595   }
1596 
1597   /* The list of freeblocks must be in ascending order.  Find the
1598   ** spot on the list where iStart should be inserted.
1599   */
1600   hdr = pPage->hdrOffset;
1601   iPtr = hdr + 1;
1602   if( data[iPtr+1]==0 && data[iPtr]==0 ){
1603     iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
1604   }else{
1605     while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
1606       if( iFreeBlk<iPtr+4 ){
1607         if( iFreeBlk==0 ) break;
1608         return SQLITE_CORRUPT_BKPT;
1609       }
1610       iPtr = iFreeBlk;
1611     }
1612     if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT;
1613     assert( iFreeBlk>iPtr || iFreeBlk==0 );
1614 
1615     /* At this point:
1616     **    iFreeBlk:   First freeblock after iStart, or zero if none
1617     **    iPtr:       The address of a pointer to iFreeBlk
1618     **
1619     ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
1620     */
1621     if( iFreeBlk && iEnd+3>=iFreeBlk ){
1622       nFrag = iFreeBlk - iEnd;
1623       if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT;
1624       iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
1625       if( iEnd > pPage->pBt->usableSize ) return SQLITE_CORRUPT_BKPT;
1626       iSize = iEnd - iStart;
1627       iFreeBlk = get2byte(&data[iFreeBlk]);
1628     }
1629 
1630     /* If iPtr is another freeblock (that is, if iPtr is not the freelist
1631     ** pointer in the page header) then check to see if iStart should be
1632     ** coalesced onto the end of iPtr.
1633     */
1634     if( iPtr>hdr+1 ){
1635       int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
1636       if( iPtrEnd+3>=iStart ){
1637         if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT;
1638         nFrag += iStart - iPtrEnd;
1639         iSize = iEnd - iPtr;
1640         iStart = iPtr;
1641       }
1642     }
1643     if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT;
1644     data[hdr+7] -= nFrag;
1645   }
1646   if( iStart==get2byte(&data[hdr+5]) ){
1647     /* The new freeblock is at the beginning of the cell content area,
1648     ** so just extend the cell content area rather than create another
1649     ** freelist entry */
1650     if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT;
1651     put2byte(&data[hdr+1], iFreeBlk);
1652     put2byte(&data[hdr+5], iEnd);
1653   }else{
1654     /* Insert the new freeblock into the freelist */
1655     put2byte(&data[iPtr], iStart);
1656     put2byte(&data[iStart], iFreeBlk);
1657     put2byte(&data[iStart+2], iSize);
1658   }
1659   pPage->nFree += iOrigSize;
1660   return SQLITE_OK;
1661 }
1662 
1663 /*
1664 ** Decode the flags byte (the first byte of the header) for a page
1665 ** and initialize fields of the MemPage structure accordingly.
1666 **
1667 ** Only the following combinations are supported.  Anything different
1668 ** indicates a corrupt database files:
1669 **
1670 **         PTF_ZERODATA
1671 **         PTF_ZERODATA | PTF_LEAF
1672 **         PTF_LEAFDATA | PTF_INTKEY
1673 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1674 */
1675 static int decodeFlags(MemPage *pPage, int flagByte){
1676   BtShared *pBt;     /* A copy of pPage->pBt */
1677 
1678   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1679   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1680   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1681   flagByte &= ~PTF_LEAF;
1682   pPage->childPtrSize = 4-4*pPage->leaf;
1683   pPage->xCellSize = cellSizePtr;
1684   pBt = pPage->pBt;
1685   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1686     /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an
1687     ** interior table b-tree page. */
1688     assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
1689     /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a
1690     ** leaf table b-tree page. */
1691     assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
1692     pPage->intKey = 1;
1693     if( pPage->leaf ){
1694       pPage->intKeyLeaf = 1;
1695       pPage->xParseCell = btreeParseCellPtr;
1696     }else{
1697       pPage->intKeyLeaf = 0;
1698       pPage->xCellSize = cellSizePtrNoPayload;
1699       pPage->xParseCell = btreeParseCellPtrNoPayload;
1700     }
1701     pPage->maxLocal = pBt->maxLeaf;
1702     pPage->minLocal = pBt->minLeaf;
1703   }else if( flagByte==PTF_ZERODATA ){
1704     /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an
1705     ** interior index b-tree page. */
1706     assert( (PTF_ZERODATA)==2 );
1707     /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a
1708     ** leaf index b-tree page. */
1709     assert( (PTF_ZERODATA|PTF_LEAF)==10 );
1710     pPage->intKey = 0;
1711     pPage->intKeyLeaf = 0;
1712     pPage->xParseCell = btreeParseCellPtrIndex;
1713     pPage->maxLocal = pBt->maxLocal;
1714     pPage->minLocal = pBt->minLocal;
1715   }else{
1716     /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
1717     ** an error. */
1718     return SQLITE_CORRUPT_BKPT;
1719   }
1720   pPage->max1bytePayload = pBt->max1bytePayload;
1721   return SQLITE_OK;
1722 }
1723 
1724 /*
1725 ** Initialize the auxiliary information for a disk block.
1726 **
1727 ** Return SQLITE_OK on success.  If we see that the page does
1728 ** not contain a well-formed database page, then return
1729 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1730 ** guarantee that the page is well-formed.  It only shows that
1731 ** we failed to detect any corruption.
1732 */
1733 static int btreeInitPage(MemPage *pPage){
1734 
1735   assert( pPage->pBt!=0 );
1736   assert( pPage->pBt->db!=0 );
1737   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1738   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1739   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1740   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1741 
1742   if( !pPage->isInit ){
1743     int pc;            /* Address of a freeblock within pPage->aData[] */
1744     u8 hdr;            /* Offset to beginning of page header */
1745     u8 *data;          /* Equal to pPage->aData */
1746     BtShared *pBt;        /* The main btree structure */
1747     int usableSize;    /* Amount of usable space on each page */
1748     u16 cellOffset;    /* Offset from start of page to first cell pointer */
1749     int nFree;         /* Number of unused bytes on the page */
1750     int top;           /* First byte of the cell content area */
1751     int iCellFirst;    /* First allowable cell or freeblock offset */
1752     int iCellLast;     /* Last possible cell or freeblock offset */
1753 
1754     pBt = pPage->pBt;
1755 
1756     hdr = pPage->hdrOffset;
1757     data = pPage->aData;
1758     /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
1759     ** the b-tree page type. */
1760     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1761     assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1762     pPage->maskPage = (u16)(pBt->pageSize - 1);
1763     pPage->nOverflow = 0;
1764     usableSize = pBt->usableSize;
1765     pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
1766     pPage->aDataEnd = &data[usableSize];
1767     pPage->aCellIdx = &data[cellOffset];
1768     pPage->aDataOfst = &data[pPage->childPtrSize];
1769     /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
1770     ** the start of the cell content area. A zero value for this integer is
1771     ** interpreted as 65536. */
1772     top = get2byteNotZero(&data[hdr+5]);
1773     /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
1774     ** number of cells on the page. */
1775     pPage->nCell = get2byte(&data[hdr+3]);
1776     if( pPage->nCell>MX_CELL(pBt) ){
1777       /* To many cells for a single page.  The page must be corrupt */
1778       return SQLITE_CORRUPT_BKPT;
1779     }
1780     testcase( pPage->nCell==MX_CELL(pBt) );
1781     /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
1782     ** possible for a root page of a table that contains no rows) then the
1783     ** offset to the cell content area will equal the page size minus the
1784     ** bytes of reserved space. */
1785     assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );
1786 
1787     /* A malformed database page might cause us to read past the end
1788     ** of page when parsing a cell.
1789     **
1790     ** The following block of code checks early to see if a cell extends
1791     ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1792     ** returned if it does.
1793     */
1794     iCellFirst = cellOffset + 2*pPage->nCell;
1795     iCellLast = usableSize - 4;
1796     if( pBt->db->flags & SQLITE_CellSizeCk ){
1797       int i;            /* Index into the cell pointer array */
1798       int sz;           /* Size of a cell */
1799 
1800       if( !pPage->leaf ) iCellLast--;
1801       for(i=0; i<pPage->nCell; i++){
1802         pc = get2byteAligned(&data[cellOffset+i*2]);
1803         testcase( pc==iCellFirst );
1804         testcase( pc==iCellLast );
1805         if( pc<iCellFirst || pc>iCellLast ){
1806           return SQLITE_CORRUPT_BKPT;
1807         }
1808         sz = pPage->xCellSize(pPage, &data[pc]);
1809         testcase( pc+sz==usableSize );
1810         if( pc+sz>usableSize ){
1811           return SQLITE_CORRUPT_BKPT;
1812         }
1813       }
1814       if( !pPage->leaf ) iCellLast++;
1815     }
1816 
1817     /* Compute the total free space on the page
1818     ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
1819     ** start of the first freeblock on the page, or is zero if there are no
1820     ** freeblocks. */
1821     pc = get2byte(&data[hdr+1]);
1822     nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
1823     if( pc>0 ){
1824       u32 next, size;
1825       if( pc<iCellFirst ){
1826         /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
1827         ** always be at least one cell before the first freeblock.
1828         */
1829         return SQLITE_CORRUPT_BKPT;
1830       }
1831       while( 1 ){
1832         if( pc>iCellLast ){
1833           return SQLITE_CORRUPT_BKPT; /* Freeblock off the end of the page */
1834         }
1835         next = get2byte(&data[pc]);
1836         size = get2byte(&data[pc+2]);
1837         nFree = nFree + size;
1838         if( next<=pc+size+3 ) break;
1839         pc = next;
1840       }
1841       if( next>0 ){
1842         return SQLITE_CORRUPT_BKPT;  /* Freeblock not in ascending order */
1843       }
1844       if( pc+size>(unsigned int)usableSize ){
1845         return SQLITE_CORRUPT_BKPT;  /* Last freeblock extends past page end */
1846       }
1847     }
1848 
1849     /* At this point, nFree contains the sum of the offset to the start
1850     ** of the cell-content area plus the number of free bytes within
1851     ** the cell-content area. If this is greater than the usable-size
1852     ** of the page, then the page must be corrupted. This check also
1853     ** serves to verify that the offset to the start of the cell-content
1854     ** area, according to the page header, lies within the page.
1855     */
1856     if( nFree>usableSize ){
1857       return SQLITE_CORRUPT_BKPT;
1858     }
1859     pPage->nFree = (u16)(nFree - iCellFirst);
1860     pPage->isInit = 1;
1861   }
1862   return SQLITE_OK;
1863 }
1864 
1865 /*
1866 ** Set up a raw page so that it looks like a database page holding
1867 ** no entries.
1868 */
1869 static void zeroPage(MemPage *pPage, int flags){
1870   unsigned char *data = pPage->aData;
1871   BtShared *pBt = pPage->pBt;
1872   u8 hdr = pPage->hdrOffset;
1873   u16 first;
1874 
1875   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1876   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1877   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1878   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1879   assert( sqlite3_mutex_held(pBt->mutex) );
1880   if( pBt->btsFlags & BTS_SECURE_DELETE ){
1881     memset(&data[hdr], 0, pBt->usableSize - hdr);
1882   }
1883   data[hdr] = (char)flags;
1884   first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
1885   memset(&data[hdr+1], 0, 4);
1886   data[hdr+7] = 0;
1887   put2byte(&data[hdr+5], pBt->usableSize);
1888   pPage->nFree = (u16)(pBt->usableSize - first);
1889   decodeFlags(pPage, flags);
1890   pPage->cellOffset = first;
1891   pPage->aDataEnd = &data[pBt->usableSize];
1892   pPage->aCellIdx = &data[first];
1893   pPage->aDataOfst = &data[pPage->childPtrSize];
1894   pPage->nOverflow = 0;
1895   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1896   pPage->maskPage = (u16)(pBt->pageSize - 1);
1897   pPage->nCell = 0;
1898   pPage->isInit = 1;
1899 }
1900 
1901 
1902 /*
1903 ** Convert a DbPage obtained from the pager into a MemPage used by
1904 ** the btree layer.
1905 */
1906 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1907   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1908   if( pgno!=pPage->pgno ){
1909     pPage->aData = sqlite3PagerGetData(pDbPage);
1910     pPage->pDbPage = pDbPage;
1911     pPage->pBt = pBt;
1912     pPage->pgno = pgno;
1913     pPage->hdrOffset = pgno==1 ? 100 : 0;
1914   }
1915   assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
1916   return pPage;
1917 }
1918 
1919 /*
1920 ** Get a page from the pager.  Initialize the MemPage.pBt and
1921 ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
1922 **
1923 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
1924 ** about the content of the page at this time.  So do not go to the disk
1925 ** to fetch the content.  Just fill in the content with zeros for now.
1926 ** If in the future we call sqlite3PagerWrite() on this page, that
1927 ** means we have started to be concerned about content and the disk
1928 ** read should occur at that point.
1929 */
1930 static int btreeGetPage(
1931   BtShared *pBt,       /* The btree */
1932   Pgno pgno,           /* Number of the page to fetch */
1933   MemPage **ppPage,    /* Return the page in this parameter */
1934   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
1935 ){
1936   int rc;
1937   DbPage *pDbPage;
1938 
1939   assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
1940   assert( sqlite3_mutex_held(pBt->mutex) );
1941   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
1942   if( rc ) return rc;
1943   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1944   return SQLITE_OK;
1945 }
1946 
1947 /*
1948 ** Retrieve a page from the pager cache. If the requested page is not
1949 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1950 ** MemPage.aData elements if needed.
1951 */
1952 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1953   DbPage *pDbPage;
1954   assert( sqlite3_mutex_held(pBt->mutex) );
1955   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1956   if( pDbPage ){
1957     return btreePageFromDbPage(pDbPage, pgno, pBt);
1958   }
1959   return 0;
1960 }
1961 
1962 /*
1963 ** Return the size of the database file in pages. If there is any kind of
1964 ** error, return ((unsigned int)-1).
1965 */
1966 static Pgno btreePagecount(BtShared *pBt){
1967   return pBt->nPage;
1968 }
1969 u32 sqlite3BtreeLastPage(Btree *p){
1970   assert( sqlite3BtreeHoldsMutex(p) );
1971   assert( ((p->pBt->nPage)&0x8000000)==0 );
1972   return btreePagecount(p->pBt);
1973 }
1974 
1975 /*
1976 ** Get a page from the pager and initialize it.
1977 **
1978 ** If pCur!=0 then the page is being fetched as part of a moveToChild()
1979 ** call.  Do additional sanity checking on the page in this case.
1980 ** And if the fetch fails, this routine must decrement pCur->iPage.
1981 **
1982 ** The page is fetched as read-write unless pCur is not NULL and is
1983 ** a read-only cursor.
1984 **
1985 ** If an error occurs, then *ppPage is undefined. It
1986 ** may remain unchanged, or it may be set to an invalid value.
1987 */
1988 static int getAndInitPage(
1989   BtShared *pBt,                  /* The database file */
1990   Pgno pgno,                      /* Number of the page to get */
1991   MemPage **ppPage,               /* Write the page pointer here */
1992   BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
1993   int bReadOnly                   /* True for a read-only page */
1994 ){
1995   int rc;
1996   DbPage *pDbPage;
1997   assert( sqlite3_mutex_held(pBt->mutex) );
1998   assert( pCur==0 || ppPage==&pCur->apPage[pCur->iPage] );
1999   assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
2000   assert( pCur==0 || pCur->iPage>0 );
2001 
2002   if( pgno>btreePagecount(pBt) ){
2003     rc = SQLITE_CORRUPT_BKPT;
2004     goto getAndInitPage_error;
2005   }
2006   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
2007   if( rc ){
2008     goto getAndInitPage_error;
2009   }
2010   *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2011   if( (*ppPage)->isInit==0 ){
2012     btreePageFromDbPage(pDbPage, pgno, pBt);
2013     rc = btreeInitPage(*ppPage);
2014     if( rc!=SQLITE_OK ){
2015       releasePage(*ppPage);
2016       goto getAndInitPage_error;
2017     }
2018   }
2019   assert( (*ppPage)->pgno==pgno );
2020   assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
2021 
2022   /* If obtaining a child page for a cursor, we must verify that the page is
2023   ** compatible with the root page. */
2024   if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
2025     rc = SQLITE_CORRUPT_BKPT;
2026     releasePage(*ppPage);
2027     goto getAndInitPage_error;
2028   }
2029   return SQLITE_OK;
2030 
2031 getAndInitPage_error:
2032   if( pCur ) pCur->iPage--;
2033   testcase( pgno==0 );
2034   assert( pgno!=0 || rc==SQLITE_CORRUPT );
2035   return rc;
2036 }
2037 
2038 /*
2039 ** Release a MemPage.  This should be called once for each prior
2040 ** call to btreeGetPage.
2041 */
2042 static void releasePageNotNull(MemPage *pPage){
2043   assert( pPage->aData );
2044   assert( pPage->pBt );
2045   assert( pPage->pDbPage!=0 );
2046   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2047   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2048   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2049   sqlite3PagerUnrefNotNull(pPage->pDbPage);
2050 }
2051 static void releasePage(MemPage *pPage){
2052   if( pPage ) releasePageNotNull(pPage);
2053 }
2054 
2055 /*
2056 ** Get an unused page.
2057 **
2058 ** This works just like btreeGetPage() with the addition:
2059 **
2060 **   *  If the page is already in use for some other purpose, immediately
2061 **      release it and return an SQLITE_CURRUPT error.
2062 **   *  Make sure the isInit flag is clear
2063 */
2064 static int btreeGetUnusedPage(
2065   BtShared *pBt,       /* The btree */
2066   Pgno pgno,           /* Number of the page to fetch */
2067   MemPage **ppPage,    /* Return the page in this parameter */
2068   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2069 ){
2070   int rc = btreeGetPage(pBt, pgno, ppPage, flags);
2071   if( rc==SQLITE_OK ){
2072     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
2073       releasePage(*ppPage);
2074       *ppPage = 0;
2075       return SQLITE_CORRUPT_BKPT;
2076     }
2077     (*ppPage)->isInit = 0;
2078   }else{
2079     *ppPage = 0;
2080   }
2081   return rc;
2082 }
2083 
2084 
2085 /*
2086 ** During a rollback, when the pager reloads information into the cache
2087 ** so that the cache is restored to its original state at the start of
2088 ** the transaction, for each page restored this routine is called.
2089 **
2090 ** This routine needs to reset the extra data section at the end of the
2091 ** page to agree with the restored data.
2092 */
2093 static void pageReinit(DbPage *pData){
2094   MemPage *pPage;
2095   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
2096   assert( sqlite3PagerPageRefcount(pData)>0 );
2097   if( pPage->isInit ){
2098     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2099     pPage->isInit = 0;
2100     if( sqlite3PagerPageRefcount(pData)>1 ){
2101       /* pPage might not be a btree page;  it might be an overflow page
2102       ** or ptrmap page or a free page.  In those cases, the following
2103       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
2104       ** But no harm is done by this.  And it is very important that
2105       ** btreeInitPage() be called on every btree page so we make
2106       ** the call for every page that comes in for re-initing. */
2107       btreeInitPage(pPage);
2108     }
2109   }
2110 }
2111 
2112 /*
2113 ** Invoke the busy handler for a btree.
2114 */
2115 static int btreeInvokeBusyHandler(void *pArg){
2116   BtShared *pBt = (BtShared*)pArg;
2117   assert( pBt->db );
2118   assert( sqlite3_mutex_held(pBt->db->mutex) );
2119   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
2120 }
2121 
2122 /*
2123 ** Open a database file.
2124 **
2125 ** zFilename is the name of the database file.  If zFilename is NULL
2126 ** then an ephemeral database is created.  The ephemeral database might
2127 ** be exclusively in memory, or it might use a disk-based memory cache.
2128 ** Either way, the ephemeral database will be automatically deleted
2129 ** when sqlite3BtreeClose() is called.
2130 **
2131 ** If zFilename is ":memory:" then an in-memory database is created
2132 ** that is automatically destroyed when it is closed.
2133 **
2134 ** The "flags" parameter is a bitmask that might contain bits like
2135 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
2136 **
2137 ** If the database is already opened in the same database connection
2138 ** and we are in shared cache mode, then the open will fail with an
2139 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
2140 ** objects in the same database connection since doing so will lead
2141 ** to problems with locking.
2142 */
2143 int sqlite3BtreeOpen(
2144   sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
2145   const char *zFilename,  /* Name of the file containing the BTree database */
2146   sqlite3 *db,            /* Associated database handle */
2147   Btree **ppBtree,        /* Pointer to new Btree object written here */
2148   int flags,              /* Options */
2149   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
2150 ){
2151   BtShared *pBt = 0;             /* Shared part of btree structure */
2152   Btree *p;                      /* Handle to return */
2153   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
2154   int rc = SQLITE_OK;            /* Result code from this function */
2155   u8 nReserve;                   /* Byte of unused space on each page */
2156   unsigned char zDbHeader[100];  /* Database header content */
2157 
2158   /* True if opening an ephemeral, temporary database */
2159   const int isTempDb = zFilename==0 || zFilename[0]==0;
2160 
2161   /* Set the variable isMemdb to true for an in-memory database, or
2162   ** false for a file-based database.
2163   */
2164 #ifdef SQLITE_OMIT_MEMORYDB
2165   const int isMemdb = 0;
2166 #else
2167   const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
2168                        || (isTempDb && sqlite3TempInMemory(db))
2169                        || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
2170 #endif
2171 
2172   assert( db!=0 );
2173   assert( pVfs!=0 );
2174   assert( sqlite3_mutex_held(db->mutex) );
2175   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
2176 
2177   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
2178   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
2179 
2180   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
2181   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
2182 
2183   if( isMemdb ){
2184     flags |= BTREE_MEMORY;
2185   }
2186   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
2187     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
2188   }
2189   p = sqlite3MallocZero(sizeof(Btree));
2190   if( !p ){
2191     return SQLITE_NOMEM_BKPT;
2192   }
2193   p->inTrans = TRANS_NONE;
2194   p->db = db;
2195 #ifndef SQLITE_OMIT_SHARED_CACHE
2196   p->lock.pBtree = p;
2197   p->lock.iTable = 1;
2198 #endif
2199 
2200 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2201   /*
2202   ** If this Btree is a candidate for shared cache, try to find an
2203   ** existing BtShared object that we can share with
2204   */
2205   if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
2206     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
2207       int nFilename = sqlite3Strlen30(zFilename)+1;
2208       int nFullPathname = pVfs->mxPathname+1;
2209       char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
2210       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2211 
2212       p->sharable = 1;
2213       if( !zFullPathname ){
2214         sqlite3_free(p);
2215         return SQLITE_NOMEM_BKPT;
2216       }
2217       if( isMemdb ){
2218         memcpy(zFullPathname, zFilename, nFilename);
2219       }else{
2220         rc = sqlite3OsFullPathname(pVfs, zFilename,
2221                                    nFullPathname, zFullPathname);
2222         if( rc ){
2223           sqlite3_free(zFullPathname);
2224           sqlite3_free(p);
2225           return rc;
2226         }
2227       }
2228 #if SQLITE_THREADSAFE
2229       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
2230       sqlite3_mutex_enter(mutexOpen);
2231       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
2232       sqlite3_mutex_enter(mutexShared);
2233 #endif
2234       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
2235         assert( pBt->nRef>0 );
2236         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
2237                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
2238           int iDb;
2239           for(iDb=db->nDb-1; iDb>=0; iDb--){
2240             Btree *pExisting = db->aDb[iDb].pBt;
2241             if( pExisting && pExisting->pBt==pBt ){
2242               sqlite3_mutex_leave(mutexShared);
2243               sqlite3_mutex_leave(mutexOpen);
2244               sqlite3_free(zFullPathname);
2245               sqlite3_free(p);
2246               return SQLITE_CONSTRAINT;
2247             }
2248           }
2249           p->pBt = pBt;
2250           pBt->nRef++;
2251           break;
2252         }
2253       }
2254       sqlite3_mutex_leave(mutexShared);
2255       sqlite3_free(zFullPathname);
2256     }
2257 #ifdef SQLITE_DEBUG
2258     else{
2259       /* In debug mode, we mark all persistent databases as sharable
2260       ** even when they are not.  This exercises the locking code and
2261       ** gives more opportunity for asserts(sqlite3_mutex_held())
2262       ** statements to find locking problems.
2263       */
2264       p->sharable = 1;
2265     }
2266 #endif
2267   }
2268 #endif
2269   if( pBt==0 ){
2270     /*
2271     ** The following asserts make sure that structures used by the btree are
2272     ** the right size.  This is to guard against size changes that result
2273     ** when compiling on a different architecture.
2274     */
2275     assert( sizeof(i64)==8 );
2276     assert( sizeof(u64)==8 );
2277     assert( sizeof(u32)==4 );
2278     assert( sizeof(u16)==2 );
2279     assert( sizeof(Pgno)==4 );
2280 
2281     pBt = sqlite3MallocZero( sizeof(*pBt) );
2282     if( pBt==0 ){
2283       rc = SQLITE_NOMEM_BKPT;
2284       goto btree_open_out;
2285     }
2286     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
2287                           sizeof(MemPage), flags, vfsFlags, pageReinit);
2288     if( rc==SQLITE_OK ){
2289       sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
2290       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
2291     }
2292     if( rc!=SQLITE_OK ){
2293       goto btree_open_out;
2294     }
2295     pBt->openFlags = (u8)flags;
2296     pBt->db = db;
2297     sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
2298     p->pBt = pBt;
2299 
2300     pBt->pCursor = 0;
2301     pBt->pPage1 = 0;
2302     if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
2303 #ifdef SQLITE_SECURE_DELETE
2304     pBt->btsFlags |= BTS_SECURE_DELETE;
2305 #endif
2306     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2307     ** determined by the 2-byte integer located at an offset of 16 bytes from
2308     ** the beginning of the database file. */
2309     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
2310     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
2311          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
2312       pBt->pageSize = 0;
2313 #ifndef SQLITE_OMIT_AUTOVACUUM
2314       /* If the magic name ":memory:" will create an in-memory database, then
2315       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
2316       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
2317       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
2318       ** regular file-name. In this case the auto-vacuum applies as per normal.
2319       */
2320       if( zFilename && !isMemdb ){
2321         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
2322         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
2323       }
2324 #endif
2325       nReserve = 0;
2326     }else{
2327       /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2328       ** determined by the one-byte unsigned integer found at an offset of 20
2329       ** into the database file header. */
2330       nReserve = zDbHeader[20];
2331       pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2332 #ifndef SQLITE_OMIT_AUTOVACUUM
2333       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
2334       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
2335 #endif
2336     }
2337     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2338     if( rc ) goto btree_open_out;
2339     pBt->usableSize = pBt->pageSize - nReserve;
2340     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
2341 
2342 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2343     /* Add the new BtShared object to the linked list sharable BtShareds.
2344     */
2345     pBt->nRef = 1;
2346     if( p->sharable ){
2347       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2348       MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
2349       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
2350         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
2351         if( pBt->mutex==0 ){
2352           rc = SQLITE_NOMEM_BKPT;
2353           goto btree_open_out;
2354         }
2355       }
2356       sqlite3_mutex_enter(mutexShared);
2357       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
2358       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
2359       sqlite3_mutex_leave(mutexShared);
2360     }
2361 #endif
2362   }
2363 
2364 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2365   /* If the new Btree uses a sharable pBtShared, then link the new
2366   ** Btree into the list of all sharable Btrees for the same connection.
2367   ** The list is kept in ascending order by pBt address.
2368   */
2369   if( p->sharable ){
2370     int i;
2371     Btree *pSib;
2372     for(i=0; i<db->nDb; i++){
2373       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
2374         while( pSib->pPrev ){ pSib = pSib->pPrev; }
2375         if( (uptr)p->pBt<(uptr)pSib->pBt ){
2376           p->pNext = pSib;
2377           p->pPrev = 0;
2378           pSib->pPrev = p;
2379         }else{
2380           while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
2381             pSib = pSib->pNext;
2382           }
2383           p->pNext = pSib->pNext;
2384           p->pPrev = pSib;
2385           if( p->pNext ){
2386             p->pNext->pPrev = p;
2387           }
2388           pSib->pNext = p;
2389         }
2390         break;
2391       }
2392     }
2393   }
2394 #endif
2395   *ppBtree = p;
2396 
2397 btree_open_out:
2398   if( rc!=SQLITE_OK ){
2399     if( pBt && pBt->pPager ){
2400       sqlite3PagerClose(pBt->pPager, 0);
2401     }
2402     sqlite3_free(pBt);
2403     sqlite3_free(p);
2404     *ppBtree = 0;
2405   }else{
2406     sqlite3_file *pFile;
2407 
2408     /* If the B-Tree was successfully opened, set the pager-cache size to the
2409     ** default value. Except, when opening on an existing shared pager-cache,
2410     ** do not change the pager-cache size.
2411     */
2412     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
2413       sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
2414     }
2415 
2416     pFile = sqlite3PagerFile(pBt->pPager);
2417     if( pFile->pMethods ){
2418       sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
2419     }
2420   }
2421   if( mutexOpen ){
2422     assert( sqlite3_mutex_held(mutexOpen) );
2423     sqlite3_mutex_leave(mutexOpen);
2424   }
2425   assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
2426   return rc;
2427 }
2428 
2429 /*
2430 ** Decrement the BtShared.nRef counter.  When it reaches zero,
2431 ** remove the BtShared structure from the sharing list.  Return
2432 ** true if the BtShared.nRef counter reaches zero and return
2433 ** false if it is still positive.
2434 */
2435 static int removeFromSharingList(BtShared *pBt){
2436 #ifndef SQLITE_OMIT_SHARED_CACHE
2437   MUTEX_LOGIC( sqlite3_mutex *pMaster; )
2438   BtShared *pList;
2439   int removed = 0;
2440 
2441   assert( sqlite3_mutex_notheld(pBt->mutex) );
2442   MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
2443   sqlite3_mutex_enter(pMaster);
2444   pBt->nRef--;
2445   if( pBt->nRef<=0 ){
2446     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
2447       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
2448     }else{
2449       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
2450       while( ALWAYS(pList) && pList->pNext!=pBt ){
2451         pList=pList->pNext;
2452       }
2453       if( ALWAYS(pList) ){
2454         pList->pNext = pBt->pNext;
2455       }
2456     }
2457     if( SQLITE_THREADSAFE ){
2458       sqlite3_mutex_free(pBt->mutex);
2459     }
2460     removed = 1;
2461   }
2462   sqlite3_mutex_leave(pMaster);
2463   return removed;
2464 #else
2465   return 1;
2466 #endif
2467 }
2468 
2469 /*
2470 ** Make sure pBt->pTmpSpace points to an allocation of
2471 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
2472 ** pointer.
2473 */
2474 static void allocateTempSpace(BtShared *pBt){
2475   if( !pBt->pTmpSpace ){
2476     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2477 
2478     /* One of the uses of pBt->pTmpSpace is to format cells before
2479     ** inserting them into a leaf page (function fillInCell()). If
2480     ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
2481     ** by the various routines that manipulate binary cells. Which
2482     ** can mean that fillInCell() only initializes the first 2 or 3
2483     ** bytes of pTmpSpace, but that the first 4 bytes are copied from
2484     ** it into a database page. This is not actually a problem, but it
2485     ** does cause a valgrind error when the 1 or 2 bytes of unitialized
2486     ** data is passed to system call write(). So to avoid this error,
2487     ** zero the first 4 bytes of temp space here.
2488     **
2489     ** Also:  Provide four bytes of initialized space before the
2490     ** beginning of pTmpSpace as an area available to prepend the
2491     ** left-child pointer to the beginning of a cell.
2492     */
2493     if( pBt->pTmpSpace ){
2494       memset(pBt->pTmpSpace, 0, 8);
2495       pBt->pTmpSpace += 4;
2496     }
2497   }
2498 }
2499 
2500 /*
2501 ** Free the pBt->pTmpSpace allocation
2502 */
2503 static void freeTempSpace(BtShared *pBt){
2504   if( pBt->pTmpSpace ){
2505     pBt->pTmpSpace -= 4;
2506     sqlite3PageFree(pBt->pTmpSpace);
2507     pBt->pTmpSpace = 0;
2508   }
2509 }
2510 
2511 /*
2512 ** Close an open database and invalidate all cursors.
2513 */
2514 int sqlite3BtreeClose(Btree *p){
2515   BtShared *pBt = p->pBt;
2516   BtCursor *pCur;
2517 
2518   /* Close all cursors opened via this handle.  */
2519   assert( sqlite3_mutex_held(p->db->mutex) );
2520   sqlite3BtreeEnter(p);
2521   pCur = pBt->pCursor;
2522   while( pCur ){
2523     BtCursor *pTmp = pCur;
2524     pCur = pCur->pNext;
2525     if( pTmp->pBtree==p ){
2526       sqlite3BtreeCloseCursor(pTmp);
2527     }
2528   }
2529 
2530   /* Rollback any active transaction and free the handle structure.
2531   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2532   ** this handle.
2533   */
2534   sqlite3BtreeRollback(p, SQLITE_OK, 0);
2535   sqlite3BtreeLeave(p);
2536 
2537   /* If there are still other outstanding references to the shared-btree
2538   ** structure, return now. The remainder of this procedure cleans
2539   ** up the shared-btree.
2540   */
2541   assert( p->wantToLock==0 && p->locked==0 );
2542   if( !p->sharable || removeFromSharingList(pBt) ){
2543     /* The pBt is no longer on the sharing list, so we can access
2544     ** it without having to hold the mutex.
2545     **
2546     ** Clean out and delete the BtShared object.
2547     */
2548     assert( !pBt->pCursor );
2549     sqlite3PagerClose(pBt->pPager, p->db);
2550     if( pBt->xFreeSchema && pBt->pSchema ){
2551       pBt->xFreeSchema(pBt->pSchema);
2552     }
2553     sqlite3DbFree(0, pBt->pSchema);
2554     freeTempSpace(pBt);
2555     sqlite3_free(pBt);
2556   }
2557 
2558 #ifndef SQLITE_OMIT_SHARED_CACHE
2559   assert( p->wantToLock==0 );
2560   assert( p->locked==0 );
2561   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2562   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2563 #endif
2564 
2565   sqlite3_free(p);
2566   return SQLITE_OK;
2567 }
2568 
2569 /*
2570 ** Change the "soft" limit on the number of pages in the cache.
2571 ** Unused and unmodified pages will be recycled when the number of
2572 ** pages in the cache exceeds this soft limit.  But the size of the
2573 ** cache is allowed to grow larger than this limit if it contains
2574 ** dirty pages or pages still in active use.
2575 */
2576 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2577   BtShared *pBt = p->pBt;
2578   assert( sqlite3_mutex_held(p->db->mutex) );
2579   sqlite3BtreeEnter(p);
2580   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2581   sqlite3BtreeLeave(p);
2582   return SQLITE_OK;
2583 }
2584 
2585 /*
2586 ** Change the "spill" limit on the number of pages in the cache.
2587 ** If the number of pages exceeds this limit during a write transaction,
2588 ** the pager might attempt to "spill" pages to the journal early in
2589 ** order to free up memory.
2590 **
2591 ** The value returned is the current spill size.  If zero is passed
2592 ** as an argument, no changes are made to the spill size setting, so
2593 ** using mxPage of 0 is a way to query the current spill size.
2594 */
2595 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
2596   BtShared *pBt = p->pBt;
2597   int res;
2598   assert( sqlite3_mutex_held(p->db->mutex) );
2599   sqlite3BtreeEnter(p);
2600   res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
2601   sqlite3BtreeLeave(p);
2602   return res;
2603 }
2604 
#if SQLITE_MAX_MMAP_SIZE>0
/*
** Set the limit on how much of the database file may be memory mapped
** to szMmap.  The value is handed to sqlite3PagerSetMmapLimit() while
** the b-tree mutex is held.
**
** The caller must hold p->db->mutex.  Always returns SQLITE_OK.
*/
int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
  assert( sqlite3_mutex_held(p->db->mutex) );
  sqlite3BtreeEnter(p);
  sqlite3PagerSetMmapLimit(p->pBt->pPager, szMmap);
  sqlite3BtreeLeave(p);
  return SQLITE_OK;
}
#endif /* SQLITE_MAX_MMAP_SIZE>0 */
2619 
2620 /*
2621 ** Change the way data is synced to disk in order to increase or decrease
2622 ** how well the database resists damage due to OS crashes and power
2623 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2624 ** there is a high probability of damage)  Level 2 is the default.  There
2625 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2626 ** probability of damage to near zero but with a write performance reduction.
2627 */
2628 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2629 int sqlite3BtreeSetPagerFlags(
2630   Btree *p,              /* The btree to set the safety level on */
2631   unsigned pgFlags       /* Various PAGER_* flags */
2632 ){
2633   BtShared *pBt = p->pBt;
2634   assert( sqlite3_mutex_held(p->db->mutex) );
2635   sqlite3BtreeEnter(p);
2636   sqlite3PagerSetFlags(pBt->pPager, pgFlags);
2637   sqlite3BtreeLeave(p);
2638   return SQLITE_OK;
2639 }
2640 #endif
2641 
2642 /*
2643 ** Change the default pages size and the number of reserved bytes per page.
2644 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2645 ** without changing anything.
2646 **
2647 ** The page size must be a power of 2 between 512 and 65536.  If the page
2648 ** size supplied does not meet this constraint then the page size is not
2649 ** changed.
2650 **
2651 ** Page sizes are constrained to be a power of two so that the region
2652 ** of the database file used for locking (beginning at PENDING_BYTE,
2653 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2654 ** at the beginning of a page.
2655 **
2656 ** If parameter nReserve is less than zero, then the number of reserved
2657 ** bytes per page is left unchanged.
2658 **
2659 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2660 ** and autovacuum mode can no longer be changed.
2661 */
2662 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2663   int rc = SQLITE_OK;
2664   BtShared *pBt = p->pBt;
2665   assert( nReserve>=-1 && nReserve<=255 );
2666   sqlite3BtreeEnter(p);
2667 #if SQLITE_HAS_CODEC
2668   if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;
2669 #endif
2670   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2671     sqlite3BtreeLeave(p);
2672     return SQLITE_READONLY;
2673   }
2674   if( nReserve<0 ){
2675     nReserve = pBt->pageSize - pBt->usableSize;
2676   }
2677   assert( nReserve>=0 && nReserve<=255 );
2678   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2679         ((pageSize-1)&pageSize)==0 ){
2680     assert( (pageSize & 7)==0 );
2681     assert( !pBt->pCursor );
2682     pBt->pageSize = (u32)pageSize;
2683     freeTempSpace(pBt);
2684   }
2685   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2686   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2687   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2688   sqlite3BtreeLeave(p);
2689   return rc;
2690 }
2691 
2692 /*
2693 ** Return the currently defined page size
2694 */
2695 int sqlite3BtreeGetPageSize(Btree *p){
2696   return p->pBt->pageSize;
2697 }
2698 
2699 /*
2700 ** This function is similar to sqlite3BtreeGetReserve(), except that it
2701 ** may only be called if it is guaranteed that the b-tree mutex is already
2702 ** held.
2703 **
2704 ** This is useful in one special case in the backup API code where it is
2705 ** known that the shared b-tree mutex is held, but the mutex on the
2706 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
2707 ** were to be called, it might collide with some other operation on the
2708 ** database handle that owns *p, causing undefined behavior.
2709 */
2710 int sqlite3BtreeGetReserveNoMutex(Btree *p){
2711   int n;
2712   assert( sqlite3_mutex_held(p->pBt->mutex) );
2713   n = p->pBt->pageSize - p->pBt->usableSize;
2714   return n;
2715 }
2716 
2717 /*
2718 ** Return the number of bytes of space at the end of every page that
2719 ** are intentually left unused.  This is the "reserved" space that is
2720 ** sometimes used by extensions.
2721 **
2722 ** If SQLITE_HAS_MUTEX is defined then the number returned is the
2723 ** greater of the current reserved space and the maximum requested
2724 ** reserve space.
2725 */
2726 int sqlite3BtreeGetOptimalReserve(Btree *p){
2727   int n;
2728   sqlite3BtreeEnter(p);
2729   n = sqlite3BtreeGetReserveNoMutex(p);
2730 #ifdef SQLITE_HAS_CODEC
2731   if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;
2732 #endif
2733   sqlite3BtreeLeave(p);
2734   return n;
2735 }
2736 
2737 
2738 /*
2739 ** Set the maximum page count for a database if mxPage is positive.
2740 ** No changes are made if mxPage is 0 or negative.
2741 ** Regardless of the value of mxPage, return the maximum page count.
2742 */
2743 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2744   int n;
2745   sqlite3BtreeEnter(p);
2746   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2747   sqlite3BtreeLeave(p);
2748   return n;
2749 }
2750 
2751 /*
2752 ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1.  If newFlag is -1,
2753 ** then make no changes.  Always return the value of the BTS_SECURE_DELETE
2754 ** setting after the change.
2755 */
2756 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2757   int b;
2758   if( p==0 ) return 0;
2759   sqlite3BtreeEnter(p);
2760   if( newFlag>=0 ){
2761     p->pBt->btsFlags &= ~BTS_SECURE_DELETE;
2762     if( newFlag ) p->pBt->btsFlags |= BTS_SECURE_DELETE;
2763   }
2764   b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;
2765   sqlite3BtreeLeave(p);
2766   return b;
2767 }
2768 
2769 /*
2770 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2771 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2772 ** is disabled. The default value for the auto-vacuum property is
2773 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2774 */
2775 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2776 #ifdef SQLITE_OMIT_AUTOVACUUM
2777   return SQLITE_READONLY;
2778 #else
2779   BtShared *pBt = p->pBt;
2780   int rc = SQLITE_OK;
2781   u8 av = (u8)autoVacuum;
2782 
2783   sqlite3BtreeEnter(p);
2784   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
2785     rc = SQLITE_READONLY;
2786   }else{
2787     pBt->autoVacuum = av ?1:0;
2788     pBt->incrVacuum = av==2 ?1:0;
2789   }
2790   sqlite3BtreeLeave(p);
2791   return rc;
2792 #endif
2793 }
2794 
2795 /*
2796 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2797 ** enabled 1 is returned. Otherwise 0.
2798 */
2799 int sqlite3BtreeGetAutoVacuum(Btree *p){
2800 #ifdef SQLITE_OMIT_AUTOVACUUM
2801   return BTREE_AUTOVACUUM_NONE;
2802 #else
2803   int rc;
2804   sqlite3BtreeEnter(p);
2805   rc = (
2806     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2807     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2808     BTREE_AUTOVACUUM_INCR
2809   );
2810   sqlite3BtreeLeave(p);
2811   return rc;
2812 #endif
2813 }
2814 
2815 
2816 /*
2817 ** Get a reference to pPage1 of the database file.  This will
2818 ** also acquire a readlock on that file.
2819 **
2820 ** SQLITE_OK is returned on success.  If the file is not a
2821 ** well-formed database file, then SQLITE_CORRUPT is returned.
2822 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
2823 ** is returned if we run out of memory.
2824 */
2825 static int lockBtree(BtShared *pBt){
2826   int rc;              /* Result code from subfunctions */
2827   MemPage *pPage1;     /* Page 1 of the database file */
2828   int nPage;           /* Number of pages in the database */
2829   int nPageFile = 0;   /* Number of pages in the database file */
2830   int nPageHeader;     /* Number of pages in the database according to hdr */
2831 
2832   assert( sqlite3_mutex_held(pBt->mutex) );
2833   assert( pBt->pPage1==0 );
2834   rc = sqlite3PagerSharedLock(pBt->pPager);
2835   if( rc!=SQLITE_OK ) return rc;
2836   rc = btreeGetPage(pBt, 1, &pPage1, 0);
2837   if( rc!=SQLITE_OK ) return rc;
2838 
2839   /* Do some checking to help insure the file we opened really is
2840   ** a valid database file.
2841   */
2842   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
2843   sqlite3PagerPagecount(pBt->pPager, &nPageFile);
2844   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
2845     nPage = nPageFile;
2846   }
2847   if( nPage>0 ){
2848     u32 pageSize;
2849     u32 usableSize;
2850     u8 *page1 = pPage1->aData;
2851     rc = SQLITE_NOTADB;
2852     /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
2853     ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
2854     ** 61 74 20 33 00. */
2855     if( memcmp(page1, zMagicHeader, 16)!=0 ){
2856       goto page1_init_failed;
2857     }
2858 
2859 #ifdef SQLITE_OMIT_WAL
2860     if( page1[18]>1 ){
2861       pBt->btsFlags |= BTS_READ_ONLY;
2862     }
2863     if( page1[19]>1 ){
2864       goto page1_init_failed;
2865     }
2866 #else
2867     if( page1[18]>2 ){
2868       pBt->btsFlags |= BTS_READ_ONLY;
2869     }
2870     if( page1[19]>2 ){
2871       goto page1_init_failed;
2872     }
2873 
2874     /* If the write version is set to 2, this database should be accessed
2875     ** in WAL mode. If the log is not already open, open it now. Then
2876     ** return SQLITE_OK and return without populating BtShared.pPage1.
2877     ** The caller detects this and calls this function again. This is
2878     ** required as the version of page 1 currently in the page1 buffer
2879     ** may not be the latest version - there may be a newer one in the log
2880     ** file.
2881     */
2882     if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
2883       int isOpen = 0;
2884       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
2885       if( rc!=SQLITE_OK ){
2886         goto page1_init_failed;
2887       }else{
2888 #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS
2889         sqlite3 *db;
2890         Db *pDb;
2891         if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){
2892           while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; }
2893           if( pDb->bSyncSet==0
2894            && pDb->safety_level==SQLITE_DEFAULT_SYNCHRONOUS+1
2895           ){
2896             pDb->safety_level = SQLITE_DEFAULT_WAL_SYNCHRONOUS+1;
2897             sqlite3PagerSetFlags(pBt->pPager,
2898                pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
2899           }
2900         }
2901 #endif
2902         if( isOpen==0 ){
2903           releasePage(pPage1);
2904           return SQLITE_OK;
2905         }
2906       }
2907       rc = SQLITE_NOTADB;
2908     }
2909 #endif
2910 
2911     /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
2912     ** fractions and the leaf payload fraction values must be 64, 32, and 32.
2913     **
2914     ** The original design allowed these amounts to vary, but as of
2915     ** version 3.6.0, we require them to be fixed.
2916     */
2917     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2918       goto page1_init_failed;
2919     }
2920     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2921     ** determined by the 2-byte integer located at an offset of 16 bytes from
2922     ** the beginning of the database file. */
2923     pageSize = (page1[16]<<8) | (page1[17]<<16);
2924     /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
2925     ** between 512 and 65536 inclusive. */
2926     if( ((pageSize-1)&pageSize)!=0
2927      || pageSize>SQLITE_MAX_PAGE_SIZE
2928      || pageSize<=256
2929     ){
2930       goto page1_init_failed;
2931     }
2932     assert( (pageSize & 7)==0 );
2933     /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
2934     ** integer at offset 20 is the number of bytes of space at the end of
2935     ** each page to reserve for extensions.
2936     **
2937     ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2938     ** determined by the one-byte unsigned integer found at an offset of 20
2939     ** into the database file header. */
2940     usableSize = pageSize - page1[20];
2941     if( (u32)pageSize!=pBt->pageSize ){
2942       /* After reading the first page of the database assuming a page size
2943       ** of BtShared.pageSize, we have discovered that the page-size is
2944       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2945       ** zero and return SQLITE_OK. The caller will call this function
2946       ** again with the correct page-size.
2947       */
2948       releasePage(pPage1);
2949       pBt->usableSize = usableSize;
2950       pBt->pageSize = pageSize;
2951       freeTempSpace(pBt);
2952       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2953                                    pageSize-usableSize);
2954       return rc;
2955     }
2956     if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
2957       rc = SQLITE_CORRUPT_BKPT;
2958       goto page1_init_failed;
2959     }
2960     /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
2961     ** be less than 480. In other words, if the page size is 512, then the
2962     ** reserved space size cannot exceed 32. */
2963     if( usableSize<480 ){
2964       goto page1_init_failed;
2965     }
2966     pBt->pageSize = pageSize;
2967     pBt->usableSize = usableSize;
2968 #ifndef SQLITE_OMIT_AUTOVACUUM
2969     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2970     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2971 #endif
2972   }
2973 
2974   /* maxLocal is the maximum amount of payload to store locally for
2975   ** a cell.  Make sure it is small enough so that at least minFanout
2976   ** cells can will fit on one page.  We assume a 10-byte page header.
2977   ** Besides the payload, the cell must store:
2978   **     2-byte pointer to the cell
2979   **     4-byte child pointer
2980   **     9-byte nKey value
2981   **     4-byte nData value
2982   **     4-byte overflow page pointer
2983   ** So a cell consists of a 2-byte pointer, a header which is as much as
2984   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2985   ** page pointer.
2986   */
2987   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
2988   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
2989   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
2990   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
2991   if( pBt->maxLocal>127 ){
2992     pBt->max1bytePayload = 127;
2993   }else{
2994     pBt->max1bytePayload = (u8)pBt->maxLocal;
2995   }
2996   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2997   pBt->pPage1 = pPage1;
2998   pBt->nPage = nPage;
2999   return SQLITE_OK;
3000 
3001 page1_init_failed:
3002   releasePage(pPage1);
3003   pBt->pPage1 = 0;
3004   return rc;
3005 }
3006 
3007 #ifndef NDEBUG
3008 /*
3009 ** Return the number of cursors open on pBt. This is for use
3010 ** in assert() expressions, so it is only compiled if NDEBUG is not
3011 ** defined.
3012 **
3013 ** Only write cursors are counted if wrOnly is true.  If wrOnly is
3014 ** false then all cursors are counted.
3015 **
3016 ** For the purposes of this routine, a cursor is any cursor that
3017 ** is capable of reading or writing to the database.  Cursors that
3018 ** have been tripped into the CURSOR_FAULT state are not counted.
3019 */
3020 static int countValidCursors(BtShared *pBt, int wrOnly){
3021   BtCursor *pCur;
3022   int r = 0;
3023   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3024     if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
3025      && pCur->eState!=CURSOR_FAULT ) r++;
3026   }
3027   return r;
3028 }
3029 #endif
3030 
3031 /*
3032 ** If there are no outstanding cursors and we are not in the middle
3033 ** of a transaction but there is a read lock on the database, then
3034 ** this routine unrefs the first page of the database file which
3035 ** has the effect of releasing the read lock.
3036 **
3037 ** If there is a transaction in progress, this routine is a no-op.
3038 */
3039 static void unlockBtreeIfUnused(BtShared *pBt){
3040   assert( sqlite3_mutex_held(pBt->mutex) );
3041   assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
3042   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
3043     MemPage *pPage1 = pBt->pPage1;
3044     assert( pPage1->aData );
3045     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
3046     pBt->pPage1 = 0;
3047     releasePageNotNull(pPage1);
3048   }
3049 }
3050 
3051 /*
3052 ** If pBt points to an empty file then convert that empty file
3053 ** into a new empty database by initializing the first page of
3054 ** the database.
3055 */
3056 static int newDatabase(BtShared *pBt){
3057   MemPage *pP1;
3058   unsigned char *data;
3059   int rc;
3060 
3061   assert( sqlite3_mutex_held(pBt->mutex) );
3062   if( pBt->nPage>0 ){
3063     return SQLITE_OK;
3064   }
3065   pP1 = pBt->pPage1;
3066   assert( pP1!=0 );
3067   data = pP1->aData;
3068   rc = sqlite3PagerWrite(pP1->pDbPage);
3069   if( rc ) return rc;
3070   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
3071   assert( sizeof(zMagicHeader)==16 );
3072   data[16] = (u8)((pBt->pageSize>>8)&0xff);
3073   data[17] = (u8)((pBt->pageSize>>16)&0xff);
3074   data[18] = 1;
3075   data[19] = 1;
3076   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
3077   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
3078   data[21] = 64;
3079   data[22] = 32;
3080   data[23] = 32;
3081   memset(&data[24], 0, 100-24);
3082   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
3083   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3084 #ifndef SQLITE_OMIT_AUTOVACUUM
3085   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
3086   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
3087   put4byte(&data[36 + 4*4], pBt->autoVacuum);
3088   put4byte(&data[36 + 7*4], pBt->incrVacuum);
3089 #endif
3090   pBt->nPage = 1;
3091   data[31] = 1;
3092   return SQLITE_OK;
3093 }
3094 
3095 /*
3096 ** Initialize the first page of the database file (creating a database
3097 ** consisting of a single page and no schema objects). Return SQLITE_OK
3098 ** if successful, or an SQLite error code otherwise.
3099 */
3100 int sqlite3BtreeNewDb(Btree *p){
3101   int rc;
3102   sqlite3BtreeEnter(p);
3103   p->pBt->nPage = 0;
3104   rc = newDatabase(p->pBt);
3105   sqlite3BtreeLeave(p);
3106   return rc;
3107 }
3108 
3109 /*
3110 ** Attempt to start a new transaction. A write-transaction
3111 ** is started if the second argument is nonzero, otherwise a read-
3112 ** transaction.  If the second argument is 2 or more and exclusive
3113 ** transaction is started, meaning that no other process is allowed
3114 ** to access the database.  A preexisting transaction may not be
3115 ** upgraded to exclusive by calling this routine a second time - the
3116 ** exclusivity flag only works for a new transaction.
3117 **
3118 ** A write-transaction must be started before attempting any
3119 ** changes to the database.  None of the following routines
3120 ** will work unless a transaction is started first:
3121 **
3122 **      sqlite3BtreeCreateTable()
3123 **      sqlite3BtreeCreateIndex()
3124 **      sqlite3BtreeClearTable()
3125 **      sqlite3BtreeDropTable()
3126 **      sqlite3BtreeInsert()
3127 **      sqlite3BtreeDelete()
3128 **      sqlite3BtreeUpdateMeta()
3129 **
3130 ** If an initial attempt to acquire the lock fails because of lock contention
3131 ** and the database was previously unlocked, then invoke the busy handler
3132 ** if there is one.  But if there was previously a read-lock, do not
3133 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
3134 ** returned when there is already a read-lock in order to avoid a deadlock.
3135 **
3136 ** Suppose there are two processes A and B.  A has a read lock and B has
3137 ** a reserved lock.  B tries to promote to exclusive but is blocked because
3138 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
3139 ** One or the other of the two processes must give way or there can be
3140 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
3141 ** when A already has a read lock, we encourage A to give up and let B
3142 ** proceed.
3143 */
3144 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
3145   BtShared *pBt = p->pBt;
3146   int rc = SQLITE_OK;
3147 
3148   sqlite3BtreeEnter(p);
3149   btreeIntegrity(p);
3150 
3151   /* If the btree is already in a write-transaction, or it
3152   ** is already in a read-transaction and a read-transaction
3153   ** is requested, this is a no-op.
3154   */
3155   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
3156     goto trans_begun;
3157   }
3158   assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );
3159 
3160   /* Write transactions are not possible on a read-only database */
3161   if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
3162     rc = SQLITE_READONLY;
3163     goto trans_begun;
3164   }
3165 
3166 #ifndef SQLITE_OMIT_SHARED_CACHE
3167   {
3168     sqlite3 *pBlock = 0;
3169     /* If another database handle has already opened a write transaction
3170     ** on this shared-btree structure and a second write transaction is
3171     ** requested, return SQLITE_LOCKED.
3172     */
3173     if( (wrflag && pBt->inTransaction==TRANS_WRITE)
3174      || (pBt->btsFlags & BTS_PENDING)!=0
3175     ){
3176       pBlock = pBt->pWriter->db;
3177     }else if( wrflag>1 ){
3178       BtLock *pIter;
3179       for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
3180         if( pIter->pBtree!=p ){
3181           pBlock = pIter->pBtree->db;
3182           break;
3183         }
3184       }
3185     }
3186     if( pBlock ){
3187       sqlite3ConnectionBlocked(p->db, pBlock);
3188       rc = SQLITE_LOCKED_SHAREDCACHE;
3189       goto trans_begun;
3190     }
3191   }
3192 #endif
3193 
3194   /* Any read-only or read-write transaction implies a read-lock on
3195   ** page 1. So if some other shared-cache client already has a write-lock
3196   ** on page 1, the transaction cannot be opened. */
3197   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
3198   if( SQLITE_OK!=rc ) goto trans_begun;
3199 
3200   pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
3201   if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
3202   do {
3203     /* Call lockBtree() until either pBt->pPage1 is populated or
3204     ** lockBtree() returns something other than SQLITE_OK. lockBtree()
3205     ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
3206     ** reading page 1 it discovers that the page-size of the database
3207     ** file is not pBt->pageSize. In this case lockBtree() will update
3208     ** pBt->pageSize to the page-size of the file on disk.
3209     */
3210     while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
3211 
3212     if( rc==SQLITE_OK && wrflag ){
3213       if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
3214         rc = SQLITE_READONLY;
3215       }else{
3216         rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
3217         if( rc==SQLITE_OK ){
3218           rc = newDatabase(pBt);
3219         }
3220       }
3221     }
3222 
3223     if( rc!=SQLITE_OK ){
3224       unlockBtreeIfUnused(pBt);
3225     }
3226   }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
3227           btreeInvokeBusyHandler(pBt) );
3228 
3229   if( rc==SQLITE_OK ){
3230     if( p->inTrans==TRANS_NONE ){
3231       pBt->nTransaction++;
3232 #ifndef SQLITE_OMIT_SHARED_CACHE
3233       if( p->sharable ){
3234         assert( p->lock.pBtree==p && p->lock.iTable==1 );
3235         p->lock.eLock = READ_LOCK;
3236         p->lock.pNext = pBt->pLock;
3237         pBt->pLock = &p->lock;
3238       }
3239 #endif
3240     }
3241     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
3242     if( p->inTrans>pBt->inTransaction ){
3243       pBt->inTransaction = p->inTrans;
3244     }
3245     if( wrflag ){
3246       MemPage *pPage1 = pBt->pPage1;
3247 #ifndef SQLITE_OMIT_SHARED_CACHE
3248       assert( !pBt->pWriter );
3249       pBt->pWriter = p;
3250       pBt->btsFlags &= ~BTS_EXCLUSIVE;
3251       if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
3252 #endif
3253 
3254       /* If the db-size header field is incorrect (as it may be if an old
3255       ** client has been writing the database file), update it now. Doing
3256       ** this sooner rather than later means the database size can safely
3257       ** re-read the database size from page 1 if a savepoint or transaction
3258       ** rollback occurs within the transaction.
3259       */
3260       if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
3261         rc = sqlite3PagerWrite(pPage1->pDbPage);
3262         if( rc==SQLITE_OK ){
3263           put4byte(&pPage1->aData[28], pBt->nPage);
3264         }
3265       }
3266     }
3267   }
3268 
3269 
3270 trans_begun:
3271   if( rc==SQLITE_OK && wrflag ){
3272     /* This call makes sure that the pager has the correct number of
3273     ** open savepoints. If the second parameter is greater than 0 and
3274     ** the sub-journal is not already open, then it will be opened here.
3275     */
3276     rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
3277   }
3278 
3279   btreeIntegrity(p);
3280   sqlite3BtreeLeave(p);
3281   return rc;
3282 }
3283 
3284 #ifndef SQLITE_OMIT_AUTOVACUUM
3285 
3286 /*
3287 ** Set the pointer-map entries for all children of page pPage. Also, if
3288 ** pPage contains cells that point to overflow pages, set the pointer
3289 ** map entries for the overflow pages as well.
3290 */
3291 static int setChildPtrmaps(MemPage *pPage){
3292   int i;                             /* Counter variable */
3293   int nCell;                         /* Number of cells in page pPage */
3294   int rc;                            /* Return code */
3295   BtShared *pBt = pPage->pBt;
3296   Pgno pgno = pPage->pgno;
3297 
3298   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3299   rc = btreeInitPage(pPage);
3300   if( rc!=SQLITE_OK ) return rc;
3301   nCell = pPage->nCell;
3302 
3303   for(i=0; i<nCell; i++){
3304     u8 *pCell = findCell(pPage, i);
3305 
3306     ptrmapPutOvflPtr(pPage, pCell, &rc);
3307 
3308     if( !pPage->leaf ){
3309       Pgno childPgno = get4byte(pCell);
3310       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3311     }
3312   }
3313 
3314   if( !pPage->leaf ){
3315     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3316     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3317   }
3318 
3319   return rc;
3320 }
3321 
3322 /*
3323 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
3324 ** that it points to iTo. Parameter eType describes the type of pointer to
3325 ** be modified, as  follows:
3326 **
3327 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
3328 **                   page of pPage.
3329 **
3330 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
3331 **                   page pointed to by one of the cells on pPage.
3332 **
3333 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
3334 **                   overflow page in the list.
3335 */
3336 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
3337   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3338   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
3339   if( eType==PTRMAP_OVERFLOW2 ){
3340     /* The pointer is always the first 4 bytes of the page in this case.  */
3341     if( get4byte(pPage->aData)!=iFrom ){
3342       return SQLITE_CORRUPT_BKPT;
3343     }
3344     put4byte(pPage->aData, iTo);
3345   }else{
3346     int i;
3347     int nCell;
3348     int rc;
3349 
3350     rc = btreeInitPage(pPage);
3351     if( rc ) return rc;
3352     nCell = pPage->nCell;
3353 
3354     for(i=0; i<nCell; i++){
3355       u8 *pCell = findCell(pPage, i);
3356       if( eType==PTRMAP_OVERFLOW1 ){
3357         CellInfo info;
3358         pPage->xParseCell(pPage, pCell, &info);
3359         if( info.nLocal<info.nPayload ){
3360           if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
3361             return SQLITE_CORRUPT_BKPT;
3362           }
3363           if( iFrom==get4byte(pCell+info.nSize-4) ){
3364             put4byte(pCell+info.nSize-4, iTo);
3365             break;
3366           }
3367         }
3368       }else{
3369         if( get4byte(pCell)==iFrom ){
3370           put4byte(pCell, iTo);
3371           break;
3372         }
3373       }
3374     }
3375 
3376     if( i==nCell ){
3377       if( eType!=PTRMAP_BTREE ||
3378           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
3379         return SQLITE_CORRUPT_BKPT;
3380       }
3381       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
3382     }
3383   }
3384   return SQLITE_OK;
3385 }
3386 
3387 
3388 /*
3389 ** Move the open database page pDbPage to location iFreePage in the
3390 ** database. The pDbPage reference remains valid.
3391 **
3392 ** The isCommit flag indicates that there is no need to remember that
3393 ** the journal needs to be sync()ed before database page pDbPage->pgno
3394 ** can be written to. The caller has already promised not to write to that
3395 ** page.
3396 */
3397 static int relocatePage(
3398   BtShared *pBt,           /* Btree */
3399   MemPage *pDbPage,        /* Open page to move */
3400   u8 eType,                /* Pointer map 'type' entry for pDbPage */
3401   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
3402   Pgno iFreePage,          /* The location to move pDbPage to */
3403   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
3404 ){
3405   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
3406   Pgno iDbPage = pDbPage->pgno;
3407   Pager *pPager = pBt->pPager;
3408   int rc;
3409 
3410   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
3411       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
3412   assert( sqlite3_mutex_held(pBt->mutex) );
3413   assert( pDbPage->pBt==pBt );
3414 
3415   /* Move page iDbPage from its current location to page number iFreePage */
3416   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
3417       iDbPage, iFreePage, iPtrPage, eType));
3418   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
3419   if( rc!=SQLITE_OK ){
3420     return rc;
3421   }
3422   pDbPage->pgno = iFreePage;
3423 
3424   /* If pDbPage was a btree-page, then it may have child pages and/or cells
3425   ** that point to overflow pages. The pointer map entries for all these
3426   ** pages need to be changed.
3427   **
3428   ** If pDbPage is an overflow page, then the first 4 bytes may store a
3429   ** pointer to a subsequent overflow page. If this is the case, then
3430   ** the pointer map needs to be updated for the subsequent overflow page.
3431   */
3432   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
3433     rc = setChildPtrmaps(pDbPage);
3434     if( rc!=SQLITE_OK ){
3435       return rc;
3436     }
3437   }else{
3438     Pgno nextOvfl = get4byte(pDbPage->aData);
3439     if( nextOvfl!=0 ){
3440       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
3441       if( rc!=SQLITE_OK ){
3442         return rc;
3443       }
3444     }
3445   }
3446 
3447   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
3448   ** that it points at iFreePage. Also fix the pointer map entry for
3449   ** iPtrPage.
3450   */
3451   if( eType!=PTRMAP_ROOTPAGE ){
3452     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
3453     if( rc!=SQLITE_OK ){
3454       return rc;
3455     }
3456     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
3457     if( rc!=SQLITE_OK ){
3458       releasePage(pPtrPage);
3459       return rc;
3460     }
3461     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
3462     releasePage(pPtrPage);
3463     if( rc==SQLITE_OK ){
3464       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
3465     }
3466   }
3467   return rc;
3468 }
3469 
3470 /* Forward declaration required by incrVacuumStep(). */
3471 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
3472 
3473 /*
3474 ** Perform a single step of an incremental-vacuum. If successful, return
3475 ** SQLITE_OK. If there is no work to do (and therefore no point in
3476 ** calling this function again), return SQLITE_DONE. Or, if an error
3477 ** occurs, return some other error code.
3478 **
3479 ** More specifically, this function attempts to re-organize the database so
3480 ** that the last page of the file currently in use is no longer in use.
3481 **
3482 ** Parameter nFin is the number of pages that this database would contain
3483 ** were this function called until it returns SQLITE_DONE.
3484 **
3485 ** If the bCommit parameter is non-zero, this function assumes that the
3486 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
3487 ** or an error. bCommit is passed true for an auto-vacuum-on-commit
3488 ** operation, or false for an incremental vacuum.
3489 */
3490 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
3491   Pgno nFreeList;           /* Number of pages still on the free-list */
3492   int rc;
3493 
3494   assert( sqlite3_mutex_held(pBt->mutex) );
3495   assert( iLastPg>nFin );
3496 
3497   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
3498     u8 eType;
3499     Pgno iPtrPage;
3500 
3501     nFreeList = get4byte(&pBt->pPage1->aData[36]);
3502     if( nFreeList==0 ){
3503       return SQLITE_DONE;
3504     }
3505 
3506     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
3507     if( rc!=SQLITE_OK ){
3508       return rc;
3509     }
3510     if( eType==PTRMAP_ROOTPAGE ){
3511       return SQLITE_CORRUPT_BKPT;
3512     }
3513 
3514     if( eType==PTRMAP_FREEPAGE ){
3515       if( bCommit==0 ){
3516         /* Remove the page from the files free-list. This is not required
3517         ** if bCommit is non-zero. In that case, the free-list will be
3518         ** truncated to zero after this function returns, so it doesn't
3519         ** matter if it still contains some garbage entries.
3520         */
3521         Pgno iFreePg;
3522         MemPage *pFreePg;
3523         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
3524         if( rc!=SQLITE_OK ){
3525           return rc;
3526         }
3527         assert( iFreePg==iLastPg );
3528         releasePage(pFreePg);
3529       }
3530     } else {
3531       Pgno iFreePg;             /* Index of free page to move pLastPg to */
3532       MemPage *pLastPg;
3533       u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
3534       Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
3535 
3536       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
3537       if( rc!=SQLITE_OK ){
3538         return rc;
3539       }
3540 
3541       /* If bCommit is zero, this loop runs exactly once and page pLastPg
3542       ** is swapped with the first free page pulled off the free list.
3543       **
3544       ** On the other hand, if bCommit is greater than zero, then keep
3545       ** looping until a free-page located within the first nFin pages
3546       ** of the file is found.
3547       */
3548       if( bCommit==0 ){
3549         eMode = BTALLOC_LE;
3550         iNear = nFin;
3551       }
3552       do {
3553         MemPage *pFreePg;
3554         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
3555         if( rc!=SQLITE_OK ){
3556           releasePage(pLastPg);
3557           return rc;
3558         }
3559         releasePage(pFreePg);
3560       }while( bCommit && iFreePg>nFin );
3561       assert( iFreePg<iLastPg );
3562 
3563       rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
3564       releasePage(pLastPg);
3565       if( rc!=SQLITE_OK ){
3566         return rc;
3567       }
3568     }
3569   }
3570 
3571   if( bCommit==0 ){
3572     do {
3573       iLastPg--;
3574     }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
3575     pBt->bDoTruncate = 1;
3576     pBt->nPage = iLastPg;
3577   }
3578   return SQLITE_OK;
3579 }
3580 
3581 /*
3582 ** The database opened by the first argument is an auto-vacuum database
3583 ** nOrig pages in size containing nFree free pages. Return the expected
3584 ** size of the database in pages following an auto-vacuum operation.
3585 */
3586 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
3587   int nEntry;                     /* Number of entries on one ptrmap page */
3588   Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
3589   Pgno nFin;                      /* Return value */
3590 
3591   nEntry = pBt->usableSize/5;
3592   nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3593   nFin = nOrig - nFree - nPtrmap;
3594   if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3595     nFin--;
3596   }
3597   while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3598     nFin--;
3599   }
3600 
3601   return nFin;
3602 }
3603 
3604 /*
3605 ** A write-transaction must be opened before calling this function.
3606 ** It performs a single unit of work towards an incremental vacuum.
3607 **
3608 ** If the incremental vacuum is finished after this function has run,
3609 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3610 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3611 */
3612 int sqlite3BtreeIncrVacuum(Btree *p){
3613   int rc;
3614   BtShared *pBt = p->pBt;
3615 
3616   sqlite3BtreeEnter(p);
3617   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3618   if( !pBt->autoVacuum ){
3619     rc = SQLITE_DONE;
3620   }else{
3621     Pgno nOrig = btreePagecount(pBt);
3622     Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
3623     Pgno nFin = finalDbSize(pBt, nOrig, nFree);
3624 
3625     if( nOrig<nFin ){
3626       rc = SQLITE_CORRUPT_BKPT;
3627     }else if( nFree>0 ){
3628       rc = saveAllCursors(pBt, 0, 0);
3629       if( rc==SQLITE_OK ){
3630         invalidateAllOverflowCache(pBt);
3631         rc = incrVacuumStep(pBt, nFin, nOrig, 0);
3632       }
3633       if( rc==SQLITE_OK ){
3634         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3635         put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3636       }
3637     }else{
3638       rc = SQLITE_DONE;
3639     }
3640   }
3641   sqlite3BtreeLeave(p);
3642   return rc;
3643 }
3644 
3645 /*
3646 ** This routine is called prior to sqlite3PagerCommit when a transaction
3647 ** is committed for an auto-vacuum database.
3648 **
3649 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3650 ** the database file should be truncated to during the commit process.
3651 ** i.e. the database has been reorganized so that only the first *pnTrunc
3652 ** pages are in use.
3653 */
3654 static int autoVacuumCommit(BtShared *pBt){
3655   int rc = SQLITE_OK;
3656   Pager *pPager = pBt->pPager;
3657   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
3658 
3659   assert( sqlite3_mutex_held(pBt->mutex) );
3660   invalidateAllOverflowCache(pBt);
3661   assert(pBt->autoVacuum);
3662   if( !pBt->incrVacuum ){
3663     Pgno nFin;         /* Number of pages in database after autovacuuming */
3664     Pgno nFree;        /* Number of pages on the freelist initially */
3665     Pgno iFree;        /* The next page to be freed */
3666     Pgno nOrig;        /* Database size before freeing */
3667 
3668     nOrig = btreePagecount(pBt);
3669     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3670       /* It is not possible to create a database for which the final page
3671       ** is either a pointer-map page or the pending-byte page. If one
3672       ** is encountered, this indicates corruption.
3673       */
3674       return SQLITE_CORRUPT_BKPT;
3675     }
3676 
3677     nFree = get4byte(&pBt->pPage1->aData[36]);
3678     nFin = finalDbSize(pBt, nOrig, nFree);
3679     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3680     if( nFin<nOrig ){
3681       rc = saveAllCursors(pBt, 0, 0);
3682     }
3683     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3684       rc = incrVacuumStep(pBt, nFin, iFree, 1);
3685     }
3686     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3687       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3688       put4byte(&pBt->pPage1->aData[32], 0);
3689       put4byte(&pBt->pPage1->aData[36], 0);
3690       put4byte(&pBt->pPage1->aData[28], nFin);
3691       pBt->bDoTruncate = 1;
3692       pBt->nPage = nFin;
3693     }
3694     if( rc!=SQLITE_OK ){
3695       sqlite3PagerRollback(pPager);
3696     }
3697   }
3698 
3699   assert( nRef>=sqlite3PagerRefcount(pPager) );
3700   return rc;
3701 }
3702 
3703 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3704 # define setChildPtrmaps(x) SQLITE_OK
3705 #endif
3706 
3707 /*
3708 ** This routine does the first phase of a two-phase commit.  This routine
3709 ** causes a rollback journal to be created (if it does not already exist)
3710 ** and populated with enough information so that if a power loss occurs
3711 ** the database can be restored to its original state by playing back
3712 ** the journal.  Then the contents of the journal are flushed out to
3713 ** the disk.  After the journal is safely on oxide, the changes to the
3714 ** database are written into the database file and flushed to oxide.
3715 ** At the end of this call, the rollback journal still exists on the
3716 ** disk and we are still holding all locks, so the transaction has not
3717 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3718 ** commit process.
3719 **
3720 ** This call is a no-op if no write-transaction is currently active on pBt.
3721 **
3722 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3723 ** the name of a master journal file that should be written into the
3724 ** individual journal file, or is NULL, indicating no master journal file
3725 ** (single database transaction).
3726 **
3727 ** When this is called, the master journal should already have been
3728 ** created, populated with this journal pointer and synced to disk.
3729 **
3730 ** Once this routine has returned, the only thing required to commit
3731 ** the write-transaction for this database file is to delete the journal.
3732 */
3733 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3734   int rc = SQLITE_OK;
3735   if( p->inTrans==TRANS_WRITE ){
3736     BtShared *pBt = p->pBt;
3737     sqlite3BtreeEnter(p);
3738 #ifndef SQLITE_OMIT_AUTOVACUUM
3739     if( pBt->autoVacuum ){
3740       rc = autoVacuumCommit(pBt);
3741       if( rc!=SQLITE_OK ){
3742         sqlite3BtreeLeave(p);
3743         return rc;
3744       }
3745     }
3746     if( pBt->bDoTruncate ){
3747       sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
3748     }
3749 #endif
3750     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3751     sqlite3BtreeLeave(p);
3752   }
3753   return rc;
3754 }
3755 
3756 /*
3757 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3758 ** at the conclusion of a transaction.
3759 */
3760 static void btreeEndTransaction(Btree *p){
3761   BtShared *pBt = p->pBt;
3762   sqlite3 *db = p->db;
3763   assert( sqlite3BtreeHoldsMutex(p) );
3764 
3765 #ifndef SQLITE_OMIT_AUTOVACUUM
3766   pBt->bDoTruncate = 0;
3767 #endif
3768   if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
3769     /* If there are other active statements that belong to this database
3770     ** handle, downgrade to a read-only transaction. The other statements
3771     ** may still be reading from the database.  */
3772     downgradeAllSharedCacheTableLocks(p);
3773     p->inTrans = TRANS_READ;
3774   }else{
3775     /* If the handle had any kind of transaction open, decrement the
3776     ** transaction count of the shared btree. If the transaction count
3777     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
3778     ** call below will unlock the pager.  */
3779     if( p->inTrans!=TRANS_NONE ){
3780       clearAllSharedCacheTableLocks(p);
3781       pBt->nTransaction--;
3782       if( 0==pBt->nTransaction ){
3783         pBt->inTransaction = TRANS_NONE;
3784       }
3785     }
3786 
3787     /* Set the current transaction state to TRANS_NONE and unlock the
3788     ** pager if this call closed the only read or write transaction.  */
3789     p->inTrans = TRANS_NONE;
3790     unlockBtreeIfUnused(pBt);
3791   }
3792 
3793   btreeIntegrity(p);
3794 }
3795 
3796 /*
3797 ** Commit the transaction currently in progress.
3798 **
3799 ** This routine implements the second phase of a 2-phase commit.  The
3800 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3801 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
3802 ** routine did all the work of writing information out to disk and flushing the
3803 ** contents so that they are written onto the disk platter.  All this
3804 ** routine has to do is delete or truncate or zero the header in the
3805 ** rollback journal (which causes the transaction to commit) and
3806 ** drop locks.
3807 **
3808 ** Normally, if an error occurs while the pager layer is attempting to
3809 ** finalize the underlying journal file, this function returns an error and
3810 ** the upper layer will attempt a rollback. However, if the second argument
3811 ** is non-zero then this b-tree transaction is part of a multi-file
3812 ** transaction. In this case, the transaction has already been committed
3813 ** (by deleting a master journal file) and the caller will ignore this
3814 ** function's return code. So, even if an error occurs in the pager layer,
3815 ** reset the b-tree object's internal state to indicate that the write
3816 ** transaction has been closed. This is quite safe, as the pager will have
3817 ** transitioned to the error state.
3818 **
3819 ** This will release the write lock on the database file.  If there
3820 ** are no active cursors, it also releases the read lock.
3821 */
3822 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
3823 
3824   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
3825   sqlite3BtreeEnter(p);
3826   btreeIntegrity(p);
3827 
3828   /* If the handle has a write-transaction open, commit the shared-btrees
3829   ** transaction and set the shared state to TRANS_READ.
3830   */
3831   if( p->inTrans==TRANS_WRITE ){
3832     int rc;
3833     BtShared *pBt = p->pBt;
3834     assert( pBt->inTransaction==TRANS_WRITE );
3835     assert( pBt->nTransaction>0 );
3836     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3837     if( rc!=SQLITE_OK && bCleanup==0 ){
3838       sqlite3BtreeLeave(p);
3839       return rc;
3840     }
3841     p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
3842     pBt->inTransaction = TRANS_READ;
3843     btreeClearHasContent(pBt);
3844   }
3845 
3846   btreeEndTransaction(p);
3847   sqlite3BtreeLeave(p);
3848   return SQLITE_OK;
3849 }
3850 
3851 /*
3852 ** Do both phases of a commit.
3853 */
3854 int sqlite3BtreeCommit(Btree *p){
3855   int rc;
3856   sqlite3BtreeEnter(p);
3857   rc = sqlite3BtreeCommitPhaseOne(p, 0);
3858   if( rc==SQLITE_OK ){
3859     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
3860   }
3861   sqlite3BtreeLeave(p);
3862   return rc;
3863 }
3864 
3865 /*
3866 ** This routine sets the state to CURSOR_FAULT and the error
3867 ** code to errCode for every cursor on any BtShared that pBtree
3868 ** references.  Or if the writeOnly flag is set to 1, then only
3869 ** trip write cursors and leave read cursors unchanged.
3870 **
3871 ** Every cursor is a candidate to be tripped, including cursors
3872 ** that belong to other database connections that happen to be
3873 ** sharing the cache with pBtree.
3874 **
3875 ** This routine gets called when a rollback occurs. If the writeOnly
3876 ** flag is true, then only write-cursors need be tripped - read-only
3877 ** cursors save their current positions so that they may continue
3878 ** following the rollback. Or, if writeOnly is false, all cursors are
3879 ** tripped. In general, writeOnly is false if the transaction being
3880 ** rolled back modified the database schema. In this case b-tree root
3881 ** pages may be moved or deleted from the database altogether, making
3882 ** it unsafe for read cursors to continue.
3883 **
3884 ** If the writeOnly flag is true and an error is encountered while
3885 ** saving the current position of a read-only cursor, all cursors,
3886 ** including all read-cursors are tripped.
3887 **
3888 ** SQLITE_OK is returned if successful, or if an error occurs while
3889 ** saving a cursor position, an SQLite error code.
3890 */
3891 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
3892   BtCursor *p;
3893   int rc = SQLITE_OK;
3894 
3895   assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
3896   if( pBtree ){
3897     sqlite3BtreeEnter(pBtree);
3898     for(p=pBtree->pBt->pCursor; p; p=p->pNext){
3899       int i;
3900       if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
3901         if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
3902           rc = saveCursorPosition(p);
3903           if( rc!=SQLITE_OK ){
3904             (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
3905             break;
3906           }
3907         }
3908       }else{
3909         sqlite3BtreeClearCursor(p);
3910         p->eState = CURSOR_FAULT;
3911         p->skipNext = errCode;
3912       }
3913       for(i=0; i<=p->iPage; i++){
3914         releasePage(p->apPage[i]);
3915         p->apPage[i] = 0;
3916       }
3917     }
3918     sqlite3BtreeLeave(pBtree);
3919   }
3920   return rc;
3921 }
3922 
3923 /*
3924 ** Rollback the transaction in progress.
3925 **
3926 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
3927 ** Only write cursors are tripped if writeOnly is true but all cursors are
3928 ** tripped if writeOnly is false.  Any attempt to use
3929 ** a tripped cursor will result in an error.
3930 **
3931 ** This will release the write lock on the database file.  If there
3932 ** are no active cursors, it also releases the read lock.
3933 */
3934 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
3935   int rc;
3936   BtShared *pBt = p->pBt;
3937   MemPage *pPage1;
3938 
3939   assert( writeOnly==1 || writeOnly==0 );
3940   assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
3941   sqlite3BtreeEnter(p);
3942   if( tripCode==SQLITE_OK ){
3943     rc = tripCode = saveAllCursors(pBt, 0, 0);
3944     if( rc ) writeOnly = 0;
3945   }else{
3946     rc = SQLITE_OK;
3947   }
3948   if( tripCode ){
3949     int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
3950     assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
3951     if( rc2!=SQLITE_OK ) rc = rc2;
3952   }
3953   btreeIntegrity(p);
3954 
3955   if( p->inTrans==TRANS_WRITE ){
3956     int rc2;
3957 
3958     assert( TRANS_WRITE==pBt->inTransaction );
3959     rc2 = sqlite3PagerRollback(pBt->pPager);
3960     if( rc2!=SQLITE_OK ){
3961       rc = rc2;
3962     }
3963 
3964     /* The rollback may have destroyed the pPage1->aData value.  So
3965     ** call btreeGetPage() on page 1 again to make
3966     ** sure pPage1->aData is set correctly. */
3967     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
3968       int nPage = get4byte(28+(u8*)pPage1->aData);
3969       testcase( nPage==0 );
3970       if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
3971       testcase( pBt->nPage!=nPage );
3972       pBt->nPage = nPage;
3973       releasePage(pPage1);
3974     }
3975     assert( countValidCursors(pBt, 1)==0 );
3976     pBt->inTransaction = TRANS_READ;
3977     btreeClearHasContent(pBt);
3978   }
3979 
3980   btreeEndTransaction(p);
3981   sqlite3BtreeLeave(p);
3982   return rc;
3983 }
3984 
3985 /*
3986 ** Start a statement subtransaction. The subtransaction can be rolled
3987 ** back independently of the main transaction. You must start a transaction
3988 ** before starting a subtransaction. The subtransaction is ended automatically
3989 ** if the main transaction commits or rolls back.
3990 **
3991 ** Statement subtransactions are used around individual SQL statements
3992 ** that are contained within a BEGIN...COMMIT block.  If a constraint
3993 ** error occurs within the statement, the effect of that one statement
3994 ** can be rolled back without having to rollback the entire transaction.
3995 **
3996 ** A statement sub-transaction is implemented as an anonymous savepoint. The
3997 ** value passed as the second parameter is the total number of savepoints,
3998 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3999 ** are no active savepoints and no other statement-transactions open,
4000 ** iStatement is 1. This anonymous savepoint can be released or rolled back
4001 ** using the sqlite3BtreeSavepoint() function.
4002 */
4003 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
4004   int rc;
4005   BtShared *pBt = p->pBt;
4006   sqlite3BtreeEnter(p);
4007   assert( p->inTrans==TRANS_WRITE );
4008   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
4009   assert( iStatement>0 );
4010   assert( iStatement>p->db->nSavepoint );
4011   assert( pBt->inTransaction==TRANS_WRITE );
4012   /* At the pager level, a statement transaction is a savepoint with
4013   ** an index greater than all savepoints created explicitly using
4014   ** SQL statements. It is illegal to open, release or rollback any
4015   ** such savepoints while the statement transaction savepoint is active.
4016   */
4017   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
4018   sqlite3BtreeLeave(p);
4019   return rc;
4020 }
4021 
4022 /*
4023 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
4024 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
4025 ** savepoint identified by parameter iSavepoint, depending on the value
4026 ** of op.
4027 **
4028 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
4029 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
4030 ** contents of the entire transaction are rolled back. This is different
4031 ** from a normal transaction rollback, as no locks are released and the
4032 ** transaction remains open.
4033 */
4034 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
4035   int rc = SQLITE_OK;
4036   if( p && p->inTrans==TRANS_WRITE ){
4037     BtShared *pBt = p->pBt;
4038     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
4039     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
4040     sqlite3BtreeEnter(p);
4041     rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
4042     if( rc==SQLITE_OK ){
4043       if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
4044         pBt->nPage = 0;
4045       }
4046       rc = newDatabase(pBt);
4047       pBt->nPage = get4byte(28 + pBt->pPage1->aData);
4048 
4049       /* The database size was written into the offset 28 of the header
4050       ** when the transaction started, so we know that the value at offset
4051       ** 28 is nonzero. */
4052       assert( pBt->nPage>0 );
4053     }
4054     sqlite3BtreeLeave(p);
4055   }
4056   return rc;
4057 }
4058 
4059 /*
4060 ** Create a new cursor for the BTree whose root is on the page
4061 ** iTable. If a read-only cursor is requested, it is assumed that
4062 ** the caller already has at least a read-only transaction open
4063 ** on the database already. If a write-cursor is requested, then
4064 ** the caller is assumed to have an open write transaction.
4065 **
4066 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
4067 ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
4068 ** can be used for reading or for writing if other conditions for writing
4069 ** are also met.  These are the conditions that must be met in order
4070 ** for writing to be allowed:
4071 **
4072 ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
4073 **
4074 ** 2:  Other database connections that share the same pager cache
4075 **     but which are not in the READ_UNCOMMITTED state may not have
4076 **     cursors open with wrFlag==0 on the same table.  Otherwise
4077 **     the changes made by this write cursor would be visible to
4078 **     the read cursors in the other database connection.
4079 **
4080 ** 3:  The database must be writable (not on read-only media)
4081 **
4082 ** 4:  There must be an active transaction.
4083 **
4084 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
4085 ** is set.  If FORDELETE is set, that is a hint to the implementation that
4086 ** this cursor will only be used to seek to and delete entries of an index
4087 ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
4088 ** this implementation.  But in a hypothetical alternative storage engine
4089 ** in which index entries are automatically deleted when corresponding table
4090 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
4091 ** operations on this cursor can be no-ops and all READ operations can
4092 ** return a null row (2-bytes: 0x01 0x00).
4093 **
4094 ** No checking is done to make sure that page iTable really is the
4095 ** root page of a b-tree.  If it is not, then the cursor acquired
4096 ** will not work correctly.
4097 **
4098 ** It is assumed that the sqlite3BtreeCursorZero() has been called
4099 ** on pCur to initialize the memory space prior to invoking this routine.
4100 */
4101 static int btreeCursor(
4102   Btree *p,                              /* The btree */
4103   int iTable,                            /* Root page of table to open */
4104   int wrFlag,                            /* 1 to write. 0 read-only */
4105   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
4106   BtCursor *pCur                         /* Space for new cursor */
4107 ){
4108   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
4109   BtCursor *pX;                          /* Looping over other all cursors */
4110 
4111   assert( sqlite3BtreeHoldsMutex(p) );
4112   assert( wrFlag==0
4113        || wrFlag==BTREE_WRCSR
4114        || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE)
4115   );
4116 
4117   /* The following assert statements verify that if this is a sharable
4118   ** b-tree database, the connection is holding the required table locks,
4119   ** and that no other connection has any open cursor that conflicts with
4120   ** this lock.  */
4121   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) );
4122   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
4123 
4124   /* Assert that the caller has opened the required transaction. */
4125   assert( p->inTrans>TRANS_NONE );
4126   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
4127   assert( pBt->pPage1 && pBt->pPage1->aData );
4128   assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
4129 
4130   if( wrFlag ){
4131     allocateTempSpace(pBt);
4132     if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
4133   }
4134   if( iTable==1 && btreePagecount(pBt)==0 ){
4135     assert( wrFlag==0 );
4136     iTable = 0;
4137   }
4138 
4139   /* Now that no other errors can occur, finish filling in the BtCursor
4140   ** variables and link the cursor into the BtShared list.  */
4141   pCur->pgnoRoot = (Pgno)iTable;
4142   pCur->iPage = -1;
4143   pCur->pKeyInfo = pKeyInfo;
4144   pCur->pBtree = p;
4145   pCur->pBt = pBt;
4146   pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;
4147   pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
4148   /* If there are two or more cursors on the same btree, then all such
4149   ** cursors *must* have the BTCF_Multiple flag set. */
4150   for(pX=pBt->pCursor; pX; pX=pX->pNext){
4151     if( pX->pgnoRoot==(Pgno)iTable ){
4152       pX->curFlags |= BTCF_Multiple;
4153       pCur->curFlags |= BTCF_Multiple;
4154     }
4155   }
4156   pCur->pNext = pBt->pCursor;
4157   pBt->pCursor = pCur;
4158   pCur->eState = CURSOR_INVALID;
4159   return SQLITE_OK;
4160 }
4161 int sqlite3BtreeCursor(
4162   Btree *p,                                   /* The btree */
4163   int iTable,                                 /* Root page of table to open */
4164   int wrFlag,                                 /* 1 to write. 0 read-only */
4165   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
4166   BtCursor *pCur                              /* Write new cursor here */
4167 ){
4168   int rc;
4169   if( iTable<1 ){
4170     rc = SQLITE_CORRUPT_BKPT;
4171   }else{
4172     sqlite3BtreeEnter(p);
4173     rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4174     sqlite3BtreeLeave(p);
4175   }
4176   return rc;
4177 }
4178 
4179 /*
4180 ** Return the size of a BtCursor object in bytes.
4181 **
4182 ** This interfaces is needed so that users of cursors can preallocate
4183 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
4184 ** to users so they cannot do the sizeof() themselves - they must call
4185 ** this routine.
4186 */
4187 int sqlite3BtreeCursorSize(void){
4188   return ROUND8(sizeof(BtCursor));
4189 }
4190 
4191 /*
4192 ** Initialize memory that will be converted into a BtCursor object.
4193 **
4194 ** The simple approach here would be to memset() the entire object
4195 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
4196 ** do not need to be zeroed and they are large, so we can save a lot
4197 ** of run-time by skipping the initialization of those elements.
4198 */
4199 void sqlite3BtreeCursorZero(BtCursor *p){
4200   memset(p, 0, offsetof(BtCursor, iPage));
4201 }
4202 
4203 /*
4204 ** Close a cursor.  The read lock on the database file is released
4205 ** when the last cursor is closed.
4206 */
4207 int sqlite3BtreeCloseCursor(BtCursor *pCur){
4208   Btree *pBtree = pCur->pBtree;
4209   if( pBtree ){
4210     int i;
4211     BtShared *pBt = pCur->pBt;
4212     sqlite3BtreeEnter(pBtree);
4213     sqlite3BtreeClearCursor(pCur);
4214     assert( pBt->pCursor!=0 );
4215     if( pBt->pCursor==pCur ){
4216       pBt->pCursor = pCur->pNext;
4217     }else{
4218       BtCursor *pPrev = pBt->pCursor;
4219       do{
4220         if( pPrev->pNext==pCur ){
4221           pPrev->pNext = pCur->pNext;
4222           break;
4223         }
4224         pPrev = pPrev->pNext;
4225       }while( ALWAYS(pPrev) );
4226     }
4227     for(i=0; i<=pCur->iPage; i++){
4228       releasePage(pCur->apPage[i]);
4229     }
4230     unlockBtreeIfUnused(pBt);
4231     sqlite3_free(pCur->aOverflow);
4232     /* sqlite3_free(pCur); */
4233     sqlite3BtreeLeave(pBtree);
4234   }
4235   return SQLITE_OK;
4236 }
4237 
4238 /*
4239 ** Make sure the BtCursor* given in the argument has a valid
4240 ** BtCursor.info structure.  If it is not already valid, call
4241 ** btreeParseCell() to fill it in.
4242 **
4243 ** BtCursor.info is a cache of the information in the current cell.
4244 ** Using this cache reduces the number of calls to btreeParseCell().
4245 */
4246 #ifndef NDEBUG
4247   static void assertCellInfo(BtCursor *pCur){
4248     CellInfo info;
4249     int iPage = pCur->iPage;
4250     memset(&info, 0, sizeof(info));
4251     btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
4252     assert( CORRUPT_DB || memcmp(&info, &pCur->info, sizeof(info))==0 );
4253   }
4254 #else
4255   #define assertCellInfo(x)
4256 #endif
4257 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
4258   if( pCur->info.nSize==0 ){
4259     int iPage = pCur->iPage;
4260     pCur->curFlags |= BTCF_ValidNKey;
4261     btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
4262   }else{
4263     assertCellInfo(pCur);
4264   }
4265 }
4266 
4267 #ifndef NDEBUG  /* The next routine used only within assert() statements */
4268 /*
4269 ** Return true if the given BtCursor is valid.  A valid cursor is one
4270 ** that is currently pointing to a row in a (non-empty) table.
4271 ** This is a verification routine is used only within assert() statements.
4272 */
4273 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
4274   return pCur && pCur->eState==CURSOR_VALID;
4275 }
4276 #endif /* NDEBUG */
4277 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
4278   assert( pCur!=0 );
4279   return pCur->eState==CURSOR_VALID;
4280 }
4281 
4282 /*
4283 ** Return the value of the integer key or "rowid" for a table btree.
4284 ** This routine is only valid for a cursor that is pointing into a
4285 ** ordinary table btree.  If the cursor points to an index btree or
4286 ** is invalid, the result of this routine is undefined.
4287 */
4288 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
4289   assert( cursorHoldsMutex(pCur) );
4290   assert( pCur->eState==CURSOR_VALID );
4291   assert( pCur->curIntKey );
4292   getCellInfo(pCur);
4293   return pCur->info.nKey;
4294 }
4295 
4296 /*
4297 ** Return the number of bytes of payload for the entry that pCur is
4298 ** currently pointing to.  For table btrees, this will be the amount
4299 ** of data.  For index btrees, this will be the size of the key.
4300 **
4301 ** The caller must guarantee that the cursor is pointing to a non-NULL
4302 ** valid entry.  In other words, the calling procedure must guarantee
4303 ** that the cursor has Cursor.eState==CURSOR_VALID.
4304 */
4305 u32 sqlite3BtreePayloadSize(BtCursor *pCur){
4306   assert( cursorHoldsMutex(pCur) );
4307   assert( pCur->eState==CURSOR_VALID );
4308   getCellInfo(pCur);
4309   return pCur->info.nPayload;
4310 }
4311 
4312 /*
4313 ** Given the page number of an overflow page in the database (parameter
4314 ** ovfl), this function finds the page number of the next page in the
4315 ** linked list of overflow pages. If possible, it uses the auto-vacuum
4316 ** pointer-map data instead of reading the content of page ovfl to do so.
4317 **
4318 ** If an error occurs an SQLite error code is returned. Otherwise:
4319 **
4320 ** The page number of the next overflow page in the linked list is
4321 ** written to *pPgnoNext. If page ovfl is the last page in its linked
4322 ** list, *pPgnoNext is set to zero.
4323 **
4324 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
4325 ** to page number pOvfl was obtained, then *ppPage is set to point to that
4326 ** reference. It is the responsibility of the caller to call releasePage()
4327 ** on *ppPage to free the reference. In no reference was obtained (because
4328 ** the pointer-map was used to obtain the value for *pPgnoNext), then
4329 ** *ppPage is set to zero.
4330 */
4331 static int getOverflowPage(
4332   BtShared *pBt,               /* The database file */
4333   Pgno ovfl,                   /* Current overflow page number */
4334   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
4335   Pgno *pPgnoNext              /* OUT: Next overflow page number */
4336 ){
4337   Pgno next = 0;
4338   MemPage *pPage = 0;
4339   int rc = SQLITE_OK;
4340 
4341   assert( sqlite3_mutex_held(pBt->mutex) );
4342   assert(pPgnoNext);
4343 
4344 #ifndef SQLITE_OMIT_AUTOVACUUM
4345   /* Try to find the next page in the overflow list using the
4346   ** autovacuum pointer-map pages. Guess that the next page in
4347   ** the overflow list is page number (ovfl+1). If that guess turns
4348   ** out to be wrong, fall back to loading the data of page
4349   ** number ovfl to determine the next page number.
4350   */
4351   if( pBt->autoVacuum ){
4352     Pgno pgno;
4353     Pgno iGuess = ovfl+1;
4354     u8 eType;
4355 
4356     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
4357       iGuess++;
4358     }
4359 
4360     if( iGuess<=btreePagecount(pBt) ){
4361       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
4362       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
4363         next = iGuess;
4364         rc = SQLITE_DONE;
4365       }
4366     }
4367   }
4368 #endif
4369 
4370   assert( next==0 || rc==SQLITE_DONE );
4371   if( rc==SQLITE_OK ){
4372     rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
4373     assert( rc==SQLITE_OK || pPage==0 );
4374     if( rc==SQLITE_OK ){
4375       next = get4byte(pPage->aData);
4376     }
4377   }
4378 
4379   *pPgnoNext = next;
4380   if( ppPage ){
4381     *ppPage = pPage;
4382   }else{
4383     releasePage(pPage);
4384   }
4385   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
4386 }
4387 
4388 /*
4389 ** Copy data from a buffer to a page, or from a page to a buffer.
4390 **
4391 ** pPayload is a pointer to data stored on database page pDbPage.
4392 ** If argument eOp is false, then nByte bytes of data are copied
4393 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
4394 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
4395 ** of data are copied from the buffer pBuf to pPayload.
4396 **
4397 ** SQLITE_OK is returned on success, otherwise an error code.
4398 */
4399 static int copyPayload(
4400   void *pPayload,           /* Pointer to page data */
4401   void *pBuf,               /* Pointer to buffer */
4402   int nByte,                /* Number of bytes to copy */
4403   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
4404   DbPage *pDbPage           /* Page containing pPayload */
4405 ){
4406   if( eOp ){
4407     /* Copy data from buffer to page (a write operation) */
4408     int rc = sqlite3PagerWrite(pDbPage);
4409     if( rc!=SQLITE_OK ){
4410       return rc;
4411     }
4412     memcpy(pPayload, pBuf, nByte);
4413   }else{
4414     /* Copy data from page to buffer (a read operation) */
4415     memcpy(pBuf, pPayload, nByte);
4416   }
4417   return SQLITE_OK;
4418 }
4419 
4420 /*
4421 ** This function is used to read or overwrite payload information
4422 ** for the entry that the pCur cursor is pointing to. The eOp
4423 ** argument is interpreted as follows:
4424 **
4425 **   0: The operation is a read. Populate the overflow cache.
4426 **   1: The operation is a write. Populate the overflow cache.
4427 **
4428 ** A total of "amt" bytes are read or written beginning at "offset".
4429 ** Data is read to or from the buffer pBuf.
4430 **
4431 ** The content being read or written might appear on the main page
4432 ** or be scattered out on multiple overflow pages.
4433 **
4434 ** If the current cursor entry uses one or more overflow pages
4435 ** this function may allocate space for and lazily populate
4436 ** the overflow page-list cache array (BtCursor.aOverflow).
4437 ** Subsequent calls use this cache to make seeking to the supplied offset
4438 ** more efficient.
4439 **
4440 ** Once an overflow page-list cache has been allocated, it must be
4441 ** invalidated if some other cursor writes to the same table, or if
4442 ** the cursor is moved to a different row. Additionally, in auto-vacuum
4443 ** mode, the following events may invalidate an overflow page-list cache.
4444 **
4445 **   * An incremental vacuum,
4446 **   * A commit in auto_vacuum="full" mode,
4447 **   * Creating a table (may require moving an overflow page).
4448 */
4449 static int accessPayload(
4450   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4451   u32 offset,          /* Begin reading this far into payload */
4452   u32 amt,             /* Read this many bytes */
4453   unsigned char *pBuf, /* Write the bytes into this buffer */
4454   int eOp              /* zero to read. non-zero to write. */
4455 ){
4456   unsigned char *aPayload;
4457   int rc = SQLITE_OK;
4458   int iIdx = 0;
4459   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
4460   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
4461 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4462   unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
4463 #endif
4464 
4465   assert( pPage );
4466   assert( eOp==0 || eOp==1 );
4467   assert( pCur->eState==CURSOR_VALID );
4468   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4469   assert( cursorHoldsMutex(pCur) );
4470 
4471   getCellInfo(pCur);
4472   aPayload = pCur->info.pPayload;
4473   assert( offset+amt <= pCur->info.nPayload );
4474 
4475   assert( aPayload > pPage->aData );
4476   if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
4477     /* Trying to read or write past the end of the data is an error.  The
4478     ** conditional above is really:
4479     **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
4480     ** but is recast into its current form to avoid integer overflow problems
4481     */
4482     return SQLITE_CORRUPT_BKPT;
4483   }
4484 
4485   /* Check if data must be read/written to/from the btree page itself. */
4486   if( offset<pCur->info.nLocal ){
4487     int a = amt;
4488     if( a+offset>pCur->info.nLocal ){
4489       a = pCur->info.nLocal - offset;
4490     }
4491     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
4492     offset = 0;
4493     pBuf += a;
4494     amt -= a;
4495   }else{
4496     offset -= pCur->info.nLocal;
4497   }
4498 
4499 
4500   if( rc==SQLITE_OK && amt>0 ){
4501     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
4502     Pgno nextPage;
4503 
4504     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
4505 
4506     /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
4507     **
4508     ** The aOverflow[] array is sized at one entry for each overflow page
4509     ** in the overflow chain. The page number of the first overflow page is
4510     ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
4511     ** means "not yet known" (the cache is lazily populated).
4512     */
4513     if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
4514       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
4515       if( nOvfl>pCur->nOvflAlloc ){
4516         Pgno *aNew = (Pgno*)sqlite3Realloc(
4517             pCur->aOverflow, nOvfl*2*sizeof(Pgno)
4518         );
4519         if( aNew==0 ){
4520           return SQLITE_NOMEM_BKPT;
4521         }else{
4522           pCur->nOvflAlloc = nOvfl*2;
4523           pCur->aOverflow = aNew;
4524         }
4525       }
4526       memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
4527       pCur->curFlags |= BTCF_ValidOvfl;
4528     }else{
4529       /* If the overflow page-list cache has been allocated and the
4530       ** entry for the first required overflow page is valid, skip
4531       ** directly to it.
4532       */
4533       if( pCur->aOverflow[offset/ovflSize] ){
4534         iIdx = (offset/ovflSize);
4535         nextPage = pCur->aOverflow[iIdx];
4536         offset = (offset%ovflSize);
4537       }
4538     }
4539 
4540     assert( rc==SQLITE_OK && amt>0 );
4541     while( nextPage ){
4542       /* If required, populate the overflow page-list cache. */
4543       assert( pCur->aOverflow[iIdx]==0
4544               || pCur->aOverflow[iIdx]==nextPage
4545               || CORRUPT_DB );
4546       pCur->aOverflow[iIdx] = nextPage;
4547 
4548       if( offset>=ovflSize ){
4549         /* The only reason to read this page is to obtain the page
4550         ** number for the next page in the overflow chain. The page
4551         ** data is not required. So first try to lookup the overflow
4552         ** page-list cache, if any, then fall back to the getOverflowPage()
4553         ** function.
4554         */
4555         assert( pCur->curFlags & BTCF_ValidOvfl );
4556         assert( pCur->pBtree->db==pBt->db );
4557         if( pCur->aOverflow[iIdx+1] ){
4558           nextPage = pCur->aOverflow[iIdx+1];
4559         }else{
4560           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
4561         }
4562         offset -= ovflSize;
4563       }else{
4564         /* Need to read this page properly. It contains some of the
4565         ** range of data that is being read (eOp==0) or written (eOp!=0).
4566         */
4567 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4568         sqlite3_file *fd;      /* File from which to do direct overflow read */
4569 #endif
4570         int a = amt;
4571         if( a + offset > ovflSize ){
4572           a = ovflSize - offset;
4573         }
4574 
4575 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4576         /* If all the following are true:
4577         **
4578         **   1) this is a read operation, and
4579         **   2) data is required from the start of this overflow page, and
4580         **   3) there is no open write-transaction, and
4581         **   4) the database is file-backed, and
4582         **   5) the page is not in the WAL file
4583         **   6) at least 4 bytes have already been read into the output buffer
4584         **
4585         ** then data can be read directly from the database file into the
4586         ** output buffer, bypassing the page-cache altogether. This speeds
4587         ** up loading large records that span many overflow pages.
4588         */
4589         if( eOp==0                                             /* (1) */
4590          && offset==0                                          /* (2) */
4591          && pBt->inTransaction==TRANS_READ                     /* (3) */
4592          && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (4) */
4593          && 0==sqlite3PagerUseWal(pBt->pPager, nextPage)       /* (5) */
4594          && &pBuf[-4]>=pBufStart                               /* (6) */
4595         ){
4596           u8 aSave[4];
4597           u8 *aWrite = &pBuf[-4];
4598           assert( aWrite>=pBufStart );                         /* due to (6) */
4599           memcpy(aSave, aWrite, 4);
4600           rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
4601           nextPage = get4byte(aWrite);
4602           memcpy(aWrite, aSave, 4);
4603         }else
4604 #endif
4605 
4606         {
4607           DbPage *pDbPage;
4608           rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
4609               (eOp==0 ? PAGER_GET_READONLY : 0)
4610           );
4611           if( rc==SQLITE_OK ){
4612             aPayload = sqlite3PagerGetData(pDbPage);
4613             nextPage = get4byte(aPayload);
4614             rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
4615             sqlite3PagerUnref(pDbPage);
4616             offset = 0;
4617           }
4618         }
4619         amt -= a;
4620         if( amt==0 ) return rc;
4621         pBuf += a;
4622       }
4623       if( rc ) break;
4624       iIdx++;
4625     }
4626   }
4627 
4628   if( rc==SQLITE_OK && amt>0 ){
4629     return SQLITE_CORRUPT_BKPT; /* Overflow chain ends prematurely */
4630   }
4631   return rc;
4632 }
4633 
4634 /*
4635 ** Read part of the payload for the row at which that cursor pCur is currently
4636 ** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
4637 ** begins at "offset".
4638 **
4639 ** pCur can be pointing to either a table or an index b-tree.
4640 ** If pointing to a table btree, then the content section is read.  If
4641 ** pCur is pointing to an index b-tree then the key section is read.
4642 **
4643 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
4644 ** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
4645 ** cursor might be invalid or might need to be restored before being read.
4646 **
4647 ** Return SQLITE_OK on success or an error code if anything goes
4648 ** wrong.  An error is returned if "offset+amt" is larger than
4649 ** the available payload.
4650 */
4651 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4652   assert( cursorHoldsMutex(pCur) );
4653   assert( pCur->eState==CURSOR_VALID );
4654   assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4655   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4656   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
4657 }
4658 
4659 /*
4660 ** This variant of sqlite3BtreePayload() works even if the cursor has not
4661 ** in the CURSOR_VALID state.  It is only used by the sqlite3_blob_read()
4662 ** interface.
4663 */
4664 #ifndef SQLITE_OMIT_INCRBLOB
4665 static SQLITE_NOINLINE int accessPayloadChecked(
4666   BtCursor *pCur,
4667   u32 offset,
4668   u32 amt,
4669   void *pBuf
4670 ){
4671   int rc;
4672   if ( pCur->eState==CURSOR_INVALID ){
4673     return SQLITE_ABORT;
4674   }
4675   assert( cursorOwnsBtShared(pCur) );
4676   rc = btreeRestoreCursorPosition(pCur);
4677   return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);
4678 }
4679 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4680   if( pCur->eState==CURSOR_VALID ){
4681     assert( cursorOwnsBtShared(pCur) );
4682     return accessPayload(pCur, offset, amt, pBuf, 0);
4683   }else{
4684     return accessPayloadChecked(pCur, offset, amt, pBuf);
4685   }
4686 }
4687 #endif /* SQLITE_OMIT_INCRBLOB */
4688 
4689 /*
4690 ** Return a pointer to payload information from the entry that the
4691 ** pCur cursor is pointing to.  The pointer is to the beginning of
4692 ** the key if index btrees (pPage->intKey==0) and is the data for
4693 ** table btrees (pPage->intKey==1). The number of bytes of available
4694 ** key/data is written into *pAmt.  If *pAmt==0, then the value
4695 ** returned will not be a valid pointer.
4696 **
4697 ** This routine is an optimization.  It is common for the entire key
4698 ** and data to fit on the local page and for there to be no overflow
4699 ** pages.  When that is so, this routine can be used to access the
4700 ** key and data without making a copy.  If the key and/or data spills
4701 ** onto overflow pages, then accessPayload() must be used to reassemble
4702 ** the key/data and copy it into a preallocated buffer.
4703 **
4704 ** The pointer returned by this routine looks directly into the cached
4705 ** page of the database.  The data might change or move the next time
4706 ** any btree routine is called.
4707 */
4708 static const void *fetchPayload(
4709   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4710   u32 *pAmt            /* Write the number of available bytes here */
4711 ){
4712   u32 amt;
4713   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
4714   assert( pCur->eState==CURSOR_VALID );
4715   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4716   assert( cursorOwnsBtShared(pCur) );
4717   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4718   assert( pCur->info.nSize>0 );
4719   assert( pCur->info.pPayload>pCur->apPage[pCur->iPage]->aData || CORRUPT_DB );
4720   assert( pCur->info.pPayload<pCur->apPage[pCur->iPage]->aDataEnd ||CORRUPT_DB);
4721   amt = (int)(pCur->apPage[pCur->iPage]->aDataEnd - pCur->info.pPayload);
4722   if( pCur->info.nLocal<amt ) amt = pCur->info.nLocal;
4723   *pAmt = amt;
4724   return (void*)pCur->info.pPayload;
4725 }
4726 
4727 
4728 /*
4729 ** For the entry that cursor pCur is point to, return as
4730 ** many bytes of the key or data as are available on the local
4731 ** b-tree page.  Write the number of available bytes into *pAmt.
4732 **
4733 ** The pointer returned is ephemeral.  The key/data may move
4734 ** or be destroyed on the next call to any Btree routine,
4735 ** including calls from other threads against the same cache.
4736 ** Hence, a mutex on the BtShared should be held prior to calling
4737 ** this routine.
4738 **
4739 ** These routines is used to get quick access to key and data
4740 ** in the common case where no overflow pages are used.
4741 */
4742 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
4743   return fetchPayload(pCur, pAmt);
4744 }
4745 
4746 
4747 /*
4748 ** Move the cursor down to a new child page.  The newPgno argument is the
4749 ** page number of the child page to move to.
4750 **
4751 ** This function returns SQLITE_CORRUPT if the page-header flags field of
4752 ** the new child page does not match the flags field of the parent (i.e.
4753 ** if an intkey page appears to be the parent of a non-intkey page, or
4754 ** vice-versa).
4755 */
4756 static int moveToChild(BtCursor *pCur, u32 newPgno){
4757   BtShared *pBt = pCur->pBt;
4758 
4759   assert( cursorOwnsBtShared(pCur) );
4760   assert( pCur->eState==CURSOR_VALID );
4761   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4762   assert( pCur->iPage>=0 );
4763   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
4764     return SQLITE_CORRUPT_BKPT;
4765   }
4766   pCur->info.nSize = 0;
4767   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4768   pCur->iPage++;
4769   pCur->aiIdx[pCur->iPage] = 0;
4770   return getAndInitPage(pBt, newPgno, &pCur->apPage[pCur->iPage],
4771                         pCur, pCur->curPagerFlags);
4772 }
4773 
#if SQLITE_DEBUG
/*
** Page pParent is an internal (non-leaf) tree page.  Assert that page
** number iChild is the left-child of the iIdx'th cell of pParent or, when
** iIdx equals the number of cells in pParent, that iChild is the
** right-child of the page.
**
** Nothing is checked when CORRUPT_DB is true, because these conditions
** might not hold in a corrupt database.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  if( CORRUPT_DB ) return;
  assert( iIdx<=pParent->nCell );
  if( iIdx!=pParent->nCell ){
    /* Left-child pointer: first four bytes of cell iIdx. */
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    /* Right-child pointer: stored at offset 8 of the page header. */
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z)
#endif
4795 
4796 /*
4797 ** Move the cursor up to the parent page.
4798 **
4799 ** pCur->idx is set to the cell index that contains the pointer
4800 ** to the page we are coming from.  If we are coming from the
4801 ** right-most child page then pCur->idx is set to one more than
4802 ** the largest cell index.
4803 */
4804 static void moveToParent(BtCursor *pCur){
4805   assert( cursorOwnsBtShared(pCur) );
4806   assert( pCur->eState==CURSOR_VALID );
4807   assert( pCur->iPage>0 );
4808   assert( pCur->apPage[pCur->iPage] );
4809   assertParentIndex(
4810     pCur->apPage[pCur->iPage-1],
4811     pCur->aiIdx[pCur->iPage-1],
4812     pCur->apPage[pCur->iPage]->pgno
4813   );
4814   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
4815   pCur->info.nSize = 0;
4816   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4817   releasePageNotNull(pCur->apPage[pCur->iPage--]);
4818 }
4819 
4820 /*
4821 ** Move the cursor to point to the root page of its b-tree structure.
4822 **
4823 ** If the table has a virtual root page, then the cursor is moved to point
4824 ** to the virtual root page instead of the actual root page. A table has a
4825 ** virtual root page when the actual root page contains no cells and a
4826 ** single child page. This can only happen with the table rooted at page 1.
4827 **
4828 ** If the b-tree structure is empty, the cursor state is set to
4829 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
4830 ** cell located on the root (or virtual root) page and the cursor state
4831 ** is set to CURSOR_VALID.
4832 **
4833 ** If this function returns successfully, it may be assumed that the
4834 ** page-header flags indicate that the [virtual] root-page is the expected
4835 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
4836 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4837 ** indicating a table b-tree, or if the caller did specify a KeyInfo
4838 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4839 ** b-tree).
4840 */
4841 static int moveToRoot(BtCursor *pCur){
4842   MemPage *pRoot;
4843   int rc = SQLITE_OK;
4844 
4845   assert( cursorOwnsBtShared(pCur) );
4846   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4847   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
4848   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
4849   if( pCur->eState>=CURSOR_REQUIRESEEK ){
4850     if( pCur->eState==CURSOR_FAULT ){
4851       assert( pCur->skipNext!=SQLITE_OK );
4852       return pCur->skipNext;
4853     }
4854     sqlite3BtreeClearCursor(pCur);
4855   }
4856 
4857   if( pCur->iPage>=0 ){
4858     if( pCur->iPage ){
4859       do{
4860         assert( pCur->apPage[pCur->iPage]!=0 );
4861         releasePageNotNull(pCur->apPage[pCur->iPage--]);
4862       }while( pCur->iPage);
4863       goto skip_init;
4864     }
4865   }else if( pCur->pgnoRoot==0 ){
4866     pCur->eState = CURSOR_INVALID;
4867     return SQLITE_OK;
4868   }else{
4869     assert( pCur->iPage==(-1) );
4870     rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0],
4871                         0, pCur->curPagerFlags);
4872     if( rc!=SQLITE_OK ){
4873       pCur->eState = CURSOR_INVALID;
4874        return rc;
4875     }
4876     pCur->iPage = 0;
4877     pCur->curIntKey = pCur->apPage[0]->intKey;
4878   }
4879   pRoot = pCur->apPage[0];
4880   assert( pRoot->pgno==pCur->pgnoRoot );
4881 
4882   /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4883   ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4884   ** NULL, the caller expects a table b-tree. If this is not the case,
4885   ** return an SQLITE_CORRUPT error.
4886   **
4887   ** Earlier versions of SQLite assumed that this test could not fail
4888   ** if the root page was already loaded when this function was called (i.e.
4889   ** if pCur->iPage>=0). But this is not so if the database is corrupted
4890   ** in such a way that page pRoot is linked into a second b-tree table
4891   ** (or the freelist).  */
4892   assert( pRoot->intKey==1 || pRoot->intKey==0 );
4893   if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
4894     return SQLITE_CORRUPT_BKPT;
4895   }
4896 
4897 skip_init:
4898   pCur->aiIdx[0] = 0;
4899   pCur->info.nSize = 0;
4900   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
4901 
4902   pRoot = pCur->apPage[0];
4903   if( pRoot->nCell>0 ){
4904     pCur->eState = CURSOR_VALID;
4905   }else if( !pRoot->leaf ){
4906     Pgno subpage;
4907     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
4908     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
4909     pCur->eState = CURSOR_VALID;
4910     rc = moveToChild(pCur, subpage);
4911   }else{
4912     pCur->eState = CURSOR_INVALID;
4913   }
4914   return rc;
4915 }
4916 
4917 /*
4918 ** Move the cursor down to the left-most leaf entry beneath the
4919 ** entry to which it is currently pointing.
4920 **
4921 ** The left-most leaf is the one with the smallest key - the first
4922 ** in ascending order.
4923 */
4924 static int moveToLeftmost(BtCursor *pCur){
4925   Pgno pgno;
4926   int rc = SQLITE_OK;
4927   MemPage *pPage;
4928 
4929   assert( cursorOwnsBtShared(pCur) );
4930   assert( pCur->eState==CURSOR_VALID );
4931   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4932     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4933     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
4934     rc = moveToChild(pCur, pgno);
4935   }
4936   return rc;
4937 }
4938 
4939 /*
4940 ** Move the cursor down to the right-most leaf entry beneath the
4941 ** page to which it is currently pointing.  Notice the difference
4942 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
4943 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4944 ** finds the right-most entry beneath the *page*.
4945 **
4946 ** The right-most entry is the one with the largest key - the last
4947 ** key in ascending order.
4948 */
4949 static int moveToRightmost(BtCursor *pCur){
4950   Pgno pgno;
4951   int rc = SQLITE_OK;
4952   MemPage *pPage = 0;
4953 
4954   assert( cursorOwnsBtShared(pCur) );
4955   assert( pCur->eState==CURSOR_VALID );
4956   while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4957     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4958     pCur->aiIdx[pCur->iPage] = pPage->nCell;
4959     rc = moveToChild(pCur, pgno);
4960     if( rc ) return rc;
4961   }
4962   pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
4963   assert( pCur->info.nSize==0 );
4964   assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
4965   return SQLITE_OK;
4966 }
4967 
4968 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
4969 ** on success.  Set *pRes to 0 if the cursor actually points to something
4970 ** or set *pRes to 1 if the table is empty.
4971 */
4972 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
4973   int rc;
4974 
4975   assert( cursorOwnsBtShared(pCur) );
4976   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4977   rc = moveToRoot(pCur);
4978   if( rc==SQLITE_OK ){
4979     if( pCur->eState==CURSOR_INVALID ){
4980       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4981       *pRes = 1;
4982     }else{
4983       assert( pCur->apPage[pCur->iPage]->nCell>0 );
4984       *pRes = 0;
4985       rc = moveToLeftmost(pCur);
4986     }
4987   }
4988   return rc;
4989 }
4990 
4991 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
4992 ** on success.  Set *pRes to 0 if the cursor actually points to something
4993 ** or set *pRes to 1 if the table is empty.
4994 */
4995 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
4996   int rc;
4997 
4998   assert( cursorOwnsBtShared(pCur) );
4999   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5000 
5001   /* If the cursor already points to the last entry, this is a no-op. */
5002   if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
5003 #ifdef SQLITE_DEBUG
5004     /* This block serves to assert() that the cursor really does point
5005     ** to the last entry in the b-tree. */
5006     int ii;
5007     for(ii=0; ii<pCur->iPage; ii++){
5008       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
5009     }
5010     assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
5011     assert( pCur->apPage[pCur->iPage]->leaf );
5012 #endif
5013     return SQLITE_OK;
5014   }
5015 
5016   rc = moveToRoot(pCur);
5017   if( rc==SQLITE_OK ){
5018     if( CURSOR_INVALID==pCur->eState ){
5019       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
5020       *pRes = 1;
5021     }else{
5022       assert( pCur->eState==CURSOR_VALID );
5023       *pRes = 0;
5024       rc = moveToRightmost(pCur);
5025       if( rc==SQLITE_OK ){
5026         pCur->curFlags |= BTCF_AtLast;
5027       }else{
5028         pCur->curFlags &= ~BTCF_AtLast;
5029       }
5030 
5031     }
5032   }
5033   return rc;
5034 }
5035 
5036 /* Move the cursor so that it points to an entry near the key
5037 ** specified by pIdxKey or intKey.   Return a success code.
5038 **
5039 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
5040 ** must be NULL.  For index tables, pIdxKey is used and intKey
5041 ** is ignored.
5042 **
5043 ** If an exact match is not found, then the cursor is always
5044 ** left pointing at a leaf page which would hold the entry if it
5045 ** were present.  The cursor might point to an entry that comes
5046 ** before or after the key.
5047 **
5048 ** An integer is written into *pRes which is the result of
5049 ** comparing the key with the entry to which the cursor is
5050 ** pointing.  The meaning of the integer written into
5051 ** *pRes is as follows:
5052 **
5053 **     *pRes<0      The cursor is left pointing at an entry that
5054 **                  is smaller than intKey/pIdxKey or if the table is empty
5055 **                  and the cursor is therefore left pointing at nothing.
5056 **
5057 **     *pRes==0     The cursor is left pointing at an entry that
5058 **                  exactly matches intKey/pIdxKey.
5059 **
5060 **     *pRes>0      The cursor is left pointing at an entry that
5061 **                  is larger than intKey/pIdxKey.
5062 **
5063 ** For index tables, the pIdxKey->eqSeen field is set to 1 if there
5064 ** exists an entry in the table that exactly matches pIdxKey.
5065 */
5066 int sqlite3BtreeMovetoUnpacked(
5067   BtCursor *pCur,          /* The cursor to be moved */
5068   UnpackedRecord *pIdxKey, /* Unpacked index key */
5069   i64 intKey,              /* The table key */
5070   int biasRight,           /* If true, bias the search to the high end */
5071   int *pRes                /* Write search results here */
5072 ){
5073   int rc;
5074   RecordCompare xRecordCompare;
5075 
5076   assert( cursorOwnsBtShared(pCur) );
5077   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5078   assert( pRes );
5079   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
5080   assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) );
5081 
5082   /* If the cursor is already positioned at the point we are trying
5083   ** to move to, then just return without doing any work */
5084   if( pIdxKey==0
5085    && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
5086   ){
5087     if( pCur->info.nKey==intKey ){
5088       *pRes = 0;
5089       return SQLITE_OK;
5090     }
5091     if( pCur->info.nKey<intKey ){
5092       if( (pCur->curFlags & BTCF_AtLast)!=0 ){
5093         *pRes = -1;
5094         return SQLITE_OK;
5095       }
5096       /* If the requested key is one more than the previous key, then
5097       ** try to get there using sqlite3BtreeNext() rather than a full
5098       ** binary search.  This is an optimization only.  The correct answer
5099       ** is still obtained without this ase, only a little more slowely */
5100       if( pCur->info.nKey+1==intKey && !pCur->skipNext ){
5101         *pRes = 0;
5102         rc = sqlite3BtreeNext(pCur, pRes);
5103         if( rc ) return rc;
5104         if( *pRes==0 ){
5105           getCellInfo(pCur);
5106           if( pCur->info.nKey==intKey ){
5107             return SQLITE_OK;
5108           }
5109         }
5110       }
5111     }
5112   }
5113 
5114   if( pIdxKey ){
5115     xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
5116     pIdxKey->errCode = 0;
5117     assert( pIdxKey->default_rc==1
5118          || pIdxKey->default_rc==0
5119          || pIdxKey->default_rc==-1
5120     );
5121   }else{
5122     xRecordCompare = 0; /* All keys are integers */
5123   }
5124 
5125   rc = moveToRoot(pCur);
5126   if( rc ){
5127     return rc;
5128   }
5129   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage] );
5130   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->isInit );
5131   assert( pCur->eState==CURSOR_INVALID || pCur->apPage[pCur->iPage]->nCell>0 );
5132   if( pCur->eState==CURSOR_INVALID ){
5133     *pRes = -1;
5134     assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
5135     return SQLITE_OK;
5136   }
5137   assert( pCur->apPage[0]->intKey==pCur->curIntKey );
5138   assert( pCur->curIntKey || pIdxKey );
5139   for(;;){
5140     int lwr, upr, idx, c;
5141     Pgno chldPg;
5142     MemPage *pPage = pCur->apPage[pCur->iPage];
5143     u8 *pCell;                          /* Pointer to current cell in pPage */
5144 
5145     /* pPage->nCell must be greater than zero. If this is the root-page
5146     ** the cursor would have been INVALID above and this for(;;) loop
5147     ** not run. If this is not the root-page, then the moveToChild() routine
5148     ** would have already detected db corruption. Similarly, pPage must
5149     ** be the right kind (index or table) of b-tree page. Otherwise
5150     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
5151     assert( pPage->nCell>0 );
5152     assert( pPage->intKey==(pIdxKey==0) );
5153     lwr = 0;
5154     upr = pPage->nCell-1;
5155     assert( biasRight==0 || biasRight==1 );
5156     idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
5157     pCur->aiIdx[pCur->iPage] = (u16)idx;
5158     if( xRecordCompare==0 ){
5159       for(;;){
5160         i64 nCellKey;
5161         pCell = findCellPastPtr(pPage, idx);
5162         if( pPage->intKeyLeaf ){
5163           while( 0x80 <= *(pCell++) ){
5164             if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
5165           }
5166         }
5167         getVarint(pCell, (u64*)&nCellKey);
5168         if( nCellKey<intKey ){
5169           lwr = idx+1;
5170           if( lwr>upr ){ c = -1; break; }
5171         }else if( nCellKey>intKey ){
5172           upr = idx-1;
5173           if( lwr>upr ){ c = +1; break; }
5174         }else{
5175           assert( nCellKey==intKey );
5176           pCur->aiIdx[pCur->iPage] = (u16)idx;
5177           if( !pPage->leaf ){
5178             lwr = idx;
5179             goto moveto_next_layer;
5180           }else{
5181             pCur->curFlags |= BTCF_ValidNKey;
5182             pCur->info.nKey = nCellKey;
5183             pCur->info.nSize = 0;
5184             *pRes = 0;
5185             return SQLITE_OK;
5186           }
5187         }
5188         assert( lwr+upr>=0 );
5189         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
5190       }
5191     }else{
5192       for(;;){
5193         int nCell;  /* Size of the pCell cell in bytes */
5194         pCell = findCellPastPtr(pPage, idx);
5195 
5196         /* The maximum supported page-size is 65536 bytes. This means that
5197         ** the maximum number of record bytes stored on an index B-Tree
5198         ** page is less than 16384 bytes and may be stored as a 2-byte
5199         ** varint. This information is used to attempt to avoid parsing
5200         ** the entire cell by checking for the cases where the record is
5201         ** stored entirely within the b-tree page by inspecting the first
5202         ** 2 bytes of the cell.
5203         */
5204         nCell = pCell[0];
5205         if( nCell<=pPage->max1bytePayload ){
5206           /* This branch runs if the record-size field of the cell is a
5207           ** single byte varint and the record fits entirely on the main
5208           ** b-tree page.  */
5209           testcase( pCell+nCell+1==pPage->aDataEnd );
5210           c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
5211         }else if( !(pCell[1] & 0x80)
5212           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
5213         ){
5214           /* The record-size field is a 2 byte varint and the record
5215           ** fits entirely on the main b-tree page.  */
5216           testcase( pCell+nCell+2==pPage->aDataEnd );
5217           c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
5218         }else{
5219           /* The record flows over onto one or more overflow pages. In
5220           ** this case the whole cell needs to be parsed, a buffer allocated
5221           ** and accessPayload() used to retrieve the record into the
5222           ** buffer before VdbeRecordCompare() can be called.
5223           **
5224           ** If the record is corrupt, the xRecordCompare routine may read
5225           ** up to two varints past the end of the buffer. An extra 18
5226           ** bytes of padding is allocated at the end of the buffer in
5227           ** case this happens.  */
5228           void *pCellKey;
5229           u8 * const pCellBody = pCell - pPage->childPtrSize;
5230           pPage->xParseCell(pPage, pCellBody, &pCur->info);
5231           nCell = (int)pCur->info.nKey;
5232           testcase( nCell<0 );   /* True if key size is 2^32 or more */
5233           testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
5234           testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
5235           testcase( nCell==2 );  /* Minimum legal index key size */
5236           if( nCell<2 ){
5237             rc = SQLITE_CORRUPT_BKPT;
5238             goto moveto_finish;
5239           }
5240           pCellKey = sqlite3Malloc( nCell+18 );
5241           if( pCellKey==0 ){
5242             rc = SQLITE_NOMEM_BKPT;
5243             goto moveto_finish;
5244           }
5245           pCur->aiIdx[pCur->iPage] = (u16)idx;
5246           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
5247           pCur->curFlags &= ~BTCF_ValidOvfl;
5248           if( rc ){
5249             sqlite3_free(pCellKey);
5250             goto moveto_finish;
5251           }
5252           c = xRecordCompare(nCell, pCellKey, pIdxKey);
5253           sqlite3_free(pCellKey);
5254         }
5255         assert(
5256             (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
5257          && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
5258         );
5259         if( c<0 ){
5260           lwr = idx+1;
5261         }else if( c>0 ){
5262           upr = idx-1;
5263         }else{
5264           assert( c==0 );
5265           *pRes = 0;
5266           rc = SQLITE_OK;
5267           pCur->aiIdx[pCur->iPage] = (u16)idx;
5268           if( pIdxKey->errCode ) rc = SQLITE_CORRUPT;
5269           goto moveto_finish;
5270         }
5271         if( lwr>upr ) break;
5272         assert( lwr+upr>=0 );
5273         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
5274       }
5275     }
5276     assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
5277     assert( pPage->isInit );
5278     if( pPage->leaf ){
5279       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
5280       pCur->aiIdx[pCur->iPage] = (u16)idx;
5281       *pRes = c;
5282       rc = SQLITE_OK;
5283       goto moveto_finish;
5284     }
5285 moveto_next_layer:
5286     if( lwr>=pPage->nCell ){
5287       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5288     }else{
5289       chldPg = get4byte(findCell(pPage, lwr));
5290     }
5291     pCur->aiIdx[pCur->iPage] = (u16)lwr;
5292     rc = moveToChild(pCur, chldPg);
5293     if( rc ) break;
5294   }
5295 moveto_finish:
5296   pCur->info.nSize = 0;
5297   assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5298   return rc;
5299 }
5300 
5301 
5302 /*
5303 ** Return TRUE if the cursor is not pointing at an entry of the table.
5304 **
5305 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
5306 ** past the last entry in the table or sqlite3BtreePrev() moves past
5307 ** the first entry.  TRUE is also returned if the table is empty.
5308 */
5309 int sqlite3BtreeEof(BtCursor *pCur){
5310   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
5311   ** have been deleted? This API will need to change to return an error code
5312   ** as well as the boolean result value.
5313   */
5314   return (CURSOR_VALID!=pCur->eState);
5315 }
5316 
5317 /*
5318 ** Advance the cursor to the next entry in the database.  If
5319 ** successful then set *pRes=0.  If the cursor
5320 ** was already pointing to the last entry in the database before
5321 ** this routine was called, then set *pRes=1.
5322 **
5323 ** The main entry point is sqlite3BtreeNext().  That routine is optimized
5324 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
5325 ** to the next cell on the current page.  The (slower) btreeNext() helper
5326 ** routine is called when it is necessary to move to a different page or
5327 ** to restore the cursor.
5328 **
5329 ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
5330 ** will be 1 if the cursor being stepped corresponds to an SQL index and
5331 ** if this routine could have been skipped if that SQL index had been
5332 ** a unique index.  Otherwise the caller will have set *pRes to zero.
5333 ** Zero is the common case. The btree implementation is free to use the
5334 ** initial *pRes value as a hint to improve performance, but the current
5335 ** SQLite btree implementation does not. (Note that the comdb2 btree
5336 ** implementation does use this hint, however.)
5337 */
5338 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur, int *pRes){
5339   int rc;
5340   int idx;
5341   MemPage *pPage;
5342 
5343   assert( cursorOwnsBtShared(pCur) );
5344   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5345   assert( *pRes==0 );
5346   if( pCur->eState!=CURSOR_VALID ){
5347     assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5348     rc = restoreCursorPosition(pCur);
5349     if( rc!=SQLITE_OK ){
5350       return rc;
5351     }
5352     if( CURSOR_INVALID==pCur->eState ){
5353       *pRes = 1;
5354       return SQLITE_OK;
5355     }
5356     if( pCur->skipNext ){
5357       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
5358       pCur->eState = CURSOR_VALID;
5359       if( pCur->skipNext>0 ){
5360         pCur->skipNext = 0;
5361         return SQLITE_OK;
5362       }
5363       pCur->skipNext = 0;
5364     }
5365   }
5366 
5367   pPage = pCur->apPage[pCur->iPage];
5368   idx = ++pCur->aiIdx[pCur->iPage];
5369   assert( pPage->isInit );
5370 
5371   /* If the database file is corrupt, it is possible for the value of idx
5372   ** to be invalid here. This can only occur if a second cursor modifies
5373   ** the page while cursor pCur is holding a reference to it. Which can
5374   ** only happen if the database is corrupt in such a way as to link the
5375   ** page into more than one b-tree structure. */
5376   testcase( idx>pPage->nCell );
5377 
5378   if( idx>=pPage->nCell ){
5379     if( !pPage->leaf ){
5380       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
5381       if( rc ) return rc;
5382       return moveToLeftmost(pCur);
5383     }
5384     do{
5385       if( pCur->iPage==0 ){
5386         *pRes = 1;
5387         pCur->eState = CURSOR_INVALID;
5388         return SQLITE_OK;
5389       }
5390       moveToParent(pCur);
5391       pPage = pCur->apPage[pCur->iPage];
5392     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
5393     if( pPage->intKey ){
5394       return sqlite3BtreeNext(pCur, pRes);
5395     }else{
5396       return SQLITE_OK;
5397     }
5398   }
5399   if( pPage->leaf ){
5400     return SQLITE_OK;
5401   }else{
5402     return moveToLeftmost(pCur);
5403   }
5404 }
5405 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
5406   MemPage *pPage;
5407   assert( cursorOwnsBtShared(pCur) );
5408   assert( pRes!=0 );
5409   assert( *pRes==0 || *pRes==1 );
5410   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5411   pCur->info.nSize = 0;
5412   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5413   *pRes = 0;
5414   if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);
5415   pPage = pCur->apPage[pCur->iPage];
5416   if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){
5417     pCur->aiIdx[pCur->iPage]--;
5418     return btreeNext(pCur, pRes);
5419   }
5420   if( pPage->leaf ){
5421     return SQLITE_OK;
5422   }else{
5423     return moveToLeftmost(pCur);
5424   }
5425 }
5426 
5427 /*
5428 ** Step the cursor back to the previous entry in the database.  If
5429 ** successful then set *pRes=0.  If the cursor
5430 ** was already pointing to the first entry in the database before
5431 ** this routine was called, then set *pRes=1.
5432 **
5433 ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
5434 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
5435 ** to the previous cell on the current page.  The (slower) btreePrevious()
5436 ** helper routine is called when it is necessary to move to a different page
5437 ** or to restore the cursor.
5438 **
5439 ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
5440 ** will be 1 if the cursor being stepped corresponds to an SQL index and
5441 ** if this routine could have been skipped if that SQL index had been
5442 ** a unique index.  Otherwise the caller will have set *pRes to zero.
5443 ** Zero is the common case. The btree implementation is free to use the
5444 ** initial *pRes value as a hint to improve performance, but the current
5445 ** SQLite btree implementation does not. (Note that the comdb2 btree
5446 ** implementation does use this hint, however.)
5447 */
5448 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur, int *pRes){
5449   int rc;
5450   MemPage *pPage;
5451 
5452   assert( cursorOwnsBtShared(pCur) );
5453   assert( pRes!=0 );
5454   assert( *pRes==0 );
5455   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5456   assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
5457   assert( pCur->info.nSize==0 );
5458   if( pCur->eState!=CURSOR_VALID ){
5459     rc = restoreCursorPosition(pCur);
5460     if( rc!=SQLITE_OK ){
5461       return rc;
5462     }
5463     if( CURSOR_INVALID==pCur->eState ){
5464       *pRes = 1;
5465       return SQLITE_OK;
5466     }
5467     if( pCur->skipNext ){
5468       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
5469       pCur->eState = CURSOR_VALID;
5470       if( pCur->skipNext<0 ){
5471         pCur->skipNext = 0;
5472         return SQLITE_OK;
5473       }
5474       pCur->skipNext = 0;
5475     }
5476   }
5477 
5478   pPage = pCur->apPage[pCur->iPage];
5479   assert( pPage->isInit );
5480   if( !pPage->leaf ){
5481     int idx = pCur->aiIdx[pCur->iPage];
5482     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
5483     if( rc ) return rc;
5484     rc = moveToRightmost(pCur);
5485   }else{
5486     while( pCur->aiIdx[pCur->iPage]==0 ){
5487       if( pCur->iPage==0 ){
5488         pCur->eState = CURSOR_INVALID;
5489         *pRes = 1;
5490         return SQLITE_OK;
5491       }
5492       moveToParent(pCur);
5493     }
5494     assert( pCur->info.nSize==0 );
5495     assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
5496 
5497     pCur->aiIdx[pCur->iPage]--;
5498     pPage = pCur->apPage[pCur->iPage];
5499     if( pPage->intKey && !pPage->leaf ){
5500       rc = sqlite3BtreePrevious(pCur, pRes);
5501     }else{
5502       rc = SQLITE_OK;
5503     }
5504   }
5505   return rc;
5506 }
5507 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
5508   assert( cursorOwnsBtShared(pCur) );
5509   assert( pRes!=0 );
5510   assert( *pRes==0 || *pRes==1 );
5511   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5512   *pRes = 0;
5513   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
5514   pCur->info.nSize = 0;
5515   if( pCur->eState!=CURSOR_VALID
5516    || pCur->aiIdx[pCur->iPage]==0
5517    || pCur->apPage[pCur->iPage]->leaf==0
5518   ){
5519     return btreePrevious(pCur, pRes);
5520   }
5521   pCur->aiIdx[pCur->iPage]--;
5522   return SQLITE_OK;
5523 }
5524 
5525 /*
5526 ** Allocate a new page from the database file.
5527 **
5528 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
5529 ** has already been called on the new page.)  The new page has also
5530 ** been referenced and the calling routine is responsible for calling
5531 ** sqlite3PagerUnref() on the new page when it is done.
5532 **
5533 ** SQLITE_OK is returned on success.  Any other return value indicates
5534 ** an error.  *ppPage is set to NULL in the event of an error.
5535 **
5536 ** If the "nearby" parameter is not 0, then an effort is made to
5537 ** locate a page close to the page number "nearby".  This can be used in an
5538 ** attempt to keep related pages close to each other in the database file,
5539 ** which in turn can make database access faster.
5540 **
5541 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
5542 ** anywhere on the free-list, then it is guaranteed to be returned.  If
5543 ** eMode is BTALLOC_LE then the page returned will be less than or equal
5544 ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
5545 ** are no restrictions on which page is returned.
5546 */
5547 static int allocateBtreePage(
5548   BtShared *pBt,         /* The btree */
5549   MemPage **ppPage,      /* Store pointer to the allocated page here */
5550   Pgno *pPgno,           /* Store the page number here */
5551   Pgno nearby,           /* Search for a page near this one */
5552   u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
5553 ){
5554   MemPage *pPage1;
5555   int rc;
5556   u32 n;     /* Number of pages on the freelist */
5557   u32 k;     /* Number of leaves on the trunk of the freelist */
5558   MemPage *pTrunk = 0;
5559   MemPage *pPrevTrunk = 0;
5560   Pgno mxPage;     /* Total size of the database file */
5561 
5562   assert( sqlite3_mutex_held(pBt->mutex) );
5563   assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
5564   pPage1 = pBt->pPage1;
5565   mxPage = btreePagecount(pBt);
5566   /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
5567   ** stores stores the total number of pages on the freelist. */
5568   n = get4byte(&pPage1->aData[36]);
5569   testcase( n==mxPage-1 );
5570   if( n>=mxPage ){
5571     return SQLITE_CORRUPT_BKPT;
5572   }
5573   if( n>0 ){
5574     /* There are pages on the freelist.  Reuse one of those pages. */
5575     Pgno iTrunk;
5576     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
5577     u32 nSearch = 0;   /* Count of the number of search attempts */
5578 
5579     /* If eMode==BTALLOC_EXACT and a query of the pointer-map
5580     ** shows that the page 'nearby' is somewhere on the free-list, then
5581     ** the entire-list will be searched for that page.
5582     */
5583 #ifndef SQLITE_OMIT_AUTOVACUUM
5584     if( eMode==BTALLOC_EXACT ){
5585       if( nearby<=mxPage ){
5586         u8 eType;
5587         assert( nearby>0 );
5588         assert( pBt->autoVacuum );
5589         rc = ptrmapGet(pBt, nearby, &eType, 0);
5590         if( rc ) return rc;
5591         if( eType==PTRMAP_FREEPAGE ){
5592           searchList = 1;
5593         }
5594       }
5595     }else if( eMode==BTALLOC_LE ){
5596       searchList = 1;
5597     }
5598 #endif
5599 
5600     /* Decrement the free-list count by 1. Set iTrunk to the index of the
5601     ** first free-list trunk page. iPrevTrunk is initially 1.
5602     */
5603     rc = sqlite3PagerWrite(pPage1->pDbPage);
5604     if( rc ) return rc;
5605     put4byte(&pPage1->aData[36], n-1);
5606 
5607     /* The code within this loop is run only once if the 'searchList' variable
5608     ** is not true. Otherwise, it runs once for each trunk-page on the
5609     ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
5610     ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
5611     */
5612     do {
5613       pPrevTrunk = pTrunk;
5614       if( pPrevTrunk ){
5615         /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
5616         ** is the page number of the next freelist trunk page in the list or
5617         ** zero if this is the last freelist trunk page. */
5618         iTrunk = get4byte(&pPrevTrunk->aData[0]);
5619       }else{
5620         /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
5621         ** stores the page number of the first page of the freelist, or zero if
5622         ** the freelist is empty. */
5623         iTrunk = get4byte(&pPage1->aData[32]);
5624       }
5625       testcase( iTrunk==mxPage );
5626       if( iTrunk>mxPage || nSearch++ > n ){
5627         rc = SQLITE_CORRUPT_BKPT;
5628       }else{
5629         rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
5630       }
5631       if( rc ){
5632         pTrunk = 0;
5633         goto end_allocate_page;
5634       }
5635       assert( pTrunk!=0 );
5636       assert( pTrunk->aData!=0 );
5637       /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
5638       ** is the number of leaf page pointers to follow. */
5639       k = get4byte(&pTrunk->aData[4]);
5640       if( k==0 && !searchList ){
5641         /* The trunk has no leaves and the list is not being searched.
5642         ** So extract the trunk page itself and use it as the newly
5643         ** allocated page */
5644         assert( pPrevTrunk==0 );
5645         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5646         if( rc ){
5647           goto end_allocate_page;
5648         }
5649         *pPgno = iTrunk;
5650         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5651         *ppPage = pTrunk;
5652         pTrunk = 0;
5653         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5654       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
5655         /* Value of k is out of range.  Database corruption */
5656         rc = SQLITE_CORRUPT_BKPT;
5657         goto end_allocate_page;
5658 #ifndef SQLITE_OMIT_AUTOVACUUM
5659       }else if( searchList
5660             && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
5661       ){
5662         /* The list is being searched and this trunk page is the page
5663         ** to allocate, regardless of whether it has leaves.
5664         */
5665         *pPgno = iTrunk;
5666         *ppPage = pTrunk;
5667         searchList = 0;
5668         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5669         if( rc ){
5670           goto end_allocate_page;
5671         }
5672         if( k==0 ){
5673           if( !pPrevTrunk ){
5674             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5675           }else{
5676             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5677             if( rc!=SQLITE_OK ){
5678               goto end_allocate_page;
5679             }
5680             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
5681           }
5682         }else{
5683           /* The trunk page is required by the caller but it contains
5684           ** pointers to free-list leaves. The first leaf becomes a trunk
5685           ** page in this case.
5686           */
5687           MemPage *pNewTrunk;
5688           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
5689           if( iNewTrunk>mxPage ){
5690             rc = SQLITE_CORRUPT_BKPT;
5691             goto end_allocate_page;
5692           }
5693           testcase( iNewTrunk==mxPage );
5694           rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
5695           if( rc!=SQLITE_OK ){
5696             goto end_allocate_page;
5697           }
5698           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
5699           if( rc!=SQLITE_OK ){
5700             releasePage(pNewTrunk);
5701             goto end_allocate_page;
5702           }
5703           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
5704           put4byte(&pNewTrunk->aData[4], k-1);
5705           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
5706           releasePage(pNewTrunk);
5707           if( !pPrevTrunk ){
5708             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
5709             put4byte(&pPage1->aData[32], iNewTrunk);
5710           }else{
5711             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5712             if( rc ){
5713               goto end_allocate_page;
5714             }
5715             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
5716           }
5717         }
5718         pTrunk = 0;
5719         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5720 #endif
5721       }else if( k>0 ){
5722         /* Extract a leaf from the trunk */
5723         u32 closest;
5724         Pgno iPage;
5725         unsigned char *aData = pTrunk->aData;
5726         if( nearby>0 ){
5727           u32 i;
5728           closest = 0;
5729           if( eMode==BTALLOC_LE ){
5730             for(i=0; i<k; i++){
5731               iPage = get4byte(&aData[8+i*4]);
5732               if( iPage<=nearby ){
5733                 closest = i;
5734                 break;
5735               }
5736             }
5737           }else{
5738             int dist;
5739             dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
5740             for(i=1; i<k; i++){
5741               int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
5742               if( d2<dist ){
5743                 closest = i;
5744                 dist = d2;
5745               }
5746             }
5747           }
5748         }else{
5749           closest = 0;
5750         }
5751 
5752         iPage = get4byte(&aData[8+closest*4]);
5753         testcase( iPage==mxPage );
5754         if( iPage>mxPage ){
5755           rc = SQLITE_CORRUPT_BKPT;
5756           goto end_allocate_page;
5757         }
5758         testcase( iPage==mxPage );
5759         if( !searchList
5760          || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
5761         ){
5762           int noContent;
5763           *pPgno = iPage;
5764           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
5765                  ": %d more free pages\n",
5766                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
5767           rc = sqlite3PagerWrite(pTrunk->pDbPage);
5768           if( rc ) goto end_allocate_page;
5769           if( closest<k-1 ){
5770             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
5771           }
5772           put4byte(&aData[4], k-1);
5773           noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
5774           rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
5775           if( rc==SQLITE_OK ){
5776             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5777             if( rc!=SQLITE_OK ){
5778               releasePage(*ppPage);
5779               *ppPage = 0;
5780             }
5781           }
5782           searchList = 0;
5783         }
5784       }
5785       releasePage(pPrevTrunk);
5786       pPrevTrunk = 0;
5787     }while( searchList );
5788   }else{
5789     /* There are no pages on the freelist, so append a new page to the
5790     ** database image.
5791     **
5792     ** Normally, new pages allocated by this block can be requested from the
5793     ** pager layer with the 'no-content' flag set. This prevents the pager
5794     ** from trying to read the pages content from disk. However, if the
5795     ** current transaction has already run one or more incremental-vacuum
5796     ** steps, then the page we are about to allocate may contain content
5797     ** that is required in the event of a rollback. In this case, do
5798     ** not set the no-content flag. This causes the pager to load and journal
5799     ** the current page content before overwriting it.
5800     **
5801     ** Note that the pager will not actually attempt to load or journal
5802     ** content for any page that really does lie past the end of the database
5803     ** file on disk. So the effects of disabling the no-content optimization
5804     ** here are confined to those pages that lie between the end of the
5805     ** database image and the end of the database file.
5806     */
5807     int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
5808 
5809     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
5810     if( rc ) return rc;
5811     pBt->nPage++;
5812     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
5813 
5814 #ifndef SQLITE_OMIT_AUTOVACUUM
5815     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
5816       /* If *pPgno refers to a pointer-map page, allocate two new pages
5817       ** at the end of the file instead of one. The first allocated page
5818       ** becomes a new pointer-map page, the second is used by the caller.
5819       */
5820       MemPage *pPg = 0;
5821       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
5822       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
5823       rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
5824       if( rc==SQLITE_OK ){
5825         rc = sqlite3PagerWrite(pPg->pDbPage);
5826         releasePage(pPg);
5827       }
5828       if( rc ) return rc;
5829       pBt->nPage++;
5830       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
5831     }
5832 #endif
5833     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
5834     *pPgno = pBt->nPage;
5835 
5836     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5837     rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
5838     if( rc ) return rc;
5839     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5840     if( rc!=SQLITE_OK ){
5841       releasePage(*ppPage);
5842       *ppPage = 0;
5843     }
5844     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
5845   }
5846 
5847   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5848 
5849 end_allocate_page:
5850   releasePage(pTrunk);
5851   releasePage(pPrevTrunk);
5852   assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
5853   assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
5854   return rc;
5855 }
5856 
5857 /*
5858 ** This function is used to add page iPage to the database file free-list.
5859 ** It is assumed that the page is not already a part of the free-list.
5860 **
5861 ** The value passed as the second argument to this function is optional.
5862 ** If the caller happens to have a pointer to the MemPage object
5863 ** corresponding to page iPage handy, it may pass it as the second value.
5864 ** Otherwise, it may pass NULL.
5865 **
5866 ** If a pointer to a MemPage object is passed as the second argument,
5867 ** its reference count is not altered by this function.
5868 */
5869 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
5870   MemPage *pTrunk = 0;                /* Free-list trunk page */
5871   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
5872   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
5873   MemPage *pPage;                     /* Page being freed. May be NULL. */
5874   int rc;                             /* Return Code */
5875   int nFree;                          /* Initial number of pages on free-list */
5876 
5877   assert( sqlite3_mutex_held(pBt->mutex) );
5878   assert( CORRUPT_DB || iPage>1 );
5879   assert( !pMemPage || pMemPage->pgno==iPage );
5880 
5881   if( iPage<2 ) return SQLITE_CORRUPT_BKPT;
5882   if( pMemPage ){
5883     pPage = pMemPage;
5884     sqlite3PagerRef(pPage->pDbPage);
5885   }else{
5886     pPage = btreePageLookup(pBt, iPage);
5887   }
5888 
5889   /* Increment the free page count on pPage1 */
5890   rc = sqlite3PagerWrite(pPage1->pDbPage);
5891   if( rc ) goto freepage_out;
5892   nFree = get4byte(&pPage1->aData[36]);
5893   put4byte(&pPage1->aData[36], nFree+1);
5894 
5895   if( pBt->btsFlags & BTS_SECURE_DELETE ){
5896     /* If the secure_delete option is enabled, then
5897     ** always fully overwrite deleted information with zeros.
5898     */
5899     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
5900      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
5901     ){
5902       goto freepage_out;
5903     }
5904     memset(pPage->aData, 0, pPage->pBt->pageSize);
5905   }
5906 
5907   /* If the database supports auto-vacuum, write an entry in the pointer-map
5908   ** to indicate that the page is free.
5909   */
5910   if( ISAUTOVACUUM ){
5911     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
5912     if( rc ) goto freepage_out;
5913   }
5914 
5915   /* Now manipulate the actual database free-list structure. There are two
5916   ** possibilities. If the free-list is currently empty, or if the first
5917   ** trunk page in the free-list is full, then this page will become a
5918   ** new free-list trunk page. Otherwise, it will become a leaf of the
5919   ** first trunk page in the current free-list. This block tests if it
5920   ** is possible to add the page as a new free-list leaf.
5921   */
5922   if( nFree!=0 ){
5923     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
5924 
5925     iTrunk = get4byte(&pPage1->aData[32]);
5926     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
5927     if( rc!=SQLITE_OK ){
5928       goto freepage_out;
5929     }
5930 
5931     nLeaf = get4byte(&pTrunk->aData[4]);
5932     assert( pBt->usableSize>32 );
5933     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
5934       rc = SQLITE_CORRUPT_BKPT;
5935       goto freepage_out;
5936     }
5937     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
5938       /* In this case there is room on the trunk page to insert the page
5939       ** being freed as a new leaf.
5940       **
5941       ** Note that the trunk page is not really full until it contains
5942       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
5943       ** coded.  But due to a coding error in versions of SQLite prior to
5944       ** 3.6.0, databases with freelist trunk pages holding more than
5945       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
5946       ** to maintain backwards compatibility with older versions of SQLite,
5947       ** we will continue to restrict the number of entries to usableSize/4 - 8
5948       ** for now.  At some point in the future (once everyone has upgraded
5949       ** to 3.6.0 or later) we should consider fixing the conditional above
5950       ** to read "usableSize/4-2" instead of "usableSize/4-8".
5951       **
5952       ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
5953       ** avoid using the last six entries in the freelist trunk page array in
5954       ** order that database files created by newer versions of SQLite can be
5955       ** read by older versions of SQLite.
5956       */
5957       rc = sqlite3PagerWrite(pTrunk->pDbPage);
5958       if( rc==SQLITE_OK ){
5959         put4byte(&pTrunk->aData[4], nLeaf+1);
5960         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
5961         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
5962           sqlite3PagerDontWrite(pPage->pDbPage);
5963         }
5964         rc = btreeSetHasContent(pBt, iPage);
5965       }
5966       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
5967       goto freepage_out;
5968     }
5969   }
5970 
5971   /* If control flows to this point, then it was not possible to add the
5972   ** the page being freed as a leaf page of the first trunk in the free-list.
5973   ** Possibly because the free-list is empty, or possibly because the
5974   ** first trunk in the free-list is full. Either way, the page being freed
5975   ** will become the new first trunk page in the free-list.
5976   */
5977   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
5978     goto freepage_out;
5979   }
5980   rc = sqlite3PagerWrite(pPage->pDbPage);
5981   if( rc!=SQLITE_OK ){
5982     goto freepage_out;
5983   }
5984   put4byte(pPage->aData, iTrunk);
5985   put4byte(&pPage->aData[4], 0);
5986   put4byte(&pPage1->aData[32], iPage);
5987   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
5988 
5989 freepage_out:
5990   if( pPage ){
5991     pPage->isInit = 0;
5992   }
5993   releasePage(pPage);
5994   releasePage(pTrunk);
5995   return rc;
5996 }
5997 static void freePage(MemPage *pPage, int *pRC){
5998   if( (*pRC)==SQLITE_OK ){
5999     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
6000   }
6001 }
6002 
6003 /*
6004 ** Free any overflow pages associated with the given Cell.  Write the
6005 ** local Cell size (the number of bytes on the original page, omitting
6006 ** overflow) into *pnSize.
6007 */
6008 static int clearCell(
6009   MemPage *pPage,          /* The page that contains the Cell */
6010   unsigned char *pCell,    /* First byte of the Cell */
6011   CellInfo *pInfo          /* Size information about the cell */
6012 ){
6013   BtShared *pBt = pPage->pBt;
6014   Pgno ovflPgno;
6015   int rc;
6016   int nOvfl;
6017   u32 ovflPageSize;
6018 
6019   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6020   pPage->xParseCell(pPage, pCell, pInfo);
6021   if( pInfo->nLocal==pInfo->nPayload ){
6022     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
6023   }
6024   if( pCell+pInfo->nSize-1 > pPage->aData+pPage->maskPage ){
6025     return SQLITE_CORRUPT_BKPT;  /* Cell extends past end of page */
6026   }
6027   ovflPgno = get4byte(pCell + pInfo->nSize - 4);
6028   assert( pBt->usableSize > 4 );
6029   ovflPageSize = pBt->usableSize - 4;
6030   nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
6031   assert( nOvfl>0 ||
6032     (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
6033   );
6034   while( nOvfl-- ){
6035     Pgno iNext = 0;
6036     MemPage *pOvfl = 0;
6037     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
6038       /* 0 is not a legal page number and page 1 cannot be an
6039       ** overflow page. Therefore if ovflPgno<2 or past the end of the
6040       ** file the database must be corrupt. */
6041       return SQLITE_CORRUPT_BKPT;
6042     }
6043     if( nOvfl ){
6044       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
6045       if( rc ) return rc;
6046     }
6047 
6048     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
6049      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
6050     ){
6051       /* There is no reason any cursor should have an outstanding reference
6052       ** to an overflow page belonging to a cell that is being deleted/updated.
6053       ** So if there exists more than one reference to this page, then it
6054       ** must not really be an overflow page and the database must be corrupt.
6055       ** It is helpful to detect this before calling freePage2(), as
6056       ** freePage2() may zero the page contents if secure-delete mode is
6057       ** enabled. If this 'overflow' page happens to be a page that the
6058       ** caller is iterating through or using in some other way, this
6059       ** can be problematic.
6060       */
6061       rc = SQLITE_CORRUPT_BKPT;
6062     }else{
6063       rc = freePage2(pBt, pOvfl, ovflPgno);
6064     }
6065 
6066     if( pOvfl ){
6067       sqlite3PagerUnref(pOvfl->pDbPage);
6068     }
6069     if( rc ) return rc;
6070     ovflPgno = iNext;
6071   }
6072   return SQLITE_OK;
6073 }
6074 
6075 /*
6076 ** Create the byte sequence used to represent a cell on page pPage
6077 ** and write that byte sequence into pCell[].  Overflow pages are
6078 ** allocated and filled in as necessary.  The calling procedure
6079 ** is responsible for making sure sufficient space has been allocated
6080 ** for pCell[].
6081 **
6082 ** Note that pCell does not necessary need to point to the pPage->aData
6083 ** area.  pCell might point to some temporary storage.  The cell will
6084 ** be constructed in this temporary area then copied into pPage->aData
6085 ** later.
6086 */
6087 static int fillInCell(
6088   MemPage *pPage,                /* The page that contains the cell */
6089   unsigned char *pCell,          /* Complete text of the cell */
6090   const BtreePayload *pX,        /* Payload with which to construct the cell */
6091   int *pnSize                    /* Write cell size here */
6092 ){
6093   int nPayload;
6094   const u8 *pSrc;
6095   int nSrc, n, rc;
6096   int spaceLeft;
6097   MemPage *pOvfl = 0;
6098   MemPage *pToRelease = 0;
6099   unsigned char *pPrior;
6100   unsigned char *pPayload;
6101   BtShared *pBt = pPage->pBt;
6102   Pgno pgnoOvfl = 0;
6103   int nHeader;
6104 
6105   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6106 
6107   /* pPage is not necessarily writeable since pCell might be auxiliary
6108   ** buffer space that is separate from the pPage buffer area */
6109   assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
6110             || sqlite3PagerIswriteable(pPage->pDbPage) );
6111 
6112   /* Fill in the header. */
6113   nHeader = pPage->childPtrSize;
6114   if( pPage->intKey ){
6115     nPayload = pX->nData + pX->nZero;
6116     pSrc = pX->pData;
6117     nSrc = pX->nData;
6118     assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
6119     nHeader += putVarint32(&pCell[nHeader], nPayload);
6120     nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
6121   }else{
6122     assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
6123     nSrc = nPayload = (int)pX->nKey;
6124     pSrc = pX->pKey;
6125     nHeader += putVarint32(&pCell[nHeader], nPayload);
6126   }
6127 
6128   /* Fill in the payload */
6129   if( nPayload<=pPage->maxLocal ){
6130     n = nHeader + nPayload;
6131     testcase( n==3 );
6132     testcase( n==4 );
6133     if( n<4 ) n = 4;
6134     *pnSize = n;
6135     spaceLeft = nPayload;
6136     pPrior = pCell;
6137   }else{
6138     int mn = pPage->minLocal;
6139     n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
6140     testcase( n==pPage->maxLocal );
6141     testcase( n==pPage->maxLocal+1 );
6142     if( n > pPage->maxLocal ) n = mn;
6143     spaceLeft = n;
6144     *pnSize = n + nHeader + 4;
6145     pPrior = &pCell[nHeader+n];
6146   }
6147   pPayload = &pCell[nHeader];
6148 
6149   /* At this point variables should be set as follows:
6150   **
6151   **   nPayload           Total payload size in bytes
6152   **   pPayload           Begin writing payload here
6153   **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
6154   **                      that means content must spill into overflow pages.
6155   **   *pnSize            Size of the local cell (not counting overflow pages)
6156   **   pPrior             Where to write the pgno of the first overflow page
6157   **
6158   ** Use a call to btreeParseCellPtr() to verify that the values above
6159   ** were computed correctly.
6160   */
6161 #if SQLITE_DEBUG
6162   {
6163     CellInfo info;
6164     pPage->xParseCell(pPage, pCell, &info);
6165     assert( nHeader==(int)(info.pPayload - pCell) );
6166     assert( info.nKey==pX->nKey );
6167     assert( *pnSize == info.nSize );
6168     assert( spaceLeft == info.nLocal );
6169   }
6170 #endif
6171 
6172   /* Write the payload into the local Cell and any extra into overflow pages */
6173   while( nPayload>0 ){
6174     if( spaceLeft==0 ){
6175 #ifndef SQLITE_OMIT_AUTOVACUUM
6176       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
6177       if( pBt->autoVacuum ){
6178         do{
6179           pgnoOvfl++;
6180         } while(
6181           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
6182         );
6183       }
6184 #endif
6185       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
6186 #ifndef SQLITE_OMIT_AUTOVACUUM
6187       /* If the database supports auto-vacuum, and the second or subsequent
6188       ** overflow page is being allocated, add an entry to the pointer-map
6189       ** for that page now.
6190       **
6191       ** If this is the first overflow page, then write a partial entry
6192       ** to the pointer-map. If we write nothing to this pointer-map slot,
6193       ** then the optimistic overflow chain processing in clearCell()
6194       ** may misinterpret the uninitialized values and delete the
6195       ** wrong pages from the database.
6196       */
6197       if( pBt->autoVacuum && rc==SQLITE_OK ){
6198         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
6199         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
6200         if( rc ){
6201           releasePage(pOvfl);
6202         }
6203       }
6204 #endif
6205       if( rc ){
6206         releasePage(pToRelease);
6207         return rc;
6208       }
6209 
6210       /* If pToRelease is not zero than pPrior points into the data area
6211       ** of pToRelease.  Make sure pToRelease is still writeable. */
6212       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6213 
6214       /* If pPrior is part of the data area of pPage, then make sure pPage
6215       ** is still writeable */
6216       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
6217             || sqlite3PagerIswriteable(pPage->pDbPage) );
6218 
6219       put4byte(pPrior, pgnoOvfl);
6220       releasePage(pToRelease);
6221       pToRelease = pOvfl;
6222       pPrior = pOvfl->aData;
6223       put4byte(pPrior, 0);
6224       pPayload = &pOvfl->aData[4];
6225       spaceLeft = pBt->usableSize - 4;
6226     }
6227     n = nPayload;
6228     if( n>spaceLeft ) n = spaceLeft;
6229 
6230     /* If pToRelease is not zero than pPayload points into the data area
6231     ** of pToRelease.  Make sure pToRelease is still writeable. */
6232     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6233 
6234     /* If pPayload is part of the data area of pPage, then make sure pPage
6235     ** is still writeable */
6236     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
6237             || sqlite3PagerIswriteable(pPage->pDbPage) );
6238 
6239     if( nSrc>0 ){
6240       if( n>nSrc ) n = nSrc;
6241       assert( pSrc );
6242       memcpy(pPayload, pSrc, n);
6243     }else{
6244       memset(pPayload, 0, n);
6245     }
6246     nPayload -= n;
6247     pPayload += n;
6248     pSrc += n;
6249     nSrc -= n;
6250     spaceLeft -= n;
6251   }
6252   releasePage(pToRelease);
6253   return SQLITE_OK;
6254 }
6255 
6256 /*
6257 ** Remove the i-th cell from pPage.  This routine effects pPage only.
6258 ** The cell content is not freed or deallocated.  It is assumed that
6259 ** the cell content has been copied someplace else.  This routine just
6260 ** removes the reference to the cell from pPage.
6261 **
6262 ** "sz" must be the number of bytes in the cell.
6263 */
6264 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
6265   u32 pc;         /* Offset to cell content of cell being deleted */
6266   u8 *data;       /* pPage->aData */
6267   u8 *ptr;        /* Used to move bytes around within data[] */
6268   int rc;         /* The return code */
6269   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
6270 
6271   if( *pRC ) return;
6272   assert( idx>=0 && idx<pPage->nCell );
6273   assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
6274   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6275   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6276   data = pPage->aData;
6277   ptr = &pPage->aCellIdx[2*idx];
6278   pc = get2byte(ptr);
6279   hdr = pPage->hdrOffset;
6280   testcase( pc==get2byte(&data[hdr+5]) );
6281   testcase( pc+sz==pPage->pBt->usableSize );
6282   if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
6283     *pRC = SQLITE_CORRUPT_BKPT;
6284     return;
6285   }
6286   rc = freeSpace(pPage, pc, sz);
6287   if( rc ){
6288     *pRC = rc;
6289     return;
6290   }
6291   pPage->nCell--;
6292   if( pPage->nCell==0 ){
6293     memset(&data[hdr+1], 0, 4);
6294     data[hdr+7] = 0;
6295     put2byte(&data[hdr+5], pPage->pBt->usableSize);
6296     pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
6297                        - pPage->childPtrSize - 8;
6298   }else{
6299     memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
6300     put2byte(&data[hdr+3], pPage->nCell);
6301     pPage->nFree += 2;
6302   }
6303 }
6304 
6305 /*
6306 ** Insert a new cell on pPage at cell index "i".  pCell points to the
6307 ** content of the cell.
6308 **
6309 ** If the cell content will fit on the page, then put it there.  If it
6310 ** will not fit, then make a copy of the cell content into pTemp if
6311 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
6312 ** in pPage->apOvfl[] and make it point to the cell content (either
6313 ** in pTemp or the original pCell) and also record its index.
6314 ** Allocating a new entry in pPage->aCell[] implies that
6315 ** pPage->nOverflow is incremented.
6316 **
6317 ** *pRC must be SQLITE_OK when this routine is called.
6318 */
6319 static void insertCell(
6320   MemPage *pPage,   /* Page into which we are copying */
6321   int i,            /* New cell becomes the i-th cell of the page */
6322   u8 *pCell,        /* Content of the new cell */
6323   int sz,           /* Bytes of content in pCell */
6324   u8 *pTemp,        /* Temp storage space for pCell, if needed */
6325   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
6326   int *pRC          /* Read and write return code from here */
6327 ){
6328   int idx = 0;      /* Where to write new cell content in data[] */
6329   int j;            /* Loop counter */
6330   u8 *data;         /* The content of the whole page */
6331   u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
6332 
6333   assert( *pRC==SQLITE_OK );
6334   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
6335   assert( MX_CELL(pPage->pBt)<=10921 );
6336   assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
6337   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
6338   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
6339   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6340   /* The cell should normally be sized correctly.  However, when moving a
6341   ** malformed cell from a leaf page to an interior page, if the cell size
6342   ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
6343   ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
6344   ** the term after the || in the following assert(). */
6345   assert( sz==pPage->xCellSize(pPage, pCell) || (sz==8 && iChild>0) );
6346   if( pPage->nOverflow || sz+2>pPage->nFree ){
6347     if( pTemp ){
6348       memcpy(pTemp, pCell, sz);
6349       pCell = pTemp;
6350     }
6351     if( iChild ){
6352       put4byte(pCell, iChild);
6353     }
6354     j = pPage->nOverflow++;
6355     /* Comparison against ArraySize-1 since we hold back one extra slot
6356     ** as a contingency.  In other words, never need more than 3 overflow
6357     ** slots but 4 are allocated, just to be safe. */
6358     assert( j < ArraySize(pPage->apOvfl)-1 );
6359     pPage->apOvfl[j] = pCell;
6360     pPage->aiOvfl[j] = (u16)i;
6361 
6362     /* When multiple overflows occur, they are always sequential and in
6363     ** sorted order.  This invariants arise because multiple overflows can
6364     ** only occur when inserting divider cells into the parent page during
6365     ** balancing, and the dividers are adjacent and sorted.
6366     */
6367     assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
6368     assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
6369   }else{
6370     int rc = sqlite3PagerWrite(pPage->pDbPage);
6371     if( rc!=SQLITE_OK ){
6372       *pRC = rc;
6373       return;
6374     }
6375     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6376     data = pPage->aData;
6377     assert( &data[pPage->cellOffset]==pPage->aCellIdx );
6378     rc = allocateSpace(pPage, sz, &idx);
6379     if( rc ){ *pRC = rc; return; }
6380     /* The allocateSpace() routine guarantees the following properties
6381     ** if it returns successfully */
6382     assert( idx >= 0 );
6383     assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
6384     assert( idx+sz <= (int)pPage->pBt->usableSize );
6385     pPage->nFree -= (u16)(2 + sz);
6386     memcpy(&data[idx], pCell, sz);
6387     if( iChild ){
6388       put4byte(&data[idx], iChild);
6389     }
6390     pIns = pPage->aCellIdx + i*2;
6391     memmove(pIns+2, pIns, 2*(pPage->nCell - i));
6392     put2byte(pIns, idx);
6393     pPage->nCell++;
6394     /* increment the cell count */
6395     if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
6396     assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell );
6397 #ifndef SQLITE_OMIT_AUTOVACUUM
6398     if( pPage->pBt->autoVacuum ){
6399       /* The cell may contain a pointer to an overflow page. If so, write
6400       ** the entry for the overflow page into the pointer map.
6401       */
6402       ptrmapPutOvflPtr(pPage, pCell, pRC);
6403     }
6404 #endif
6405   }
6406 }
6407 
6408 /*
6409 ** A CellArray object contains a cache of pointers and sizes for a
6410 ** consecutive sequence of cells that might be held on multiple pages.
6411 */
6412 typedef struct CellArray CellArray;
6413 struct CellArray {
6414   int nCell;              /* Number of cells in apCell[] */
6415   MemPage *pRef;          /* Reference page */
6416   u8 **apCell;            /* All cells begin balanced */
6417   u16 *szCell;            /* Local size of all cells in apCell[] */
6418 };
6419 
6420 /*
6421 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
6422 ** computed.
6423 */
6424 static void populateCellCache(CellArray *p, int idx, int N){
6425   assert( idx>=0 && idx+N<=p->nCell );
6426   while( N>0 ){
6427     assert( p->apCell[idx]!=0 );
6428     if( p->szCell[idx]==0 ){
6429       p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
6430     }else{
6431       assert( CORRUPT_DB ||
6432               p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
6433     }
6434     idx++;
6435     N--;
6436   }
6437 }
6438 
6439 /*
6440 ** Return the size of the Nth element of the cell array
6441 */
6442 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
6443   assert( N>=0 && N<p->nCell );
6444   assert( p->szCell[N]==0 );
6445   p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
6446   return p->szCell[N];
6447 }
6448 static u16 cachedCellSize(CellArray *p, int N){
6449   assert( N>=0 && N<p->nCell );
6450   if( p->szCell[N] ) return p->szCell[N];
6451   return computeCellSize(p, N);
6452 }
6453 
6454 /*
6455 ** Array apCell[] contains pointers to nCell b-tree page cells. The
6456 ** szCell[] array contains the size in bytes of each cell. This function
6457 ** replaces the current contents of page pPg with the contents of the cell
6458 ** array.
6459 **
6460 ** Some of the cells in apCell[] may currently be stored in pPg. This
6461 ** function works around problems caused by this by making a copy of any
6462 ** such cells before overwriting the page data.
6463 **
6464 ** The MemPage.nFree field is invalidated by this function. It is the
6465 ** responsibility of the caller to set it correctly.
6466 */
6467 static int rebuildPage(
6468   MemPage *pPg,                   /* Edit this page */
6469   int nCell,                      /* Final number of cells on page */
6470   u8 **apCell,                    /* Array of cells */
6471   u16 *szCell                     /* Array of cell sizes */
6472 ){
6473   const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
6474   u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
6475   const int usableSize = pPg->pBt->usableSize;
6476   u8 * const pEnd = &aData[usableSize];
6477   int i;
6478   u8 *pCellptr = pPg->aCellIdx;
6479   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6480   u8 *pData;
6481 
6482   i = get2byte(&aData[hdr+5]);
6483   memcpy(&pTmp[i], &aData[i], usableSize - i);
6484 
6485   pData = pEnd;
6486   for(i=0; i<nCell; i++){
6487     u8 *pCell = apCell[i];
6488     if( SQLITE_WITHIN(pCell,aData,pEnd) ){
6489       pCell = &pTmp[pCell - aData];
6490     }
6491     pData -= szCell[i];
6492     put2byte(pCellptr, (pData - aData));
6493     pCellptr += 2;
6494     if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
6495     memcpy(pData, pCell, szCell[i]);
6496     assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
6497     testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) );
6498   }
6499 
6500   /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
6501   pPg->nCell = nCell;
6502   pPg->nOverflow = 0;
6503 
6504   put2byte(&aData[hdr+1], 0);
6505   put2byte(&aData[hdr+3], pPg->nCell);
6506   put2byte(&aData[hdr+5], pData - aData);
6507   aData[hdr+7] = 0x00;
6508   return SQLITE_OK;
6509 }
6510 
6511 /*
6512 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
6513 ** contains the size in bytes of each such cell. This function attempts to
6514 ** add the cells stored in the array to page pPg. If it cannot (because
6515 ** the page needs to be defragmented before the cells will fit), non-zero
6516 ** is returned. Otherwise, if the cells are added successfully, zero is
6517 ** returned.
6518 **
6519 ** Argument pCellptr points to the first entry in the cell-pointer array
6520 ** (part of page pPg) to populate. After cell apCell[0] is written to the
6521 ** page body, a 16-bit offset is written to pCellptr. And so on, for each
6522 ** cell in the array. It is the responsibility of the caller to ensure
6523 ** that it is safe to overwrite this part of the cell-pointer array.
6524 **
6525 ** When this function is called, *ppData points to the start of the
6526 ** content area on page pPg. If the size of the content area is extended,
6527 ** *ppData is updated to point to the new start of the content area
6528 ** before returning.
6529 **
6530 ** Finally, argument pBegin points to the byte immediately following the
6531 ** end of the space required by this page for the cell-pointer area (for
6532 ** all cells - not just those inserted by the current call). If the content
6533 ** area must be extended to before this point in order to accomodate all
6534 ** cells in apCell[], then the cells do not fit and non-zero is returned.
6535 */
6536 static int pageInsertArray(
6537   MemPage *pPg,                   /* Page to add cells to */
6538   u8 *pBegin,                     /* End of cell-pointer array */
6539   u8 **ppData,                    /* IN/OUT: Page content -area pointer */
6540   u8 *pCellptr,                   /* Pointer to cell-pointer area */
6541   int iFirst,                     /* Index of first cell to add */
6542   int nCell,                      /* Number of cells to add to pPg */
6543   CellArray *pCArray              /* Array of cells */
6544 ){
6545   int i;
6546   u8 *aData = pPg->aData;
6547   u8 *pData = *ppData;
6548   int iEnd = iFirst + nCell;
6549   assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
6550   for(i=iFirst; i<iEnd; i++){
6551     int sz, rc;
6552     u8 *pSlot;
6553     sz = cachedCellSize(pCArray, i);
6554     if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
6555       if( (pData - pBegin)<sz ) return 1;
6556       pData -= sz;
6557       pSlot = pData;
6558     }
6559     /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
6560     ** database.  But they might for a corrupt database.  Hence use memmove()
6561     ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
6562     assert( (pSlot+sz)<=pCArray->apCell[i]
6563          || pSlot>=(pCArray->apCell[i]+sz)
6564          || CORRUPT_DB );
6565     memmove(pSlot, pCArray->apCell[i], sz);
6566     put2byte(pCellptr, (pSlot - aData));
6567     pCellptr += 2;
6568   }
6569   *ppData = pData;
6570   return 0;
6571 }
6572 
6573 /*
6574 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
6575 ** contains the size in bytes of each such cell. This function adds the
6576 ** space associated with each cell in the array that is currently stored
6577 ** within the body of pPg to the pPg free-list. The cell-pointers and other
6578 ** fields of the page are not updated.
6579 **
6580 ** This function returns the total number of cells added to the free-list.
6581 */
6582 static int pageFreeArray(
6583   MemPage *pPg,                   /* Page to edit */
6584   int iFirst,                     /* First cell to delete */
6585   int nCell,                      /* Cells to delete */
6586   CellArray *pCArray              /* Array of cells */
6587 ){
6588   u8 * const aData = pPg->aData;
6589   u8 * const pEnd = &aData[pPg->pBt->usableSize];
6590   u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
6591   int nRet = 0;
6592   int i;
6593   int iEnd = iFirst + nCell;
6594   u8 *pFree = 0;
6595   int szFree = 0;
6596 
6597   for(i=iFirst; i<iEnd; i++){
6598     u8 *pCell = pCArray->apCell[i];
6599     if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
6600       int sz;
6601       /* No need to use cachedCellSize() here.  The sizes of all cells that
6602       ** are to be freed have already been computing while deciding which
6603       ** cells need freeing */
6604       sz = pCArray->szCell[i];  assert( sz>0 );
6605       if( pFree!=(pCell + sz) ){
6606         if( pFree ){
6607           assert( pFree>aData && (pFree - aData)<65536 );
6608           freeSpace(pPg, (u16)(pFree - aData), szFree);
6609         }
6610         pFree = pCell;
6611         szFree = sz;
6612         if( pFree+sz>pEnd ) return 0;
6613       }else{
6614         pFree = pCell;
6615         szFree += sz;
6616       }
6617       nRet++;
6618     }
6619   }
6620   if( pFree ){
6621     assert( pFree>aData && (pFree - aData)<65536 );
6622     freeSpace(pPg, (u16)(pFree - aData), szFree);
6623   }
6624   return nRet;
6625 }
6626 
6627 /*
6628 ** apCell[] and szCell[] contains pointers to and sizes of all cells in the
6629 ** pages being balanced.  The current page, pPg, has pPg->nCell cells starting
6630 ** with apCell[iOld].  After balancing, this page should hold nNew cells
6631 ** starting at apCell[iNew].
6632 **
6633 ** This routine makes the necessary adjustments to pPg so that it contains
6634 ** the correct cells after being balanced.
6635 **
6636 ** The pPg->nFree field is invalid when this function returns. It is the
6637 ** responsibility of the caller to set it correctly.
6638 */
6639 static int editPage(
6640   MemPage *pPg,                   /* Edit this page */
6641   int iOld,                       /* Index of first cell currently on page */
6642   int iNew,                       /* Index of new first cell on page */
6643   int nNew,                       /* Final number of cells on page */
6644   CellArray *pCArray              /* Array of cells and sizes */
6645 ){
6646   u8 * const aData = pPg->aData;
6647   const int hdr = pPg->hdrOffset;
6648   u8 *pBegin = &pPg->aCellIdx[nNew * 2];
6649   int nCell = pPg->nCell;       /* Cells stored on pPg */
6650   u8 *pData;
6651   u8 *pCellptr;
6652   int i;
6653   int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
6654   int iNewEnd = iNew + nNew;
6655 
6656 #ifdef SQLITE_DEBUG
6657   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6658   memcpy(pTmp, aData, pPg->pBt->usableSize);
6659 #endif
6660 
6661   /* Remove cells from the start and end of the page */
6662   if( iOld<iNew ){
6663     int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
6664     memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
6665     nCell -= nShift;
6666   }
6667   if( iNewEnd < iOldEnd ){
6668     nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
6669   }
6670 
6671   pData = &aData[get2byteNotZero(&aData[hdr+5])];
6672   if( pData<pBegin ) goto editpage_fail;
6673 
6674   /* Add cells to the start of the page */
6675   if( iNew<iOld ){
6676     int nAdd = MIN(nNew,iOld-iNew);
6677     assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
6678     pCellptr = pPg->aCellIdx;
6679     memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
6680     if( pageInsertArray(
6681           pPg, pBegin, &pData, pCellptr,
6682           iNew, nAdd, pCArray
6683     ) ) goto editpage_fail;
6684     nCell += nAdd;
6685   }
6686 
6687   /* Add any overflow cells */
6688   for(i=0; i<pPg->nOverflow; i++){
6689     int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
6690     if( iCell>=0 && iCell<nNew ){
6691       pCellptr = &pPg->aCellIdx[iCell * 2];
6692       memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
6693       nCell++;
6694       if( pageInsertArray(
6695             pPg, pBegin, &pData, pCellptr,
6696             iCell+iNew, 1, pCArray
6697       ) ) goto editpage_fail;
6698     }
6699   }
6700 
6701   /* Append cells to the end of the page */
6702   pCellptr = &pPg->aCellIdx[nCell*2];
6703   if( pageInsertArray(
6704         pPg, pBegin, &pData, pCellptr,
6705         iNew+nCell, nNew-nCell, pCArray
6706   ) ) goto editpage_fail;
6707 
6708   pPg->nCell = nNew;
6709   pPg->nOverflow = 0;
6710 
6711   put2byte(&aData[hdr+3], pPg->nCell);
6712   put2byte(&aData[hdr+5], pData - aData);
6713 
6714 #ifdef SQLITE_DEBUG
6715   for(i=0; i<nNew && !CORRUPT_DB; i++){
6716     u8 *pCell = pCArray->apCell[i+iNew];
6717     int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
6718     if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
6719       pCell = &pTmp[pCell - aData];
6720     }
6721     assert( 0==memcmp(pCell, &aData[iOff],
6722             pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
6723   }
6724 #endif
6725 
6726   return SQLITE_OK;
6727  editpage_fail:
6728   /* Unable to edit this page. Rebuild it from scratch instead. */
6729   populateCellCache(pCArray, iNew, nNew);
6730   return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]);
6731 }
6732 
/*
** These two parameters control how many adjacent pages take part in a
** single balancing operation.  NN is the number of neighbors taken from
** each side of the page being balanced.  NB is the total number of
** participating pages: the target page itself plus NN neighbors on
** either side.
**
** NN must be at least 1.  Raising NN to 2 or 3 gives a modest improvement
** in SELECT and DELETE performance in exchange for a larger degradation
** in INSERT and UPDATE performance.  The current setting, NN==1, appears
** to give the best results overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (2*NN+1)      /* Total pages involved in the balance */
6747 
6748 
6749 #ifndef SQLITE_OMIT_QUICKBALANCE
6750 /*
6751 ** This version of balance() handles the common special case where
6752 ** a new entry is being inserted on the extreme right-end of the
6753 ** tree, in other words, when the new entry will become the largest
6754 ** entry in the tree.
6755 **
6756 ** Instead of trying to balance the 3 right-most leaf pages, just add
6757 ** a new page to the right-hand side and put the one new entry in
6758 ** that page.  This leaves the right side of the tree somewhat
6759 ** unbalanced.  But odds are that we will be inserting new entries
6760 ** at the end soon afterwards so the nearly empty page will quickly
6761 ** fill up.  On average.
6762 **
6763 ** pPage is the leaf page which is the right-most page in the tree.
6764 ** pParent is its parent.  pPage must have a single overflow entry
6765 ** which is also the right-most entry on the page.
6766 **
6767 ** The pSpace buffer is used to store a temporary copy of the divider
6768 ** cell that will be inserted into pParent. Such a cell consists of a 4
6769 ** byte page number followed by a variable length integer. In other
6770 ** words, at most 13 bytes. Hence the pSpace buffer must be at
6771 ** least 13 bytes in size.
6772 */
6773 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
6774   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
6775   MemPage *pNew;                       /* Newly allocated page */
6776   int rc;                              /* Return Code */
6777   Pgno pgnoNew;                        /* Page number of pNew */
6778 
6779   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6780   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6781   assert( pPage->nOverflow==1 );
6782 
6783   /* This error condition is now caught prior to reaching this function */
6784   if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;
6785 
6786   /* Allocate a new page. This page will become the right-sibling of
6787   ** pPage. Make the parent page writable, so that the new divider cell
6788   ** may be inserted. If both these operations are successful, proceed.
6789   */
6790   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
6791 
6792   if( rc==SQLITE_OK ){
6793 
6794     u8 *pOut = &pSpace[4];
6795     u8 *pCell = pPage->apOvfl[0];
6796     u16 szCell = pPage->xCellSize(pPage, pCell);
6797     u8 *pStop;
6798 
6799     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
6800     assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
6801     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
6802     rc = rebuildPage(pNew, 1, &pCell, &szCell);
6803     if( NEVER(rc) ) return rc;
6804     pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
6805 
6806     /* If this is an auto-vacuum database, update the pointer map
6807     ** with entries for the new page, and any pointer from the
6808     ** cell on the page to an overflow page. If either of these
6809     ** operations fails, the return code is set, but the contents
6810     ** of the parent page are still manipulated by thh code below.
6811     ** That is Ok, at this point the parent page is guaranteed to
6812     ** be marked as dirty. Returning an error code will cause a
6813     ** rollback, undoing any changes made to the parent page.
6814     */
6815     if( ISAUTOVACUUM ){
6816       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
6817       if( szCell>pNew->minLocal ){
6818         ptrmapPutOvflPtr(pNew, pCell, &rc);
6819       }
6820     }
6821 
6822     /* Create a divider cell to insert into pParent. The divider cell
6823     ** consists of a 4-byte page number (the page number of pPage) and
6824     ** a variable length key value (which must be the same value as the
6825     ** largest key on pPage).
6826     **
6827     ** To find the largest key value on pPage, first find the right-most
6828     ** cell on pPage. The first two fields of this cell are the
6829     ** record-length (a variable length integer at most 32-bits in size)
6830     ** and the key value (a variable length integer, may have any value).
6831     ** The first of the while(...) loops below skips over the record-length
6832     ** field. The second while(...) loop copies the key value from the
6833     ** cell on pPage into the pSpace buffer.
6834     */
6835     pCell = findCell(pPage, pPage->nCell-1);
6836     pStop = &pCell[9];
6837     while( (*(pCell++)&0x80) && pCell<pStop );
6838     pStop = &pCell[9];
6839     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
6840 
6841     /* Insert the new divider cell into pParent. */
6842     if( rc==SQLITE_OK ){
6843       insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
6844                    0, pPage->pgno, &rc);
6845     }
6846 
6847     /* Set the right-child pointer of pParent to point to the new page. */
6848     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
6849 
6850     /* Release the reference to the new page. */
6851     releasePage(pNew);
6852   }
6853 
6854   return rc;
6855 }
6856 #endif /* SQLITE_OMIT_QUICKBALANCE */
6857 
6858 #if 0
6859 /*
6860 ** This function does not contribute anything to the operation of SQLite.
6861 ** it is sometimes activated temporarily while debugging code responsible
6862 ** for setting pointer-map entries.
6863 */
6864 static int ptrmapCheckPages(MemPage **apPage, int nPage){
6865   int i, j;
6866   for(i=0; i<nPage; i++){
6867     Pgno n;
6868     u8 e;
6869     MemPage *pPage = apPage[i];
6870     BtShared *pBt = pPage->pBt;
6871     assert( pPage->isInit );
6872 
6873     for(j=0; j<pPage->nCell; j++){
6874       CellInfo info;
6875       u8 *z;
6876 
6877       z = findCell(pPage, j);
6878       pPage->xParseCell(pPage, z, &info);
6879       if( info.nLocal<info.nPayload ){
6880         Pgno ovfl = get4byte(&z[info.nSize-4]);
6881         ptrmapGet(pBt, ovfl, &e, &n);
6882         assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
6883       }
6884       if( !pPage->leaf ){
6885         Pgno child = get4byte(z);
6886         ptrmapGet(pBt, child, &e, &n);
6887         assert( n==pPage->pgno && e==PTRMAP_BTREE );
6888       }
6889     }
6890     if( !pPage->leaf ){
6891       Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
6892       ptrmapGet(pBt, child, &e, &n);
6893       assert( n==pPage->pgno && e==PTRMAP_BTREE );
6894     }
6895   }
6896   return 1;
6897 }
6898 #endif
6899 
6900 /*
6901 ** This function is used to copy the contents of the b-tree node stored
6902 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
6903 ** the pointer-map entries for each child page are updated so that the
6904 ** parent page stored in the pointer map is page pTo. If pFrom contained
6905 ** any cells with overflow page pointers, then the corresponding pointer
6906 ** map entries are also updated so that the parent page is page pTo.
6907 **
6908 ** If pFrom is currently carrying any overflow cells (entries in the
6909 ** MemPage.apOvfl[] array), they are not copied to pTo.
6910 **
6911 ** Before returning, page pTo is reinitialized using btreeInitPage().
6912 **
6913 ** The performance of this function is not critical. It is only used by
6914 ** the balance_shallower() and balance_deeper() procedures, neither of
6915 ** which are called often under normal circumstances.
6916 */
6917 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
6918   if( (*pRC)==SQLITE_OK ){
6919     BtShared * const pBt = pFrom->pBt;
6920     u8 * const aFrom = pFrom->aData;
6921     u8 * const aTo = pTo->aData;
6922     int const iFromHdr = pFrom->hdrOffset;
6923     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
6924     int rc;
6925     int iData;
6926 
6927 
6928     assert( pFrom->isInit );
6929     assert( pFrom->nFree>=iToHdr );
6930     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
6931 
6932     /* Copy the b-tree node content from page pFrom to page pTo. */
6933     iData = get2byte(&aFrom[iFromHdr+5]);
6934     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
6935     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
6936 
6937     /* Reinitialize page pTo so that the contents of the MemPage structure
6938     ** match the new data. The initialization of pTo can actually fail under
6939     ** fairly obscure circumstances, even though it is a copy of initialized
6940     ** page pFrom.
6941     */
6942     pTo->isInit = 0;
6943     rc = btreeInitPage(pTo);
6944     if( rc!=SQLITE_OK ){
6945       *pRC = rc;
6946       return;
6947     }
6948 
6949     /* If this is an auto-vacuum database, update the pointer-map entries
6950     ** for any b-tree or overflow pages that pTo now contains the pointers to.
6951     */
6952     if( ISAUTOVACUUM ){
6953       *pRC = setChildPtrmaps(pTo);
6954     }
6955   }
6956 }
6957 
6958 /*
** This routine redistributes cells on the iParentIdx'th child of pParent
** (hereafter "the page") and up to 2 siblings so that all pages have about the
** same amount of free space. Usually a single sibling on either side of the
** page is used in the balancing, though both siblings might come from one
** side if the page is the first or last child of its parent. If the page
** has fewer than 2 siblings (something which can only happen if the page
** is a root page or a child of a root page) then all available siblings
** participate in the balancing.
6967 **
6968 ** The number of siblings of the page might be increased or decreased by
6969 ** one or two in an effort to keep pages nearly full but not over full.
6970 **
6971 ** Note that when this routine is called, some of the cells on the page
6972 ** might not actually be stored in MemPage.aData[]. This can happen
6973 ** if the page is overfull. This routine ensures that all cells allocated
6974 ** to the page and its siblings fit into MemPage.aData[] before returning.
6975 **
6976 ** In the course of balancing the page and its siblings, cells may be
6977 ** inserted into or removed from the parent page (pParent). Doing so
6978 ** may cause the parent page to become overfull or underfull. If this
6979 ** happens, it is the responsibility of the caller to invoke the correct
6980 ** balancing routine to fix this problem (see the balance() routine).
6981 **
6982 ** If this routine fails for any reason, it might leave the database
6983 ** in a corrupted state. So if this routine fails, the database should
6984 ** be rolled back.
6985 **
6986 ** The third argument to this function, aOvflSpace, is a pointer to a
6987 ** buffer big enough to hold one page. If while inserting cells into the parent
6988 ** page (pParent) the parent page becomes overfull, this buffer is
6989 ** used to store the parent's overflow cells. Because this function inserts
6990 ** a maximum of four divider cells into the parent page, and the maximum
6991 ** size of a cell stored within an internal node is always less than 1/4
6992 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
6993 ** enough for all overflow cells.
6994 **
6995 ** If aOvflSpace is set to a null pointer, this function returns
6996 ** SQLITE_NOMEM.
6997 */
6998 static int balance_nonroot(
6999   MemPage *pParent,               /* Parent page of siblings being balanced */
7000   int iParentIdx,                 /* Index of "the page" in pParent */
7001   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
7002   int isRoot,                     /* True if pParent is a root-page */
7003   int bBulk                       /* True if this call is part of a bulk load */
7004 ){
7005   BtShared *pBt;               /* The whole database */
7006   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
7007   int nNew = 0;                /* Number of pages in apNew[] */
7008   int nOld;                    /* Number of pages in apOld[] */
7009   int i, j, k;                 /* Loop counters */
7010   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
7011   int rc = SQLITE_OK;          /* The return code */
7012   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
7013   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
7014   int usableSpace;             /* Bytes in pPage beyond the header */
7015   int pageFlags;               /* Value of pPage->aData[0] */
7016   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
7017   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
7018   int szScratch;               /* Size of scratch memory requested */
7019   MemPage *apOld[NB];          /* pPage and up to two siblings */
7020   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
7021   u8 *pRight;                  /* Location in parent of right-sibling pointer */
7022   u8 *apDiv[NB-1];             /* Divider cells in pParent */
7023   int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
7024   int cntOld[NB+2];            /* Old index in b.apCell[] */
7025   int szNew[NB+2];             /* Combined size of cells placed on i-th page */
7026   u8 *aSpace1;                 /* Space for copies of dividers cells */
7027   Pgno pgno;                   /* Temp var to store a page number in */
7028   u8 abDone[NB+2];             /* True after i'th new page is populated */
7029   Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
7030   Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
7031   u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
7032   CellArray b;                  /* Parsed information on cells being balanced */
7033 
7034   memset(abDone, 0, sizeof(abDone));
7035   b.nCell = 0;
7036   b.apCell = 0;
7037   pBt = pParent->pBt;
7038   assert( sqlite3_mutex_held(pBt->mutex) );
7039   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7040 
7041 #if 0
7042   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
7043 #endif
7044 
7045   /* At this point pParent may have at most one overflow cell. And if
7046   ** this overflow cell is present, it must be the cell with
7047   ** index iParentIdx. This scenario comes about when this function
7048   ** is called (indirectly) from sqlite3BtreeDelete().
7049   */
7050   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
7051   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
7052 
7053   if( !aOvflSpace ){
7054     return SQLITE_NOMEM_BKPT;
7055   }
7056 
7057   /* Find the sibling pages to balance. Also locate the cells in pParent
7058   ** that divide the siblings. An attempt is made to find NN siblings on
7059   ** either side of pPage. More siblings are taken from one side, however,
7060   ** if there are fewer than NN siblings on the other side. If pParent
7061   ** has NB or fewer children then all children of pParent are taken.
7062   **
7063   ** This loop also drops the divider cells from the parent page. This
7064   ** way, the remainder of the function does not have to deal with any
7065   ** overflow cells in the parent page, since if any existed they will
7066   ** have already been removed.
7067   */
7068   i = pParent->nOverflow + pParent->nCell;
7069   if( i<2 ){
7070     nxDiv = 0;
7071   }else{
7072     assert( bBulk==0 || bBulk==1 );
7073     if( iParentIdx==0 ){
7074       nxDiv = 0;
7075     }else if( iParentIdx==i ){
7076       nxDiv = i-2+bBulk;
7077     }else{
7078       nxDiv = iParentIdx-1;
7079     }
7080     i = 2-bBulk;
7081   }
7082   nOld = i+1;
7083   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
7084     pRight = &pParent->aData[pParent->hdrOffset+8];
7085   }else{
7086     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
7087   }
7088   pgno = get4byte(pRight);
7089   while( 1 ){
7090     rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
7091     if( rc ){
7092       memset(apOld, 0, (i+1)*sizeof(MemPage*));
7093       goto balance_cleanup;
7094     }
7095     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
7096     if( (i--)==0 ) break;
7097 
7098     if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
7099       apDiv[i] = pParent->apOvfl[0];
7100       pgno = get4byte(apDiv[i]);
7101       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7102       pParent->nOverflow = 0;
7103     }else{
7104       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
7105       pgno = get4byte(apDiv[i]);
7106       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7107 
7108       /* Drop the cell from the parent page. apDiv[i] still points to
7109       ** the cell within the parent, even though it has been dropped.
7110       ** This is safe because dropping a cell only overwrites the first
7111       ** four bytes of it, and this function does not need the first
7112       ** four bytes of the divider cell. So the pointer is safe to use
7113       ** later on.
7114       **
7115       ** But not if we are in secure-delete mode. In secure-delete mode,
7116       ** the dropCell() routine will overwrite the entire cell with zeroes.
7117       ** In this case, temporarily copy the cell into the aOvflSpace[]
7118       ** buffer. It will be copied out again as soon as the aSpace[] buffer
7119       ** is allocated.  */
7120       if( pBt->btsFlags & BTS_SECURE_DELETE ){
7121         int iOff;
7122 
7123         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
7124         if( (iOff+szNew[i])>(int)pBt->usableSize ){
7125           rc = SQLITE_CORRUPT_BKPT;
7126           memset(apOld, 0, (i+1)*sizeof(MemPage*));
7127           goto balance_cleanup;
7128         }else{
7129           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
7130           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
7131         }
7132       }
7133       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
7134     }
7135   }
7136 
7137   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
7138   ** alignment */
7139   nMaxCells = (nMaxCells + 3)&~3;
7140 
7141   /*
7142   ** Allocate space for memory structures
7143   */
7144   szScratch =
7145        nMaxCells*sizeof(u8*)                       /* b.apCell */
7146      + nMaxCells*sizeof(u16)                       /* b.szCell */
7147      + pBt->pageSize;                              /* aSpace1 */
7148 
7149   /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer
7150   ** that is more than 6 times the database page size. */
7151   assert( szScratch<=6*(int)pBt->pageSize );
7152   b.apCell = sqlite3ScratchMalloc( szScratch );
7153   if( b.apCell==0 ){
7154     rc = SQLITE_NOMEM_BKPT;
7155     goto balance_cleanup;
7156   }
7157   b.szCell = (u16*)&b.apCell[nMaxCells];
7158   aSpace1 = (u8*)&b.szCell[nMaxCells];
7159   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
7160 
7161   /*
7162   ** Load pointers to all cells on sibling pages and the divider cells
7163   ** into the local b.apCell[] array.  Make copies of the divider cells
7164   ** into space obtained from aSpace1[]. The divider cells have already
7165   ** been removed from pParent.
7166   **
7167   ** If the siblings are on leaf pages, then the child pointers of the
7168   ** divider cells are stripped from the cells before they are copied
7169   ** into aSpace1[].  In this way, all cells in b.apCell[] are without
7170   ** child pointers.  If siblings are not leaves, then all cell in
7171   ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
7172   ** are alike.
7173   **
7174   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
7175   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
7176   */
7177   b.pRef = apOld[0];
7178   leafCorrection = b.pRef->leaf*4;
7179   leafData = b.pRef->intKeyLeaf;
7180   for(i=0; i<nOld; i++){
7181     MemPage *pOld = apOld[i];
7182     int limit = pOld->nCell;
7183     u8 *aData = pOld->aData;
7184     u16 maskPage = pOld->maskPage;
7185     u8 *piCell = aData + pOld->cellOffset;
7186     u8 *piEnd;
7187 
7188     /* Verify that all sibling pages are of the same "type" (table-leaf,
7189     ** table-interior, index-leaf, or index-interior).
7190     */
7191     if( pOld->aData[0]!=apOld[0]->aData[0] ){
7192       rc = SQLITE_CORRUPT_BKPT;
7193       goto balance_cleanup;
7194     }
7195 
7196     /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
7197     ** contains overflow cells, include them in the b.apCell[] array
7198     ** in the correct spot.
7199     **
7200     ** Note that when there are multiple overflow cells, it is always the
7201     ** case that they are sequential and adjacent.  This invariant arises
7202     ** because multiple overflows can only occurs when inserting divider
7203     ** cells into a parent on a prior balance, and divider cells are always
7204     ** adjacent and are inserted in order.  There is an assert() tagged
7205     ** with "NOTE 1" in the overflow cell insertion loop to prove this
7206     ** invariant.
7207     **
7208     ** This must be done in advance.  Once the balance starts, the cell
7209     ** offset section of the btree page will be overwritten and we will no
7210     ** long be able to find the cells if a pointer to each cell is not saved
7211     ** first.
7212     */
7213     memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
7214     if( pOld->nOverflow>0 ){
7215       limit = pOld->aiOvfl[0];
7216       for(j=0; j<limit; j++){
7217         b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7218         piCell += 2;
7219         b.nCell++;
7220       }
7221       for(k=0; k<pOld->nOverflow; k++){
7222         assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
7223         b.apCell[b.nCell] = pOld->apOvfl[k];
7224         b.nCell++;
7225       }
7226     }
7227     piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
7228     while( piCell<piEnd ){
7229       assert( b.nCell<nMaxCells );
7230       b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7231       piCell += 2;
7232       b.nCell++;
7233     }
7234 
7235     cntOld[i] = b.nCell;
7236     if( i<nOld-1 && !leafData){
7237       u16 sz = (u16)szNew[i];
7238       u8 *pTemp;
7239       assert( b.nCell<nMaxCells );
7240       b.szCell[b.nCell] = sz;
7241       pTemp = &aSpace1[iSpace1];
7242       iSpace1 += sz;
7243       assert( sz<=pBt->maxLocal+23 );
7244       assert( iSpace1 <= (int)pBt->pageSize );
7245       memcpy(pTemp, apDiv[i], sz);
7246       b.apCell[b.nCell] = pTemp+leafCorrection;
7247       assert( leafCorrection==0 || leafCorrection==4 );
7248       b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
7249       if( !pOld->leaf ){
7250         assert( leafCorrection==0 );
7251         assert( pOld->hdrOffset==0 );
7252         /* The right pointer of the child page pOld becomes the left
7253         ** pointer of the divider cell */
7254         memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
7255       }else{
7256         assert( leafCorrection==4 );
7257         while( b.szCell[b.nCell]<4 ){
7258           /* Do not allow any cells smaller than 4 bytes. If a smaller cell
7259           ** does exist, pad it with 0x00 bytes. */
7260           assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
7261           assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
7262           aSpace1[iSpace1++] = 0x00;
7263           b.szCell[b.nCell]++;
7264         }
7265       }
7266       b.nCell++;
7267     }
7268   }
7269 
7270   /*
7271   ** Figure out the number of pages needed to hold all b.nCell cells.
7272   ** Store this number in "k".  Also compute szNew[] which is the total
7273   ** size of all cells on the i-th page and cntNew[] which is the index
7274   ** in b.apCell[] of the cell that divides page i from page i+1.
7275   ** cntNew[k] should equal b.nCell.
7276   **
7277   ** Values computed by this block:
7278   **
7279   **           k: The total number of sibling pages
7280   **    szNew[i]: Space used on the i-th sibling page.
7281   **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
7282   **              the right of the i-th sibling page.
7283   ** usableSpace: Number of bytes of space available on each sibling.
7284   **
7285   */
7286   usableSpace = pBt->usableSize - 12 + leafCorrection;
7287   for(i=0; i<nOld; i++){
7288     MemPage *p = apOld[i];
7289     szNew[i] = usableSpace - p->nFree;
7290     for(j=0; j<p->nOverflow; j++){
7291       szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
7292     }
7293     cntNew[i] = cntOld[i];
7294   }
7295   k = nOld;
7296   for(i=0; i<k; i++){
7297     int sz;
7298     while( szNew[i]>usableSpace ){
7299       if( i+1>=k ){
7300         k = i+2;
7301         if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
7302         szNew[k-1] = 0;
7303         cntNew[k-1] = b.nCell;
7304       }
7305       sz = 2 + cachedCellSize(&b, cntNew[i]-1);
7306       szNew[i] -= sz;
7307       if( !leafData ){
7308         if( cntNew[i]<b.nCell ){
7309           sz = 2 + cachedCellSize(&b, cntNew[i]);
7310         }else{
7311           sz = 0;
7312         }
7313       }
7314       szNew[i+1] += sz;
7315       cntNew[i]--;
7316     }
7317     while( cntNew[i]<b.nCell ){
7318       sz = 2 + cachedCellSize(&b, cntNew[i]);
7319       if( szNew[i]+sz>usableSpace ) break;
7320       szNew[i] += sz;
7321       cntNew[i]++;
7322       if( !leafData ){
7323         if( cntNew[i]<b.nCell ){
7324           sz = 2 + cachedCellSize(&b, cntNew[i]);
7325         }else{
7326           sz = 0;
7327         }
7328       }
7329       szNew[i+1] -= sz;
7330     }
7331     if( cntNew[i]>=b.nCell ){
7332       k = i+1;
7333     }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
7334       rc = SQLITE_CORRUPT_BKPT;
7335       goto balance_cleanup;
7336     }
7337   }
7338 
7339   /*
7340   ** The packing computed by the previous block is biased toward the siblings
7341   ** on the left side (siblings with smaller keys). The left siblings are
7342   ** always nearly full, while the right-most sibling might be nearly empty.
7343   ** The next block of code attempts to adjust the packing of siblings to
7344   ** get a better balance.
7345   **
7346   ** This adjustment is more than an optimization.  The packing above might
7347   ** be so out of balance as to be illegal.  For example, the right-most
7348   ** sibling might be completely empty.  This adjustment is not optional.
7349   */
7350   for(i=k-1; i>0; i--){
7351     int szRight = szNew[i];  /* Size of sibling on the right */
7352     int szLeft = szNew[i-1]; /* Size of sibling on the left */
7353     int r;              /* Index of right-most cell in left sibling */
7354     int d;              /* Index of first cell to the left of right sibling */
7355 
7356     r = cntNew[i-1] - 1;
7357     d = r + 1 - leafData;
7358     (void)cachedCellSize(&b, d);
7359     do{
7360       assert( d<nMaxCells );
7361       assert( r<nMaxCells );
7362       (void)cachedCellSize(&b, r);
7363       if( szRight!=0
7364        && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
7365         break;
7366       }
7367       szRight += b.szCell[d] + 2;
7368       szLeft -= b.szCell[r] + 2;
7369       cntNew[i-1] = r;
7370       r--;
7371       d--;
7372     }while( r>=0 );
7373     szNew[i] = szRight;
7374     szNew[i-1] = szLeft;
7375     if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
7376       rc = SQLITE_CORRUPT_BKPT;
7377       goto balance_cleanup;
7378     }
7379   }
7380 
7381   /* Sanity check:  For a non-corrupt database file one of the following
7382   ** must be true:
7383   **    (1) We found one or more cells (cntNew[0]>0), or
7384   **    (2) pPage is a virtual root page.  A virtual root page is when
7385   **        the real root page is page 1 and we are the only child of
7386   **        that page.
7387   */
7388   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
7389   TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
7390     apOld[0]->pgno, apOld[0]->nCell,
7391     nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
7392     nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
7393   ));
7394 
7395   /*
7396   ** Allocate k new pages.  Reuse old pages where possible.
7397   */
7398   pageFlags = apOld[0]->aData[0];
7399   for(i=0; i<k; i++){
7400     MemPage *pNew;
7401     if( i<nOld ){
7402       pNew = apNew[i] = apOld[i];
7403       apOld[i] = 0;
7404       rc = sqlite3PagerWrite(pNew->pDbPage);
7405       nNew++;
7406       if( rc ) goto balance_cleanup;
7407     }else{
7408       assert( i>0 );
7409       rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
7410       if( rc ) goto balance_cleanup;
7411       zeroPage(pNew, pageFlags);
7412       apNew[i] = pNew;
7413       nNew++;
7414       cntOld[i] = b.nCell;
7415 
7416       /* Set the pointer-map entry for the new sibling page. */
7417       if( ISAUTOVACUUM ){
7418         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
7419         if( rc!=SQLITE_OK ){
7420           goto balance_cleanup;
7421         }
7422       }
7423     }
7424   }
7425 
7426   /*
7427   ** Reassign page numbers so that the new pages are in ascending order.
7428   ** This helps to keep entries in the disk file in order so that a scan
7429   ** of the table is closer to a linear scan through the file. That in turn
7430   ** helps the operating system to deliver pages from the disk more rapidly.
7431   **
7432   ** An O(n^2) insertion sort algorithm is used, but since n is never more
7433   ** than (NB+2) (a small constant), that should not be a problem.
7434   **
7435   ** When NB==3, this one optimization makes the database about 25% faster
7436   ** for large insertions and deletions.
7437   */
7438   for(i=0; i<nNew; i++){
7439     aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
7440     aPgFlags[i] = apNew[i]->pDbPage->flags;
7441     for(j=0; j<i; j++){
7442       if( aPgno[j]==aPgno[i] ){
7443         /* This branch is taken if the set of sibling pages somehow contains
7444         ** duplicate entries. This can happen if the database is corrupt.
7445         ** It would be simpler to detect this as part of the loop below, but
7446         ** we do the detection here in order to avoid populating the pager
7447         ** cache with two separate objects associated with the same
7448         ** page number.  */
7449         assert( CORRUPT_DB );
7450         rc = SQLITE_CORRUPT_BKPT;
7451         goto balance_cleanup;
7452       }
7453     }
7454   }
7455   for(i=0; i<nNew; i++){
7456     int iBest = 0;                /* aPgno[] index of page number to use */
7457     for(j=1; j<nNew; j++){
7458       if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
7459     }
7460     pgno = aPgOrder[iBest];
7461     aPgOrder[iBest] = 0xffffffff;
7462     if( iBest!=i ){
7463       if( iBest>i ){
7464         sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
7465       }
7466       sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
7467       apNew[i]->pgno = pgno;
7468     }
7469   }
7470 
7471   TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
7472          "%d(%d nc=%d) %d(%d nc=%d)\n",
7473     apNew[0]->pgno, szNew[0], cntNew[0],
7474     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
7475     nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
7476     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
7477     nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
7478     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
7479     nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
7480     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
7481     nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
7482   ));
7483 
7484   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7485   put4byte(pRight, apNew[nNew-1]->pgno);
7486 
7487   /* If the sibling pages are not leaves, ensure that the right-child pointer
7488   ** of the right-most new sibling page is set to the value that was
7489   ** originally in the same field of the right-most old sibling page. */
7490   if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
7491     MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
7492     memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
7493   }
7494 
7495   /* Make any required updates to pointer map entries associated with
7496   ** cells stored on sibling pages following the balance operation. Pointer
7497   ** map entries associated with divider cells are set by the insertCell()
7498   ** routine. The associated pointer map entries are:
7499   **
7500   **   a) if the cell contains a reference to an overflow chain, the
7501   **      entry associated with the first page in the overflow chain, and
7502   **
7503   **   b) if the sibling pages are not leaves, the child page associated
7504   **      with the cell.
7505   **
7506   ** If the sibling pages are not leaves, then the pointer map entry
7507   ** associated with the right-child of each sibling may also need to be
7508   ** updated. This happens below, after the sibling pages have been
7509   ** populated, not here.
7510   */
7511   if( ISAUTOVACUUM ){
7512     MemPage *pNew = apNew[0];
7513     u8 *aOld = pNew->aData;
7514     int cntOldNext = pNew->nCell + pNew->nOverflow;
7515     int usableSize = pBt->usableSize;
7516     int iNew = 0;
7517     int iOld = 0;
7518 
7519     for(i=0; i<b.nCell; i++){
7520       u8 *pCell = b.apCell[i];
7521       if( i==cntOldNext ){
7522         MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
7523         cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
7524         aOld = pOld->aData;
7525       }
7526       if( i==cntNew[iNew] ){
7527         pNew = apNew[++iNew];
7528         if( !leafData ) continue;
7529       }
7530 
7531       /* Cell pCell is destined for new sibling page pNew. Originally, it
7532       ** was either part of sibling page iOld (possibly an overflow cell),
7533       ** or else the divider cell to the left of sibling page iOld. So,
7534       ** if sibling page iOld had the same page number as pNew, and if
7535       ** pCell really was a part of sibling page iOld (not a divider or
7536       ** overflow cell), we can skip updating the pointer map entries.  */
7537       if( iOld>=nNew
7538        || pNew->pgno!=aPgno[iOld]
7539        || !SQLITE_WITHIN(pCell,aOld,&aOld[usableSize])
7540       ){
7541         if( !leafCorrection ){
7542           ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
7543         }
7544         if( cachedCellSize(&b,i)>pNew->minLocal ){
7545           ptrmapPutOvflPtr(pNew, pCell, &rc);
7546         }
7547         if( rc ) goto balance_cleanup;
7548       }
7549     }
7550   }
7551 
7552   /* Insert new divider cells into pParent. */
7553   for(i=0; i<nNew-1; i++){
7554     u8 *pCell;
7555     u8 *pTemp;
7556     int sz;
7557     MemPage *pNew = apNew[i];
7558     j = cntNew[i];
7559 
7560     assert( j<nMaxCells );
7561     assert( b.apCell[j]!=0 );
7562     pCell = b.apCell[j];
7563     sz = b.szCell[j] + leafCorrection;
7564     pTemp = &aOvflSpace[iOvflSpace];
7565     if( !pNew->leaf ){
7566       memcpy(&pNew->aData[8], pCell, 4);
7567     }else if( leafData ){
7568       /* If the tree is a leaf-data tree, and the siblings are leaves,
7569       ** then there is no divider cell in b.apCell[]. Instead, the divider
7570       ** cell consists of the integer key for the right-most cell of
7571       ** the sibling-page assembled above only.
7572       */
7573       CellInfo info;
7574       j--;
7575       pNew->xParseCell(pNew, b.apCell[j], &info);
7576       pCell = pTemp;
7577       sz = 4 + putVarint(&pCell[4], info.nKey);
7578       pTemp = 0;
7579     }else{
7580       pCell -= 4;
7581       /* Obscure case for non-leaf-data trees: If the cell at pCell was
7582       ** previously stored on a leaf node, and its reported size was 4
7583       ** bytes, then it may actually be smaller than this
7584       ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
7585       ** any cell). But it is important to pass the correct size to
7586       ** insertCell(), so reparse the cell now.
7587       **
7588       ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
7589       ** and WITHOUT ROWID tables with exactly one column which is the
7590       ** primary key.
7591       */
7592       if( b.szCell[j]==4 ){
7593         assert(leafCorrection==4);
7594         sz = pParent->xCellSize(pParent, pCell);
7595       }
7596     }
7597     iOvflSpace += sz;
7598     assert( sz<=pBt->maxLocal+23 );
7599     assert( iOvflSpace <= (int)pBt->pageSize );
7600     insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
7601     if( rc!=SQLITE_OK ) goto balance_cleanup;
7602     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7603   }
7604 
7605   /* Now update the actual sibling pages. The order in which they are updated
7606   ** is important, as this code needs to avoid disrupting any page from which
7607   ** cells may still need to be read. In practice, this means:
7608   **
7609   **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
7610   **      then it is not safe to update page apNew[iPg] until after
7611   **      the left-hand sibling apNew[iPg-1] has been updated.
7612   **
7613   **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
7614   **      then it is not safe to update page apNew[iPg] until after
7615   **      the right-hand sibling apNew[iPg+1] has been updated.
7616   **
7617   ** If neither of the above apply, the page is safe to update.
7618   **
7619   ** The iPg value in the following loop starts at nNew-1 goes down
7620   ** to 0, then back up to nNew-1 again, thus making two passes over
7621   ** the pages.  On the initial downward pass, only condition (1) above
7622   ** needs to be tested because (2) will always be true from the previous
7623   ** step.  On the upward pass, both conditions are always true, so the
7624   ** upwards pass simply processes pages that were missed on the downward
7625   ** pass.
7626   */
7627   for(i=1-nNew; i<nNew; i++){
7628     int iPg = i<0 ? -i : i;
7629     assert( iPg>=0 && iPg<nNew );
7630     if( abDone[iPg] ) continue;         /* Skip pages already processed */
7631     if( i>=0                            /* On the upwards pass, or... */
7632      || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
7633     ){
7634       int iNew;
7635       int iOld;
7636       int nNewCell;
7637 
7638       /* Verify condition (1):  If cells are moving left, update iPg
7639       ** only after iPg-1 has already been updated. */
7640       assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
7641 
7642       /* Verify condition (2):  If cells are moving right, update iPg
7643       ** only after iPg+1 has already been updated. */
7644       assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
7645 
7646       if( iPg==0 ){
7647         iNew = iOld = 0;
7648         nNewCell = cntNew[0];
7649       }else{
7650         iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
7651         iNew = cntNew[iPg-1] + !leafData;
7652         nNewCell = cntNew[iPg] - iNew;
7653       }
7654 
7655       rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
7656       if( rc ) goto balance_cleanup;
7657       abDone[iPg]++;
7658       apNew[iPg]->nFree = usableSpace-szNew[iPg];
7659       assert( apNew[iPg]->nOverflow==0 );
7660       assert( apNew[iPg]->nCell==nNewCell );
7661     }
7662   }
7663 
7664   /* All pages have been processed exactly once */
7665   assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
7666 
7667   assert( nOld>0 );
7668   assert( nNew>0 );
7669 
7670   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
7671     /* The root page of the b-tree now contains no cells. The only sibling
7672     ** page is the right-child of the parent. Copy the contents of the
7673     ** child page into the parent, decreasing the overall height of the
7674     ** b-tree structure by one. This is described as the "balance-shallower"
7675     ** sub-algorithm in some documentation.
7676     **
7677     ** If this is an auto-vacuum database, the call to copyNodeContent()
7678     ** sets all pointer-map entries corresponding to database image pages
7679     ** for which the pointer is stored within the content being copied.
7680     **
7681     ** It is critical that the child page be defragmented before being
7682     ** copied into the parent, because if the parent is page 1 then it will
7683     ** be smaller than the child due to the database header, and so all the
7684     ** free space needs to be up front.
7685     */
7686     assert( nNew==1 || CORRUPT_DB );
7687     rc = defragmentPage(apNew[0]);
7688     testcase( rc!=SQLITE_OK );
7689     assert( apNew[0]->nFree ==
7690         (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
7691       || rc!=SQLITE_OK
7692     );
7693     copyNodeContent(apNew[0], pParent, &rc);
7694     freePage(apNew[0], &rc);
7695   }else if( ISAUTOVACUUM && !leafCorrection ){
7696     /* Fix the pointer map entries associated with the right-child of each
7697     ** sibling page. All other pointer map entries have already been taken
7698     ** care of.  */
7699     for(i=0; i<nNew; i++){
7700       u32 key = get4byte(&apNew[i]->aData[8]);
7701       ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
7702     }
7703   }
7704 
7705   assert( pParent->isInit );
7706   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
7707           nOld, nNew, b.nCell));
7708 
7709   /* Free any old pages that were not reused as new pages.
7710   */
7711   for(i=nNew; i<nOld; i++){
7712     freePage(apOld[i], &rc);
7713   }
7714 
7715 #if 0
7716   if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
7717     /* The ptrmapCheckPages() contains assert() statements that verify that
7718     ** all pointer map pages are set correctly. This is helpful while
7719     ** debugging. This is usually disabled because a corrupt database may
7720     ** cause an assert() statement to fail.  */
7721     ptrmapCheckPages(apNew, nNew);
7722     ptrmapCheckPages(&pParent, 1);
7723   }
7724 #endif
7725 
7726   /*
7727   ** Cleanup before returning.
7728   */
7729 balance_cleanup:
7730   sqlite3ScratchFree(b.apCell);
7731   for(i=0; i<nOld; i++){
7732     releasePage(apOld[i]);
7733   }
7734   for(i=0; i<nNew; i++){
7735     releasePage(apNew[i]);
7736   }
7737 
7738   return rc;
7739 }
7740 
7741 
7742 /*
7743 ** This function is called when the root page of a b-tree structure is
7744 ** overfull (has one or more overflow cells, i.e. pRoot->nOverflow>0).
7745 **
7746 ** A new child page is allocated and the contents of the current root
7747 ** page, including overflow cells, are copied into the child. The root
7748 ** page is then overwritten to make it an empty page with the right-child
7749 ** pointer pointing to the new page.
7750 **
7751 ** Before returning, all pointer-map entries corresponding to pages
7752 ** that the new child-page now contains pointers to are updated. The
7753 ** entry corresponding to the new right-child pointer of the root
7754 ** page is also updated (only when ISAUTOVACUUM is true).
7755 **
7756 ** If successful, *ppChild is set to contain a reference to the child
7757 ** page and SQLITE_OK is returned. In this case the caller is required
7758 ** to call releasePage() on *ppChild exactly once. If an error occurs,
7759 ** an error code is returned and *ppChild is set to 0.
7760 */
7761 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
7762   int rc;                        /* Return value from subprocedures */
7763   MemPage *pChild = 0;           /* Pointer to a new child page */
7764   Pgno pgnoChild = 0;            /* Page number of the new child page */
7765   BtShared *pBt = pRoot->pBt;    /* The BTree */
7766 
7767   assert( pRoot->nOverflow>0 );
7768   assert( sqlite3_mutex_held(pBt->mutex) );
7769 
7770   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
7771   ** page that will become the new right-child of pRoot. Copy the contents
7772   ** of the node stored on pRoot into the new child page. The same rc is
7773   ** handed on to copyNodeContent() and ptrmapPut() by address. */
7774   rc = sqlite3PagerWrite(pRoot->pDbPage);
7775   if( rc==SQLITE_OK ){
7776     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
7777     copyNodeContent(pRoot, pChild, &rc);  /* NOTE(review): presumably a no-op when rc is already an error - confirm */
7778     if( ISAUTOVACUUM ){
7779       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);  /* Entry for the new child, parent is pRoot */
7780     }
7781   }
7782   if( rc ){                      /* Any failure above: hand back no child */
7783     *ppChild = 0;
7784     releasePage(pChild);         /* pChild may still be 0 at this point */
7785     return rc;
7786   }
7787   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
7788   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
7789   assert( pChild->nCell==pRoot->nCell );
7790 
7791   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
7792 
7793   /* Copy the overflow cells from pRoot to pChild */
7794   memcpy(pChild->aiOvfl, pRoot->aiOvfl,
7795          pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
7796   memcpy(pChild->apOvfl, pRoot->apOvfl,
7797          pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
7798   pChild->nOverflow = pRoot->nOverflow;
7799 
7800   /* Zero the contents of pRoot. Then install pChild as the right-child. */
7801   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);  /* Child's flag byte with PTF_LEAF cleared */
7802   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
7803 
7804   *ppChild = pChild;             /* Caller now owns the page reference */
7805   return SQLITE_OK;
7806 }
7807 
7808 /*
7809 ** The page that pCur currently points to has just been modified in
7810 ** some way. Starting there and moving up one level per iteration, decide
7811 ** whether each page needs balancing and call the appropriate routine.
7812 ** Returns SQLITE_OK or the first error code. Balancing routines are:
7813 **
7814 **   balance_quick()
7815 **   balance_deeper()
7816 **   balance_nonroot()
7817 */
7818 static int balance(BtCursor *pCur){
7819   int rc = SQLITE_OK;
7820   const int nMin = pCur->pBt->usableSize * 2 / 3;  /* Free-space limit for non-root pages */
7821   u8 aBalanceQuickSpace[13];     /* Buffer handed to balance_quick() */
7822   u8 *pFree = 0;                 /* pSpace buffer from an earlier iteration, not yet freed */
7823 
7824   VVA_ONLY( int balance_quick_called = 0 );
7825   VVA_ONLY( int balance_deeper_called = 0 );
7826 
7827   do {
7828     int iPage = pCur->iPage;
7829     MemPage *pPage = pCur->apPage[iPage];
7830 
7831     if( iPage==0 ){
7832       if( pPage->nOverflow ){
7833         /* The root page of the b-tree is overfull. In this case call the
7834         ** balance_deeper() function to create a new child for the root-page
7835         ** and copy the current contents of the root-page to it. The
7836         ** next iteration of the do-loop will balance the child page.
7837         */
7838         assert( balance_deeper_called==0 );
7839         VVA_ONLY( balance_deeper_called++ );
7840         rc = balance_deeper(pPage, &pCur->apPage[1]);
7841         if( rc==SQLITE_OK ){
7842           pCur->iPage = 1;       /* Cursor now points at the new child */
7843           pCur->aiIdx[0] = 0;
7844           pCur->aiIdx[1] = 0;
7845           assert( pCur->apPage[1]->nOverflow );
7846         }
7847       }else{
7848         break;                   /* Root without overflow cells: done */
7849       }
7850     }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
7851       break;                     /* Non-root page needs no balancing: done */
7852     }else{
7853       MemPage * const pParent = pCur->apPage[iPage-1];
7854       int const iIdx = pCur->aiIdx[iPage-1];
7855 
7856       rc = sqlite3PagerWrite(pParent->pDbPage);
7857       if( rc==SQLITE_OK ){
7858 #ifndef SQLITE_OMIT_QUICKBALANCE
7859         if( pPage->intKeyLeaf              /* Leaf holding key+data */
7860          && pPage->nOverflow==1            /* Exactly one overflow cell... */
7861          && pPage->aiOvfl[0]==pPage->nCell /* ...whose index is past the last cell */
7862          && pParent->pgno!=1
7863          && pParent->nCell==iIdx           /* presumably: pPage is the right-most child - confirm */
7864         ){
7865           /* Call balance_quick() to create a new sibling of pPage on which
7866           ** to store the overflow cell. balance_quick() inserts a new cell
7867           ** into pParent, which may cause pParent overflow. If this
7868           ** happens, the next iteration of the do-loop will balance pParent
7869           ** using either balance_nonroot() or balance_deeper(). Until this
7870           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
7871           ** buffer.
7872           **
7873           ** The purpose of the following assert() is to check that only a
7874           ** single call to balance_quick() is made for each call to this
7875           ** function. If this were not verified, a subtle bug involving reuse
7876           ** of the aBalanceQuickSpace[] might sneak in.
7877           */
7878           assert( balance_quick_called==0 );
7879           VVA_ONLY( balance_quick_called++ );
7880           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
7881         }else
7882 #endif
7883         {
7884           /* In this case, call balance_nonroot() to redistribute cells
7885           ** between pPage and up to 2 of its sibling pages. This involves
7886           ** modifying the contents of pParent, which may cause pParent to
7887           ** become overfull or underfull. The next iteration of the do-loop
7888           ** will balance the parent page to correct this.
7889           **
7890           ** If the parent page becomes overfull, the overflow cell or cells
7891           ** are stored in the pSpace buffer allocated immediately below.
7892           ** A subsequent iteration of the do-loop will deal with this by
7893           ** calling balance_nonroot() (balance_deeper() may be called first,
7894           ** but it doesn't deal with overflow cells - just moves them to a
7895           ** different page). Once this subsequent call to balance_nonroot()
7896           ** has completed, it is safe to release the pSpace buffer used by
7897           ** the previous call, as the overflow cell data will have been
7898           ** copied either into the body of a database page or into the new
7899           ** pSpace buffer passed to the latter call to balance_nonroot().
7900           ** NOTE(review): pSpace is not NULL-checked here - confirm callee copes. */
7901           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
7902           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
7903                                pCur->hints&BTREE_BULKLOAD);
7904           if( pFree ){
7905             /* If pFree is not NULL, it points to the pSpace buffer used
7906             ** by a previous call to balance_nonroot(). Its contents are
7907             ** now stored either on real database pages or within the
7908             ** new pSpace buffer, so it may be safely freed here. */
7909             sqlite3PageFree(pFree);
7910           }
7911 
7912           /* The pSpace buffer will be freed after the next call to
7913           ** balance_nonroot(), or just before this function returns, whichever
7914           ** comes first. */
7915           pFree = pSpace;
7916         }
7917       }
7918 
7919       pPage->nOverflow = 0;      /* Cleared on success and on error alike */
7920 
7921       /* The next iteration of the do-loop balances the parent page. */
7922       releasePage(pPage);
7923       pCur->iPage--;
7924       assert( pCur->iPage>=0 );
7925     }
7926   }while( rc==SQLITE_OK );
7927 
7928   if( pFree ){                   /* Release the last pSpace buffer, if any */
7929     sqlite3PageFree(pFree);
7930   }
7931   return rc;
7932 }
7933 
7934 
7935 /*
7936 ** Insert a new record into the BTree.  The content of the new record
7937 ** is described by the pX object.  The pCur cursor is used only to
7938 ** define what table the record should be inserted into, and is left
7939 ** pointing at a random location.
7940 **
7941 ** For a table btree (used for rowid tables), only the pX.nKey value of
7942 ** the key is used. The pX.pKey value must be NULL.  The pX.nKey is the
7943 ** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
7944 ** hold the content of the row.
7945 **
7946 ** For an index btree (used for indexes and WITHOUT ROWID tables), the
7947 ** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The
7948 ** pX.pData,nData,nZero fields must be zero.
7949 **
7950 ** If the seekResult parameter is non-zero, then a successful call to
7951 ** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
7952 ** been performed.  In other words, if seekResult!=0 then the cursor
7953 ** is currently pointing to a cell that will be adjacent to the cell
7954 ** to be inserted.  If seekResult<0 then pCur points to a cell that is
7955 ** smaller then (pKey,nKey).  If seekResult>0 then pCur points to a cell
7956 ** that is larger than (pKey,nKey).
7957 **
7958 ** If seekResult==0, that means pCur is pointing at some unknown location.
7959 ** In that case, this routine must seek the cursor to the correct insertion
7960 ** point for (pKey,nKey) before doing the insertion.  For index btrees,
7961 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
7962 ** key values and pX->aMem can be used instead of pX->pKey to avoid having
7963 ** to decode the key.
7964 */
7965 int sqlite3BtreeInsert(
7966   BtCursor *pCur,                /* Insert data into the table of this cursor */
7967   const BtreePayload *pX,        /* Content of the row to be inserted */
7968   int flags,                     /* True if this is likely an append */
7969   int seekResult                 /* Result of prior MovetoUnpacked() call */
7970 ){
7971   int rc;
7972   int loc = seekResult;          /* -1: before desired location  +1: after */
7973   int szNew = 0;
7974   int idx;
7975   MemPage *pPage;
7976   Btree *p = pCur->pBtree;
7977   BtShared *pBt = p->pBt;
7978   unsigned char *oldCell;
7979   unsigned char *newCell = 0;
7980 
7981   assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND))==flags );
7982 
7983   if( pCur->eState==CURSOR_FAULT ){
7984     assert( pCur->skipNext!=SQLITE_OK );
7985     return pCur->skipNext;
7986   }
7987 
7988   assert( cursorOwnsBtShared(pCur) );
7989   assert( (pCur->curFlags & BTCF_WriteFlag)!=0
7990               && pBt->inTransaction==TRANS_WRITE
7991               && (pBt->btsFlags & BTS_READ_ONLY)==0 );
7992   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
7993 
7994   /* Assert that the caller has been consistent. If this cursor was opened
7995   ** expecting an index b-tree, then the caller should be inserting blob
7996   ** keys with no associated data. If the cursor was opened expecting an
7997   ** intkey table, the caller should be inserting integer keys with a
7998   ** blob of associated data.  */
7999   assert( (pX->pKey==0)==(pCur->pKeyInfo==0) );
8000 
8001   /* Save the positions of any other cursors open on this table.
8002   **
8003   ** In some cases, the call to btreeMoveto() below is a no-op. For
8004   ** example, when inserting data into a table with auto-generated integer
8005   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
8006   ** integer key to use. It then calls this function to actually insert the
8007   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
8008   ** that the cursor is already where it needs to be and returns without
8009   ** doing any work. To avoid thwarting these optimizations, it is important
8010   ** not to clear the cursor here.
8011   */
8012   if( pCur->curFlags & BTCF_Multiple ){
8013     rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
8014     if( rc ) return rc;
8015   }
8016 
8017   if( pCur->pKeyInfo==0 ){
8018     assert( pX->pKey==0 );
8019     /* If this is an insert into a table b-tree, invalidate any incrblob
8020     ** cursors open on the row being replaced */
8021     invalidateIncrblobCursors(p, pX->nKey, 0);
8022 
8023     /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
8024     ** to a row with the same key as the new entry being inserted.  */
8025     assert( (flags & BTREE_SAVEPOSITION)==0 ||
8026             ((pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey) );
8027 
8028     /* If the cursor is currently on the last row and we are appending a
8029     ** new row onto the end, set the "loc" to avoid an unnecessary
8030     ** btreeMoveto() call */
8031     if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
8032       loc = 0;
8033     }else if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey>0
8034                && pCur->info.nKey==pX->nKey-1 ){
8035       loc = -1;
8036     }else if( loc==0 ){
8037       rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc);
8038       if( rc ) return rc;
8039     }
8040   }else if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
8041     if( pX->nMem ){
8042       UnpackedRecord r;
8043       r.pKeyInfo = pCur->pKeyInfo;
8044       r.aMem = pX->aMem;
8045       r.nField = pX->nMem;
8046       r.default_rc = 0;
8047       r.errCode = 0;
8048       r.r1 = 0;
8049       r.r2 = 0;
8050       r.eqSeen = 0;
8051       rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc);
8052     }else{
8053       rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc);
8054     }
8055     if( rc ) return rc;
8056   }
8057   assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
8058 
8059   pPage = pCur->apPage[pCur->iPage];
8060   assert( pPage->intKey || pX->nKey>=0 );
8061   assert( pPage->leaf || !pPage->intKey );
8062 
8063   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
8064           pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
8065           loc==0 ? "overwrite" : "new entry"));
8066   assert( pPage->isInit );
8067   newCell = pBt->pTmpSpace;
8068   assert( newCell!=0 );
8069   rc = fillInCell(pPage, newCell, pX, &szNew);
8070   if( rc ) goto end_insert;
8071   assert( szNew==pPage->xCellSize(pPage, newCell) );
8072   assert( szNew <= MX_CELL_SIZE(pBt) );
8073   idx = pCur->aiIdx[pCur->iPage];
8074   if( loc==0 ){
8075     CellInfo info;
8076     assert( idx<pPage->nCell );
8077     rc = sqlite3PagerWrite(pPage->pDbPage);
8078     if( rc ){
8079       goto end_insert;
8080     }
8081     oldCell = findCell(pPage, idx);
8082     if( !pPage->leaf ){
8083       memcpy(newCell, oldCell, 4);
8084     }
8085     rc = clearCell(pPage, oldCell, &info);
8086     if( info.nSize==szNew && info.nLocal==info.nPayload ){
8087       /* Overwrite the old cell with the new if they are the same size.
8088       ** We could also try to do this if the old cell is smaller, then add
8089       ** the leftover space to the free list.  But experiments show that
8090       ** doing that is no faster then skipping this optimization and just
8091       ** calling dropCell() and insertCell(). */
8092       assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
8093       if( oldCell+szNew > pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
8094       memcpy(oldCell, newCell, szNew);
8095       return SQLITE_OK;
8096     }
8097     dropCell(pPage, idx, info.nSize, &rc);
8098     if( rc ) goto end_insert;
8099   }else if( loc<0 && pPage->nCell>0 ){
8100     assert( pPage->leaf );
8101     idx = ++pCur->aiIdx[pCur->iPage];
8102   }else{
8103     assert( pPage->leaf );
8104   }
8105   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
8106   assert( pPage->nOverflow==0 || rc==SQLITE_OK );
8107   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
8108 
8109   /* If no error has occurred and pPage has an overflow cell, call balance()
8110   ** to redistribute the cells within the tree. Since balance() may move
8111   ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
8112   ** variables.
8113   **
8114   ** Previous versions of SQLite called moveToRoot() to move the cursor
8115   ** back to the root page as balance() used to invalidate the contents
8116   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
8117   ** set the cursor state to "invalid". This makes common insert operations
8118   ** slightly faster.
8119   **
8120   ** There is a subtle but important optimization here too. When inserting
8121   ** multiple records into an intkey b-tree using a single cursor (as can
8122   ** happen while processing an "INSERT INTO ... SELECT" statement), it
8123   ** is advantageous to leave the cursor pointing to the last entry in
8124   ** the b-tree if possible. If the cursor is left pointing to the last
8125   ** entry in the table, and the next row inserted has an integer key
8126   ** larger than the largest existing key, it is possible to insert the
8127   ** row without seeking the cursor. This can be a big performance boost.
8128   */
8129   pCur->info.nSize = 0;
8130   if( pPage->nOverflow ){
8131     assert( rc==SQLITE_OK );
8132     pCur->curFlags &= ~(BTCF_ValidNKey);
8133     rc = balance(pCur);
8134 
8135     /* Must make sure nOverflow is reset to zero even if the balance()
8136     ** fails. Internal data structure corruption will result otherwise.
8137     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
8138     ** from trying to save the current position of the cursor.  */
8139     pCur->apPage[pCur->iPage]->nOverflow = 0;
8140     pCur->eState = CURSOR_INVALID;
8141     if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
8142       rc = moveToRoot(pCur);
8143       if( pCur->pKeyInfo ){
8144         assert( pCur->pKey==0 );
8145         pCur->pKey = sqlite3Malloc( pX->nKey );
8146         if( pCur->pKey==0 ){
8147           rc = SQLITE_NOMEM;
8148         }else{
8149           memcpy(pCur->pKey, pX->pKey, pX->nKey);
8150         }
8151       }
8152       pCur->eState = CURSOR_REQUIRESEEK;
8153       pCur->nKey = pX->nKey;
8154     }
8155   }
8156   assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
8157 
8158 end_insert:
8159   return rc;
8160 }
8161 
/*
** Delete the entry that the cursor is pointing to.
**
** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
** the cursor is left pointing at an arbitrary location after the delete.
** But if that bit is set, then the cursor is left in a state such that
** the next call to BtreeNext() or BtreePrev() moves it to the same row
** as it would have been on if the call to BtreeDelete() had been omitted.
**
** The BTREE_AUXDELETE bit of flags indicates that this is one of several
** deletes associated with a single table entry and its indexes.  Only one
** of those deletes is considered the "primary" delete.  The primary delete
** occurs on a cursor that is not a BTREE_FORDELETE cursor.  All but one
** delete operation on non-FORDELETE cursors is tagged with the AUXDELETE
** flag.  The BTREE_AUXDELETE bit is a hint that is not used by this
** implementation, but which might be used by alternative storage engines.
**
** The cursor must be in the CURSOR_VALID state and must be a write cursor
** inside a write transaction (both are asserted).  Returns SQLITE_OK on
** success, SQLITE_CORRUPT if the replacement cell taken from a leaf lies
** within the first four bytes of that leaf, or the error code of
** whichever helper failed.
*/
int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
  Btree *p = pCur->pBtree;
  BtShared *pBt = p->pBt;
  int rc;                              /* Return code */
  MemPage *pPage;                      /* Page to delete cell from */
  unsigned char *pCell;                /* Pointer to cell to delete */
  int iCellIdx;                        /* Index of cell to delete */
  int iCellDepth;                      /* Depth of node containing pCell */
  CellInfo info;                       /* Size of the cell being deleted */
  int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
  u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */

  assert( cursorOwnsBtShared(pCur) );
  assert( pBt->inTransaction==TRANS_WRITE );
  assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  assert( pCur->curFlags & BTCF_WriteFlag );
  assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
  assert( !hasReadConflicts(p, pCur->pgnoRoot) );
  assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  assert( pCur->eState==CURSOR_VALID );
  assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );

  /* Remember where the doomed cell lives.  The cursor may be moved below,
  ** so these values are captured before anything else happens. */
  iCellDepth = pCur->iPage;
  iCellIdx = pCur->aiIdx[iCellDepth];
  pPage = pCur->apPage[iCellDepth];
  pCell = findCell(pPage, iCellIdx);

  /* If the bPreserve flag is set to true, then the cursor position must
  ** be preserved following this delete operation. If the current delete
  ** will cause a b-tree rebalance, then this is done by saving the cursor
  ** key and leaving the cursor in CURSOR_REQUIRESEEK state before
  ** returning.
  **
  ** Or, if the current delete will not cause a rebalance, then the cursor
  ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
  ** before or after the deleted entry. In this case set bSkipnext to true.  */
  if( bPreserve ){
    if( !pPage->leaf
     || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
    ){
      /* A b-tree rebalance will be required after deleting this entry.
      ** Save the cursor key.  */
      rc = saveCursorKey(pCur);
      if( rc ) return rc;
    }else{
      bSkipnext = 1;
    }
  }

  /* If the page containing the entry to delete is not a leaf page, move
  ** the cursor to the largest entry in the tree that is smaller than
  ** the entry being deleted. This cell will replace the cell being deleted
  ** from the internal node. The 'previous' entry is used for this instead
  ** of the 'next' entry, as the previous entry is always a part of the
  ** sub-tree headed by the child page of the cell being deleted. This makes
  ** balancing the tree following the delete operation easier.  */
  if( !pPage->leaf ){
    int notUsed = 0;
    rc = sqlite3BtreePrevious(pCur, &notUsed);
    if( rc ) return rc;
  }

  /* Save the positions of any other cursors open on this table before
  ** making any modifications.  */
  if( pCur->curFlags & BTCF_Multiple ){
    rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
    if( rc ) return rc;
  }

  /* If this is a delete operation to remove a row from a table b-tree,
  ** invalidate any incrblob cursors open on the row being deleted.  */
  if( pCur->pKeyInfo==0 ){
    invalidateIncrblobCursors(p, pCur->info.nKey, 0);
  }

  /* Make the page containing the entry to be deleted writable. Then free any
  ** overflow pages associated with the entry and finally remove the cell
  ** itself from within the page.  */
  rc = sqlite3PagerWrite(pPage->pDbPage);
  if( rc ) return rc;
  rc = clearCell(pPage, pCell, &info);
  /* dropCell() receives the rc from clearCell() and reports any failure
  ** through the same variable, so a single check covers both calls. */
  dropCell(pPage, iCellIdx, info.nSize, &rc);
  if( rc ) return rc;

  /* If the cell deleted was not located on a leaf page, then the cursor
  ** is currently pointing to the largest entry in the sub-tree headed
  ** by the child-page of the cell that was just deleted from an internal
  ** node. The cell from the leaf node needs to be moved to the internal
  ** node to replace the deleted cell.  */
  if( !pPage->leaf ){
    MemPage *pLeaf = pCur->apPage[pCur->iPage];
    int nCell;
    Pgno n = pCur->apPage[iCellDepth+1]->pgno;
    unsigned char *pTmp;

    pCell = findCell(pLeaf, pLeaf->nCell-1);
    /* insertCell() below is handed pCell-4, so the cell must start at
    ** least four bytes into the leaf page image. */
    if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
    nCell = pLeaf->xCellSize(pLeaf, pCell);
    assert( MX_CELL_SIZE(pBt) >= nCell );
    pTmp = pBt->pTmpSpace;
    assert( pTmp!=0 );
    rc = sqlite3PagerWrite(pLeaf->pDbPage);
    if( rc==SQLITE_OK ){
      /* The inserted cell is four bytes longer than the leaf cell; n, the
      ** page number of the child just below the internal node, is passed
      ** along with it.  NOTE(review): presumably insertCell() stores n in
      ** those four leading bytes - confirm against insertCell(). */
      insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
    }
    dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
    if( rc ) return rc;
  }

  /* Balance the tree. If the entry deleted was located on a leaf page,
  ** then the cursor still points to that page. In this case the first
  ** call to balance() repairs the tree, and the if(...) condition is
  ** never true.
  **
  ** Otherwise, if the entry deleted was on an internal node page, then
  ** pCur is pointing to the leaf page from which a cell was removed to
  ** replace the cell deleted from the internal node. This is slightly
  ** tricky as the leaf node may be underfull, and the internal node may
  ** be either under or overfull. In this case run the balancing algorithm
  ** on the leaf node first. If the balance proceeds far enough up the
  ** tree that we can be sure that any problem in the internal node has
  ** been corrected, so be it. Otherwise, after balancing the leaf node,
  ** walk the cursor up the tree to the internal node and balance it as
  ** well.  */
  rc = balance(pCur);
  if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
    while( pCur->iPage>iCellDepth ){
      releasePage(pCur->apPage[pCur->iPage--]);
    }
    rc = balance(pCur);
  }

  if( rc==SQLITE_OK ){
    if( bSkipnext ){
      /* No rebalance was expected, so the cursor can stay on the same
      ** page.  Mark it CURSOR_SKIPNEXT and record in skipNext whether it
      ** now sits before (-1) or after (+1) the position of the deleted
      ** entry. */
      assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
      assert( pPage==pCur->apPage[pCur->iPage] || CORRUPT_DB );
      assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
      pCur->eState = CURSOR_SKIPNEXT;
      if( iCellIdx>=pPage->nCell ){
        /* The deleted cell was the last one on the page: step back onto
        ** the new last cell. */
        pCur->skipNext = -1;
        pCur->aiIdx[iCellDepth] = pPage->nCell-1;
      }else{
        pCur->skipNext = 1;
      }
    }else{
      rc = moveToRoot(pCur);
      if( bPreserve ){
        /* The key was saved by saveCursorKey() above; the cursor must be
        ** re-sought before its next use. */
        pCur->eState = CURSOR_REQUIRESEEK;
      }
    }
  }
  return rc;
}
8332 
/*
** Create a new BTree table.  Write into *piTable the page
** number for the root page of the new table.
**
** The type of table is determined by the createTabFlags parameter.  Only
** the following values are currently in use.  Other values might not work:
**
**     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
**     BTREE_ZERODATA                  Used for SQL indices
**
** The caller must hold the b-tree mutex and have a write transaction
** open on a database that is not read-only (all asserted).  Returns
** SQLITE_OK on success or the error code of the step that failed; on
** failure *piTable is not written.
*/
static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
  BtShared *pBt = p->pBt;
  MemPage *pRoot;
  Pgno pgnoRoot;
  int rc;
  int ptfFlags;          /* Page-type flags for the root page of new table */

  assert( sqlite3BtreeHoldsMutex(p) );
  assert( pBt->inTransaction==TRANS_WRITE );
  assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

#ifdef SQLITE_OMIT_AUTOVACUUM
  /* Without auto-vacuum support any free page will do for the new root. */
  rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
  if( rc ){
    return rc;
  }
#else
  if( pBt->autoVacuum ){
    Pgno pgnoMove;      /* Move a page here to make room for the root-page */
    MemPage *pPageMove; /* The page to move to. */

    /* Creating a new table may require moving an existing database page
    ** to make room for the new table's root page. In case this page turns
    ** out to be an overflow page, delete all overflow page-map caches
    ** held by open cursors.
    */
    invalidateAllOverflowCache(pBt);

    /* Read the value of meta[3] from the database to determine where the
    ** root page of the new table should go. meta[3] is the largest root-page
    ** created so far, so the new root-page is (meta[3]+1).
    */
    sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
    pgnoRoot++;

    /* The new root-page may not be allocated on a pointer-map page, or the
    ** PENDING_BYTE page.
    */
    while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
        pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
      pgnoRoot++;
    }
    assert( pgnoRoot>=3 || CORRUPT_DB );
    testcase( pgnoRoot<3 );

    /* Allocate a page. The page that currently resides at pgnoRoot will
    ** be moved to the allocated page (unless the allocated page happens
    ** to reside at pgnoRoot).
    */
    rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
    if( rc!=SQLITE_OK ){
      return rc;
    }

    if( pgnoMove!=pgnoRoot ){
      /* pgnoRoot is the page that will be used for the root-page of
      ** the new table (assuming an error did not occur). But we were
      ** allocated pgnoMove. If required (i.e. if it was not allocated
      ** by extending the file), the current page at position pgnoMove
      ** is already journaled.
      */
      u8 eType = 0;
      Pgno iPtrPage = 0;

      /* Save the positions of any open cursors. This is required in
      ** case they are holding a reference to an xFetch reference
      ** corresponding to page pgnoRoot.  */
      rc = saveAllCursors(pBt, 0, 0);
      /* The reference to the allocated page is dropped whether or not
      ** saveAllCursors() succeeded; only its page number is used below. */
      releasePage(pPageMove);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      /* Move the page currently at pgnoRoot to pgnoMove. */
      rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
      /* The page occupying the target slot must not be recorded as a root
      ** page or a free page; either means the pointer map is corrupt. */
      if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
        rc = SQLITE_CORRUPT_BKPT;
      }
      if( rc!=SQLITE_OK ){
        releasePage(pRoot);
        return rc;
      }
      assert( eType!=PTRMAP_ROOTPAGE );
      assert( eType!=PTRMAP_FREEPAGE );
      rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
      releasePage(pRoot);

      /* Obtain the page at pgnoRoot */
      if( rc!=SQLITE_OK ){
        return rc;
      }
      rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      rc = sqlite3PagerWrite(pRoot->pDbPage);
      if( rc!=SQLITE_OK ){
        releasePage(pRoot);
        return rc;
      }
    }else{
      /* The allocator returned exactly the page that was asked for. */
      pRoot = pPageMove;
    }

    /* Update the pointer-map and meta-data with the new root-page number. */
    ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
    if( rc ){
      releasePage(pRoot);
      return rc;
    }

    /* When the new root page was allocated, page 1 was made writable in
    ** order either to increase the database filesize, or to decrement the
    ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
    **
    ** NOTE(review): the literal 4 is presumably the same slot as
    ** BTREE_LARGEST_ROOT_PAGE read above - confirm against btree.h.
    */
    assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
    rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
    if( NEVER(rc) ){
      releasePage(pRoot);
      return rc;
    }

  }else{
    rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
    if( rc ) return rc;
  }
#endif
  assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
  /* Both table kinds start out as a single empty leaf page. */
  if( createTabFlags & BTREE_INTKEY ){
    ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
  }else{
    ptfFlags = PTF_ZERODATA | PTF_LEAF;
  }
  zeroPage(pRoot, ptfFlags);
  sqlite3PagerUnref(pRoot->pDbPage);
  assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
  *piTable = (int)pgnoRoot;
  return SQLITE_OK;
}
8487 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
8488   int rc;
8489   sqlite3BtreeEnter(p);
8490   rc = btreeCreateTable(p, piTable, flags);
8491   sqlite3BtreeLeave(p);
8492   return rc;
8493 }
8494 
8495 /*
8496 ** Erase the given database page and all its children.  Return
8497 ** the page to the freelist.
8498 */
8499 static int clearDatabasePage(
8500   BtShared *pBt,           /* The BTree that contains the table */
8501   Pgno pgno,               /* Page number to clear */
8502   int freePageFlag,        /* Deallocate page if true */
8503   int *pnChange            /* Add number of Cells freed to this counter */
8504 ){
8505   MemPage *pPage;
8506   int rc;
8507   unsigned char *pCell;
8508   int i;
8509   int hdr;
8510   CellInfo info;
8511 
8512   assert( sqlite3_mutex_held(pBt->mutex) );
8513   if( pgno>btreePagecount(pBt) ){
8514     return SQLITE_CORRUPT_BKPT;
8515   }
8516   rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
8517   if( rc ) return rc;
8518   if( pPage->bBusy ){
8519     rc = SQLITE_CORRUPT_BKPT;
8520     goto cleardatabasepage_out;
8521   }
8522   pPage->bBusy = 1;
8523   hdr = pPage->hdrOffset;
8524   for(i=0; i<pPage->nCell; i++){
8525     pCell = findCell(pPage, i);
8526     if( !pPage->leaf ){
8527       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
8528       if( rc ) goto cleardatabasepage_out;
8529     }
8530     rc = clearCell(pPage, pCell, &info);
8531     if( rc ) goto cleardatabasepage_out;
8532   }
8533   if( !pPage->leaf ){
8534     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
8535     if( rc ) goto cleardatabasepage_out;
8536   }else if( pnChange ){
8537     assert( pPage->intKey || CORRUPT_DB );
8538     testcase( !pPage->intKey );
8539     *pnChange += pPage->nCell;
8540   }
8541   if( freePageFlag ){
8542     freePage(pPage, &rc);
8543   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
8544     zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
8545   }
8546 
8547 cleardatabasepage_out:
8548   pPage->bBusy = 0;
8549   releasePage(pPage);
8550   return rc;
8551 }
8552 
8553 /*
8554 ** Delete all information from a single table in the database.  iTable is
8555 ** the page number of the root of the table.  After this routine returns,
8556 ** the root page is empty, but still exists.
8557 **
8558 ** This routine will fail with SQLITE_LOCKED if there are any open
8559 ** read cursors on the table.  Open write cursors are moved to the
8560 ** root of the table.
8561 **
8562 ** If pnChange is not NULL, then table iTable must be an intkey table. The
8563 ** integer value pointed to by pnChange is incremented by the number of
8564 ** entries in the table.
8565 */
8566 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
8567   int rc;
8568   BtShared *pBt = p->pBt;
8569   sqlite3BtreeEnter(p);
8570   assert( p->inTrans==TRANS_WRITE );
8571 
8572   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
8573 
8574   if( SQLITE_OK==rc ){
8575     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
8576     ** is the root of a table b-tree - if it is not, the following call is
8577     ** a no-op).  */
8578     invalidateIncrblobCursors(p, 0, 1);
8579     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
8580   }
8581   sqlite3BtreeLeave(p);
8582   return rc;
8583 }
8584 
8585 /*
8586 ** Delete all information from the single table that pCur is open on.
8587 **
8588 ** This routine only work for pCur on an ephemeral table.
8589 */
8590 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
8591   return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
8592 }
8593 
/*
** Erase all information in a table and add the root of the table to
** the freelist.  Except, the root of the principal table (the one on
** page 1) is never added to the freelist.
**
** This routine will fail with SQLITE_LOCKED if there are any open
** cursors on the table.
**
** If AUTOVACUUM is enabled and the page at iTable is not the last
** root page in the database file, then the last root page
** in the database file is moved into the slot formerly occupied by
** iTable and that last slot formerly occupied by the last root page
** is added to the freelist instead of iTable.  In this way, all
** root pages are kept at the beginning of the database file, which
** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
** page number that used to be the last root page in the file before
** the move.  If no page gets moved, *piMoved is set to 0.
** The last root page is recorded in meta[3] and the value of
** meta[3] is updated by this procedure.
**
** The caller must hold the b-tree mutex and have a write transaction
** open (both asserted).  Returns SQLITE_OK on success or the error code
** of the step that failed.  *piMoved is only written once the table
** contents have been cleared successfully.
*/
static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
  int rc;
  MemPage *pPage = 0;
  BtShared *pBt = p->pBt;

  assert( sqlite3BtreeHoldsMutex(p) );
  assert( p->inTrans==TRANS_WRITE );
  assert( iTable>=2 );

  /* Take a reference to the root page first, then empty the table.  The
  ** reference is released on every path below. */
  rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
  if( rc ) return rc;
  rc = sqlite3BtreeClearTable(p, iTable, 0);
  if( rc ){
    releasePage(pPage);
    return rc;
  }

  *piMoved = 0;

#ifdef SQLITE_OMIT_AUTOVACUUM
  freePage(pPage, &rc);
  releasePage(pPage);
#else
  if( pBt->autoVacuum ){
    Pgno maxRootPgno;
    sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);

    if( iTable==maxRootPgno ){
      /* If the table being dropped is the table with the largest root-page
      ** number in the database, put the root page on the free list.
      */
      freePage(pPage, &rc);
      releasePage(pPage);
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }else{
      /* The table being dropped does not have the largest root-page
      ** number in the database. So move the page that does into the
      ** gap left by the deleted root-page.
      */
      MemPage *pMove;
      releasePage(pPage);
      rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
      releasePage(pMove);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      /* Fetch the now-vacated slot at maxRootPgno again and put it on the
      ** freelist.  rc from btreeGetPage() is handed to freePage(), and
      ** pMove was reset to 0 before the fetch.  NOTE(review): presumably
      ** freePage() and releasePage() tolerate a failed fetch - confirm. */
      pMove = 0;
      rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
      freePage(pMove, &rc);
      releasePage(pMove);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      *piMoved = maxRootPgno;
    }

    /* Set the new 'max-root-page' value in the database header. This
    ** is the old value less one, less one more if that happens to
    ** be a pointer-map page, less one again if that is the
    ** PENDING_BYTE_PAGE.
    */
    maxRootPgno--;
    while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
           || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
      maxRootPgno--;
    }
    assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );

    /* NOTE(review): the literal 4 is presumably the same slot as
    ** BTREE_LARGEST_ROOT_PAGE read above - confirm against btree.h. */
    rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
  }else{
    freePage(pPage, &rc);
    releasePage(pPage);
  }
#endif
  return rc;
}
8696 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
8697   int rc;
8698   sqlite3BtreeEnter(p);
8699   rc = btreeDropTable(p, iTable, piMoved);
8700   sqlite3BtreeLeave(p);
8701   return rc;
8702 }
8703 
8704 
8705 /*
8706 ** This function may only be called if the b-tree connection already
8707 ** has a read or write transaction open on the database.
8708 **
8709 ** Read the meta-information out of a database file.  Meta[0]
8710 ** is the number of free pages currently in the database.  Meta[1]
8711 ** through meta[15] are available for use by higher layers.  Meta[0]
8712 ** is read-only, the others are read/write.
8713 **
8714 ** The schema layer numbers meta values differently.  At the schema
8715 ** layer (and the SetCookie and ReadCookie opcodes) the number of
8716 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
8717 **
8718 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
8719 ** of reading the value out of the header, it instead loads the "DataVersion"
8720 ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
8721 ** database file.  It is a number computed by the pager.  But its access
8722 ** pattern is the same as header meta values, and so it is convenient to
8723 ** read it from this routine.
8724 */
8725 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
8726   BtShared *pBt = p->pBt;
8727 
8728   sqlite3BtreeEnter(p);
8729   assert( p->inTrans>TRANS_NONE );
8730   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
8731   assert( pBt->pPage1 );
8732   assert( idx>=0 && idx<=15 );
8733 
8734   if( idx==BTREE_DATA_VERSION ){
8735     *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
8736   }else{
8737     *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
8738   }
8739 
8740   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
8741   ** database, mark the database as read-only.  */
8742 #ifdef SQLITE_OMIT_AUTOVACUUM
8743   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
8744     pBt->btsFlags |= BTS_READ_ONLY;
8745   }
8746 #endif
8747 
8748   sqlite3BtreeLeave(p);
8749 }
8750 
8751 /*
8752 ** Write meta-information back into the database.  Meta[0] is
8753 ** read-only and may not be written.
8754 */
8755 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
8756   BtShared *pBt = p->pBt;
8757   unsigned char *pP1;
8758   int rc;
8759   assert( idx>=1 && idx<=15 );
8760   sqlite3BtreeEnter(p);
8761   assert( p->inTrans==TRANS_WRITE );
8762   assert( pBt->pPage1!=0 );
8763   pP1 = pBt->pPage1->aData;
8764   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
8765   if( rc==SQLITE_OK ){
8766     put4byte(&pP1[36 + idx*4], iMeta);
8767 #ifndef SQLITE_OMIT_AUTOVACUUM
8768     if( idx==BTREE_INCR_VACUUM ){
8769       assert( pBt->autoVacuum || iMeta==0 );
8770       assert( iMeta==0 || iMeta==1 );
8771       pBt->incrVacuum = (u8)iMeta;
8772     }
8773 #endif
8774   }
8775   sqlite3BtreeLeave(p);
8776   return rc;
8777 }
8778 
8779 #ifndef SQLITE_OMIT_BTREECOUNT
8780 /*
8781 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
8782 ** number of entries in the b-tree and write the result to *pnEntry.
8783 **
8784 ** SQLITE_OK is returned if the operation is successfully executed.
8785 ** Otherwise, if an error is encountered (i.e. an IO error or database
8786 ** corruption) an SQLite error code is returned.
8787 */
8788 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
8789   i64 nEntry = 0;                      /* Value to return in *pnEntry */
8790   int rc;                              /* Return code */
8791 
8792   if( pCur->pgnoRoot==0 ){
8793     *pnEntry = 0;
8794     return SQLITE_OK;
8795   }
8796   rc = moveToRoot(pCur);
8797 
8798   /* Unless an error occurs, the following loop runs one iteration for each
8799   ** page in the B-Tree structure (not including overflow pages).
8800   */
8801   while( rc==SQLITE_OK ){
8802     int iIdx;                          /* Index of child node in parent */
8803     MemPage *pPage;                    /* Current page of the b-tree */
8804 
8805     /* If this is a leaf page or the tree is not an int-key tree, then
8806     ** this page contains countable entries. Increment the entry counter
8807     ** accordingly.
8808     */
8809     pPage = pCur->apPage[pCur->iPage];
8810     if( pPage->leaf || !pPage->intKey ){
8811       nEntry += pPage->nCell;
8812     }
8813 
8814     /* pPage is a leaf node. This loop navigates the cursor so that it
8815     ** points to the first interior cell that it points to the parent of
8816     ** the next page in the tree that has not yet been visited. The
8817     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
8818     ** of the page, or to the number of cells in the page if the next page
8819     ** to visit is the right-child of its parent.
8820     **
8821     ** If all pages in the tree have been visited, return SQLITE_OK to the
8822     ** caller.
8823     */
8824     if( pPage->leaf ){
8825       do {
8826         if( pCur->iPage==0 ){
8827           /* All pages of the b-tree have been visited. Return successfully. */
8828           *pnEntry = nEntry;
8829           return moveToRoot(pCur);
8830         }
8831         moveToParent(pCur);
8832       }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
8833 
8834       pCur->aiIdx[pCur->iPage]++;
8835       pPage = pCur->apPage[pCur->iPage];
8836     }
8837 
8838     /* Descend to the child node of the cell that the cursor currently
8839     ** points at. This is the right-child if (iIdx==pPage->nCell).
8840     */
8841     iIdx = pCur->aiIdx[pCur->iPage];
8842     if( iIdx==pPage->nCell ){
8843       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
8844     }else{
8845       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
8846     }
8847   }
8848 
8849   /* An error has occurred. Return an error code. */
8850   return rc;
8851 }
8852 #endif
8853 
8854 /*
8855 ** Return the pager associated with a BTree.  This routine is used for
8856 ** testing and debugging only.
8857 */
8858 Pager *sqlite3BtreePager(Btree *p){
8859   return p->pBt->pPager;
8860 }
8861 
8862 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
8863 /*
8864 ** Append a message to the error message string.
8865 */
8866 static void checkAppendMsg(
8867   IntegrityCk *pCheck,
8868   const char *zFormat,
8869   ...
8870 ){
8871   va_list ap;
8872   if( !pCheck->mxErr ) return;
8873   pCheck->mxErr--;
8874   pCheck->nErr++;
8875   va_start(ap, zFormat);
8876   if( pCheck->errMsg.nChar ){
8877     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
8878   }
8879   if( pCheck->zPfx ){
8880     sqlite3XPrintf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
8881   }
8882   sqlite3VXPrintf(&pCheck->errMsg, zFormat, ap);
8883   va_end(ap);
8884   if( pCheck->errMsg.accError==STRACCUM_NOMEM ){
8885     pCheck->mallocFailed = 1;
8886   }
8887 }
8888 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
8889 
8890 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
8891 
8892 /*
8893 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
8894 ** corresponds to page iPg is already set.
8895 */
8896 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
8897   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
8898   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
8899 }
8900 
8901 /*
8902 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
8903 */
8904 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
8905   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
8906   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
8907 }
8908 
8909 
8910 /*
8911 ** Add 1 to the reference count for page iPage.  If this is the second
8912 ** reference to the page, add an error message to pCheck->zErrMsg.
8913 ** Return 1 if there are 2 or more references to the page and 0 if
8914 ** if this is the first reference to the page.
8915 **
8916 ** Also check that the page number is in bounds.
8917 */
8918 static int checkRef(IntegrityCk *pCheck, Pgno iPage){
8919   if( iPage==0 ) return 1;
8920   if( iPage>pCheck->nPage ){
8921     checkAppendMsg(pCheck, "invalid page number %d", iPage);
8922     return 1;
8923   }
8924   if( getPageReferenced(pCheck, iPage) ){
8925     checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
8926     return 1;
8927   }
8928   setPageReferenced(pCheck, iPage);
8929   return 0;
8930 }
8931 
8932 #ifndef SQLITE_OMIT_AUTOVACUUM
8933 /*
8934 ** Check that the entry in the pointer-map for page iChild maps to
8935 ** page iParent, pointer type ptrType. If not, append an error message
8936 ** to pCheck.
8937 */
8938 static void checkPtrmap(
8939   IntegrityCk *pCheck,   /* Integrity check context */
8940   Pgno iChild,           /* Child page number */
8941   u8 eType,              /* Expected pointer map type */
8942   Pgno iParent           /* Expected pointer map parent page number */
8943 ){
8944   int rc;
8945   u8 ePtrmapType;
8946   Pgno iPtrmapParent;
8947 
8948   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
8949   if( rc!=SQLITE_OK ){
8950     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
8951     checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
8952     return;
8953   }
8954 
8955   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
8956     checkAppendMsg(pCheck,
8957       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
8958       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
8959   }
8960 }
8961 #endif
8962 
8963 /*
8964 ** Check the integrity of the freelist or of an overflow page list.
8965 ** Verify that the number of pages on the list is N.
8966 */
8967 static void checkList(
8968   IntegrityCk *pCheck,  /* Integrity checking context */
8969   int isFreeList,       /* True for a freelist.  False for overflow page list */
8970   int iPage,            /* Page number for first page in the list */
8971   int N                 /* Expected number of pages in the list */
8972 ){
8973   int i;
8974   int expected = N;
8975   int iFirst = iPage;
8976   while( N-- > 0 && pCheck->mxErr ){
8977     DbPage *pOvflPage;
8978     unsigned char *pOvflData;
8979     if( iPage<1 ){
8980       checkAppendMsg(pCheck,
8981          "%d of %d pages missing from overflow list starting at %d",
8982           N+1, expected, iFirst);
8983       break;
8984     }
8985     if( checkRef(pCheck, iPage) ) break;
8986     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
8987       checkAppendMsg(pCheck, "failed to get page %d", iPage);
8988       break;
8989     }
8990     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
8991     if( isFreeList ){
8992       int n = get4byte(&pOvflData[4]);
8993 #ifndef SQLITE_OMIT_AUTOVACUUM
8994       if( pCheck->pBt->autoVacuum ){
8995         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
8996       }
8997 #endif
8998       if( n>(int)pCheck->pBt->usableSize/4-2 ){
8999         checkAppendMsg(pCheck,
9000            "freelist leaf count too big on page %d", iPage);
9001         N--;
9002       }else{
9003         for(i=0; i<n; i++){
9004           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
9005 #ifndef SQLITE_OMIT_AUTOVACUUM
9006           if( pCheck->pBt->autoVacuum ){
9007             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
9008           }
9009 #endif
9010           checkRef(pCheck, iFreePage);
9011         }
9012         N -= n;
9013       }
9014     }
9015 #ifndef SQLITE_OMIT_AUTOVACUUM
9016     else{
9017       /* If this database supports auto-vacuum and iPage is not the last
9018       ** page in this overflow list, check that the pointer-map entry for
9019       ** the following page matches iPage.
9020       */
9021       if( pCheck->pBt->autoVacuum && N>0 ){
9022         i = get4byte(pOvflData);
9023         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
9024       }
9025     }
9026 #endif
9027     iPage = get4byte(pOvflData);
9028     sqlite3PagerUnref(pOvflPage);
9029 
9030     if( isFreeList && N<(iPage!=0) ){
9031       checkAppendMsg(pCheck, "free-page count in header is too small");
9032     }
9033   }
9034 }
9035 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9036 
9037 /*
9038 ** An implementation of a min-heap.
9039 **
9040 ** aHeap[0] is the number of elements on the heap.  aHeap[1] is the
9041 ** root element.  The daughter nodes of aHeap[N] are aHeap[N*2]
9042 ** and aHeap[N*2+1].
9043 **
9044 ** The heap property is this:  Every node is less than or equal to both
9045 ** of its daughter nodes.  A consequence of the heap property is that the
9046 ** root node aHeap[1] is always the minimum value currently in the heap.
9047 **
9048 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto
9049 ** the heap, preserving the heap property.  The btreeHeapPull() routine
9050 ** removes the root element from the heap (the minimum value in the heap)
9051 ** and then moves other nodes around as necessary to preserve the heap
9052 ** property.
9053 **
9054 ** This heap is used for cell overlap and coverage testing.  Each u32
9055 ** entry represents the span of a cell or freeblock on a btree page.
9056 ** The upper 16 bits are the index of the first byte of a range and the
9057 ** lower 16 bits are the index of the last byte of that range.
9058 */
9059 static void btreeHeapInsert(u32 *aHeap, u32 x){
9060   u32 j, i = ++aHeap[0];
9061   aHeap[i] = x;
9062   while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){
9063     x = aHeap[j];
9064     aHeap[j] = aHeap[i];
9065     aHeap[i] = x;
9066     i = j;
9067   }
9068 }
9069 static int btreeHeapPull(u32 *aHeap, u32 *pOut){
9070   u32 j, i, x;
9071   if( (x = aHeap[0])==0 ) return 0;
9072   *pOut = aHeap[1];
9073   aHeap[1] = aHeap[x];
9074   aHeap[x] = 0xffffffff;
9075   aHeap[0]--;
9076   i = 1;
9077   while( (j = i*2)<=aHeap[0] ){
9078     if( aHeap[j]>aHeap[j+1] ) j++;
9079     if( aHeap[i]<aHeap[j] ) break;
9080     x = aHeap[i];
9081     aHeap[i] = aHeap[j];
9082     aHeap[j] = x;
9083     i = j;
9084   }
9085   return 1;
9086 }
9087 
9088 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** Do various sanity checks on a single page of a tree.  Return
** the tree depth.  Leaf pages return 0.  Parents of leaf pages
** return 1, and so forth.  Zero is also returned if iPage is zero,
** if checkRef() rejects the page number, or if the page cannot be
** loaded and initialized.
**
** These checks are done:
**
**      1.  Make sure that cells and freeblocks do not overlap
**          but combine to completely cover the page.
**      2.  Make sure integer cell keys are in order.
**      3.  Check the integrity of overflow pages.
**      4.  Recursively call checkTreePage on all children.
**      5.  Verify that the depth of all children is the same.
**
** Cells are visited from the last cell on the page back to the first,
** after the right-child subtree (if any) has been checked, so integer
** keys are expected to be non-increasing in visiting order.  maxKey is
** lowered to each key as it is seen.  Once the cell loop finishes, the
** final value of maxKey is written to *piMinKey.  *piMinKey is left
** unwritten on the early-exit paths that skip the cell loop.
**
** Problems are reported through checkAppendMsg().  The message prefix
** fields of pCheck (zPfx, v1, v2) are changed while the page is being
** analyzed and are restored before returning from any path that
** modified them.
*/
static int checkTreePage(
  IntegrityCk *pCheck,  /* Context for the sanity check */
  int iPage,            /* Page number of the page to check */
  i64 *piMinKey,        /* Write minimum integer primary key here */
  i64 maxKey            /* Error if integer primary key greater than this */
){
  MemPage *pPage = 0;      /* The page being analyzed */
  int i;                   /* Loop counter */
  int rc;                  /* Result code from subroutine call */
  int depth = -1, d2;      /* Depth of a subtree */
  int pgno;                /* Page number */
  int nFrag;               /* Number of fragmented bytes on the page */
  int hdr;                 /* Offset to the page header */
  int cellStart;           /* Offset to the start of the cell pointer array */
  int nCell;               /* Number of cells */
  int doCoverageCheck = 1; /* True if cell coverage checking should be done */
  int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
                           ** False if IPK must be strictly less than maxKey */
  u8 *data;                /* Page content */
  u8 *pCell;               /* Cell content */
  u8 *pCellIdx;            /* Next element of the cell pointer array */
  BtShared *pBt;           /* The BtShared object that owns pPage */
  u32 pc;                  /* Address of a cell */
  u32 usableSize;          /* Usable size of the page */
  u32 contentOffset;       /* Offset to the start of the cell content area */
  u32 *heap = 0;           /* Min-heap used for checking cell coverage */
  u32 x, prev = 0;         /* Next and previous entry on the min-heap */
  const char *saved_zPfx = pCheck->zPfx;
  int saved_v1 = pCheck->v1;
  int saved_v2 = pCheck->v2;
  u8 savedIsInit = 0;

  /* Check that the page exists
  */
  pBt = pCheck->pBt;
  usableSize = pBt->usableSize;
  if( iPage==0 ) return 0;
  if( checkRef(pCheck, iPage) ) return 0;
  pCheck->zPfx = "Page %d: ";
  pCheck->v1 = iPage;
  if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
    checkAppendMsg(pCheck,
       "unable to get the page. error code=%d", rc);
    goto end_of_check;
  }

  /* Clear MemPage.isInit to make sure the corruption detection code in
  ** btreeInitPage() is executed.  */
  savedIsInit = pPage->isInit;
  pPage->isInit = 0;
  if( (rc = btreeInitPage(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
    checkAppendMsg(pCheck,
                   "btreeInitPage() returns error code %d", rc);
    goto end_of_check;
  }
  data = pPage->aData;
  hdr = pPage->hdrOffset;

  /* Set up for cell analysis */
  pCheck->zPfx = "On tree page %d cell %d: ";
  contentOffset = get2byteNotZero(&data[hdr+5]);
  assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */

  /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
  ** number of cells on the page. */
  nCell = get2byte(&data[hdr+3]);
  assert( pPage->nCell==nCell );

  /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
  ** immediately follows the b-tree page header. */
  cellStart = hdr + 12 - 4*pPage->leaf;
  assert( pPage->aCellIdx==&data[cellStart] );
  /* Start at the last entry of the cell pointer array.  The cell loop
  ** below walks this pointer backwards, two bytes per cell. */
  pCellIdx = &data[cellStart + 2*(nCell-1)];

  if( !pPage->leaf ){
    /* Analyze the right-child page of internal pages */
    pgno = get4byte(&data[hdr+8]);
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum ){
      pCheck->zPfx = "On page %d at right child: ";
      checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
    }
#endif
    depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
    keyCanBeEqual = 0;
  }else{
    /* For leaf pages, the coverage check will occur in the same loop
    ** as the other cell checks, so initialize the heap.  */
    heap = pCheck->heap;
    heap[0] = 0;
  }

  /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
  ** integer offsets to the cell contents. */
  for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
    CellInfo info;

    /* Check cell size */
    pCheck->v2 = i;
    assert( pCellIdx==&data[cellStart + i*2] );
    pc = get2byteAligned(pCellIdx);
    pCellIdx -= 2;
    if( pc<contentOffset || pc>usableSize-4 ){
      checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
                             pc, contentOffset, usableSize-4);
      doCoverageCheck = 0;
      continue;
    }
    pCell = &data[pc];
    pPage->xParseCell(pPage, pCell, &info);
    if( pc+info.nSize>usableSize ){
      checkAppendMsg(pCheck, "Extends off end of page");
      doCoverageCheck = 0;
      continue;
    }

    /* Check for integer primary key out of range */
    if( pPage->intKey ){
      if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
        checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
      }
      maxKey = info.nKey;
    }

    /* Check the content overflow list */
    if( info.nPayload>info.nLocal ){
      int nPage;       /* Number of pages on the overflow chain */
      Pgno pgnoOvfl;   /* First page of the overflow chain */
      assert( pc + info.nSize - 4 <= usableSize );
      /* Each overflow page carries usableSize-4 bytes of payload; round
      ** the number of pages up. */
      nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
      /* The last four bytes of the cell hold the first overflow page */
      pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
      }
#endif
      checkList(pCheck, 0, pgnoOvfl, nPage);
    }

    if( !pPage->leaf ){
      /* Check sanity of left child page for internal pages */
      pgno = get4byte(pCell);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
      }
#endif
      d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
      keyCanBeEqual = 0;
      if( d2!=depth ){
        checkAppendMsg(pCheck, "Child page depth differs");
        depth = d2;
      }
    }else{
      /* Populate the coverage-checking heap for leaf pages */
      btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
    }
  }
  *piMinKey = maxKey;

  /* Check for complete coverage of the page
  */
  pCheck->zPfx = 0;
  if( doCoverageCheck && pCheck->mxErr>0 ){
    /* For leaf pages, the min-heap has already been initialized and the
    ** cells have already been inserted.  But for internal pages, that has
    ** not yet been done, so do it now */
    if( !pPage->leaf ){
      heap = pCheck->heap;
      heap[0] = 0;
      for(i=nCell-1; i>=0; i--){
        u32 size;
        pc = get2byteAligned(&data[cellStart+i*2]);
        size = pPage->xCellSize(pPage, &data[pc]);
        btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
      }
    }
    /* Add the freeblocks to the min-heap
    **
    ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
    ** is the offset of the first freeblock, or zero if there are no
    ** freeblocks on the page.
    */
    i = get2byte(&data[hdr+1]);
    while( i>0 ){
      int size, j;
      assert( (u32)i<=usableSize-4 );     /* Enforced by btreeInitPage() */
      size = get2byte(&data[i+2]);
      assert( (u32)(i+size)<=usableSize );  /* Enforced by btreeInitPage() */
      btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
      /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
      ** big-endian integer which is the offset in the b-tree page of the next
      ** freeblock in the chain, or zero if the freeblock is the last on the
      ** chain. */
      j = get2byte(&data[i]);
      /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
      ** increasing offset. */
      assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
      assert( (u32)j<=usableSize-4 );   /* Enforced by btreeInitPage() */
      i = j;
    }
    /* Analyze the min-heap looking for overlap between cells and/or
    ** freeblocks, and counting the number of untracked bytes in nFrag.
    **
    ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
    ** There is an implied first entry that covers the page header, the cell
    ** pointer index, and the gap between the cell pointer index and the start
    ** of cell content.
    **
    ** The loop below pulls entries from the min-heap in order and compares
    ** the start_address against the previous end_address.  If there is an
    ** overlap, that means bytes are used multiple times.  If there is a gap,
    ** that gap is added to the fragmentation count.
    */
    nFrag = 0;
    prev = contentOffset - 1;   /* Implied first min-heap entry */
    while( btreeHeapPull(heap,&x) ){
      if( (prev&0xffff)>=(x>>16) ){
        checkAppendMsg(pCheck,
          "Multiple uses for byte %u of page %d", x>>16, iPage);
        break;
      }else{
        nFrag += (x>>16) - (prev&0xffff) - 1;
        prev = x;
      }
    }
    /* Bytes between the end of the last entry and the end of the usable
    ** area also count as fragmentation. */
    nFrag += usableSize - (prev&0xffff) - 1;
    /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
    ** is stored in the fifth field of the b-tree page header.
    ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
    ** number of fragmented free bytes within the cell content area.
    **
    ** The comparison is only meaningful if the heap was fully drained,
    ** that is, if the loop above did not stop early on an overlap.
    */
    if( heap[0]==0 && nFrag!=data[hdr+7] ){
      checkAppendMsg(pCheck,
          "Fragmentation of %d bytes reported as %d on page %d",
          nFrag, data[hdr+7], iPage);
    }
  }

end_of_check:
  /* The saved isInit value is put back only when a bad cell disabled the
  ** coverage check.  On the paths that reach here with pPage==0 the
  ** coverage check is still enabled, so pPage is not dereferenced. */
  if( !doCoverageCheck ) pPage->isInit = savedIsInit;
  releasePage(pPage);
  pCheck->zPfx = saved_zPfx;
  pCheck->v1 = saved_v1;
  pCheck->v2 = saved_v2;
  return depth+1;
}
9351 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9352 
9353 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** This routine does a complete check of the given BTree file.  aRoot[] is
** an array of page numbers where each page number is the root page of
** a table.  nRoot is the number of entries in aRoot.  Entries of aRoot[]
** that are zero are skipped.
**
** A read-only or read-write transaction must be opened before calling
** this function.
**
** Write the number of errors seen in *pnErr.  Except for some memory
** allocation errors, an error message held in memory obtained from
** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
** returned.  If a memory allocation error occurs, NULL is returned.
**
** The check proceeds in three stages:
**
**    1.  Walk the freelist, whose first page and length are read from
**        offsets 32 and 36 of page 1.
**    2.  Check every tree listed in aRoot[] using checkTreePage().
**    3.  Verify that every page in the file was referenced by one of
**        the first two stages (with special handling of pointer-map
**        pages in auto-vacuum databases).
**
** No more than mxErr errors are reported; each stage stops early once
** that budget is used up.
*/
char *sqlite3BtreeIntegrityCheck(
  Btree *p,     /* The btree to be checked */
  int *aRoot,   /* An array of root pages numbers for individual trees */
  int nRoot,    /* Number of entries in aRoot[] */
  int mxErr,    /* Stop reporting errors after this many */
  int *pnErr    /* Write number of errors seen to this variable */
){
  Pgno i;
  IntegrityCk sCheck;
  BtShared *pBt = p->pBt;
  int savedDbFlags = pBt->db->flags;
  char zErr[100];
  VVA_ONLY( int nRef );

  sqlite3BtreeEnter(p);
  assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
  VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
  assert( nRef>=0 );
  sCheck.pBt = pBt;
  sCheck.pPager = pBt->pPager;
  sCheck.nPage = btreePagecount(sCheck.pBt);
  sCheck.mxErr = mxErr;
  sCheck.nErr = 0;
  sCheck.mallocFailed = 0;
  sCheck.zPfx = 0;
  sCheck.v1 = 0;
  sCheck.v2 = 0;
  sCheck.aPgRef = 0;
  sCheck.heap = 0;
  /* Error text accumulates in the on-stack buffer zErr[] first */
  sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
  sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
  if( sCheck.nPage==0 ){
    /* An empty database has nothing to check */
    goto integrity_ck_cleanup;
  }

  /* One bit per page, indexed by page number (0..nPage) */
  sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
  if( !sCheck.aPgRef ){
    sCheck.mallocFailed = 1;
    goto integrity_ck_cleanup;
  }
  /* Scratch space for the min-heap used by checkTreePage() */
  sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
  if( sCheck.heap==0 ){
    sCheck.mallocFailed = 1;
    goto integrity_ck_cleanup;
  }

  /* Pre-mark the pending-byte page as referenced so that it is not
  ** reported as unused by the final pass. */
  i = PENDING_BYTE_PAGE(pBt);
  if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);

  /* Check the integrity of the freelist
  */
  sCheck.zPfx = "Main freelist: ";
  checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
            get4byte(&pBt->pPage1->aData[36]));
  sCheck.zPfx = 0;

  /* Check all the tables.
  **
  ** The SQLITE_CellSizeCk flag is cleared for the duration of the tree
  ** checks and the original flags are restored afterwards.
  */
  testcase( pBt->db->flags & SQLITE_CellSizeCk );
  pBt->db->flags &= ~SQLITE_CellSizeCk;
  for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
    i64 notUsed;
    if( aRoot[i]==0 ) continue;
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum && aRoot[i]>1 ){
      checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
    }
#endif
    checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
  }
  pBt->db->flags = savedDbFlags;

  /* Make sure every page in the file is referenced
  */
  for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
#ifdef SQLITE_OMIT_AUTOVACUUM
    if( getPageReferenced(&sCheck, i)==0 ){
      checkAppendMsg(&sCheck, "Page %d is never used", i);
    }
#else
    /* If the database supports auto-vacuum, make sure no tables contain
    ** references to pointer-map pages.  Pointer-map pages are expected
    ** to be unreferenced, so they are exempt from the "never used" test.
    */
    if( getPageReferenced(&sCheck, i)==0 &&
       (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
      checkAppendMsg(&sCheck, "Page %d is never used", i);
    }
    if( getPageReferenced(&sCheck, i)!=0 &&
       (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
      checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
    }
#endif
  }

  /* Clean  up and report errors.
  */
integrity_ck_cleanup:
  sqlite3PageFree(sCheck.heap);
  sqlite3_free(sCheck.aPgRef);
  if( sCheck.mallocFailed ){
    /* Discard any partial text.  The allocation failure itself is
    ** counted as one error. */
    sqlite3StrAccumReset(&sCheck.errMsg);
    sCheck.nErr++;
  }
  *pnErr = sCheck.nErr;
  if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
  /* Make sure this analysis did not leave any unref() pages. */
  assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
  sqlite3BtreeLeave(p);
  return sqlite3StrAccumFinish(&sCheck.errMsg);
}
9477 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9478 
9479 /*
9480 ** Return the full pathname of the underlying database file.  Return
9481 ** an empty string if the database is in-memory or a TEMP database.
9482 **
9483 ** The pager filename is invariant as long as the pager is
9484 ** open so it is safe to access without the BtShared mutex.
9485 */
9486 const char *sqlite3BtreeGetFilename(Btree *p){
9487   assert( p->pBt->pPager!=0 );
9488   return sqlite3PagerFilename(p->pBt->pPager, 1);
9489 }
9490 
9491 /*
9492 ** Return the pathname of the journal file for this database. The return
9493 ** value of this routine is the same regardless of whether the journal file
9494 ** has been created or not.
9495 **
9496 ** The pager journal filename is invariant as long as the pager is
9497 ** open so it is safe to access without the BtShared mutex.
9498 */
9499 const char *sqlite3BtreeGetJournalname(Btree *p){
9500   assert( p->pBt->pPager!=0 );
9501   return sqlite3PagerJournalname(p->pBt->pPager);
9502 }
9503 
9504 /*
9505 ** Return non-zero if a transaction is active.
9506 */
9507 int sqlite3BtreeIsInTrans(Btree *p){
9508   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
9509   return (p && (p->inTrans==TRANS_WRITE));
9510 }
9511 
9512 #ifndef SQLITE_OMIT_WAL
9513 /*
9514 ** Run a checkpoint on the Btree passed as the first argument.
9515 **
9516 ** Return SQLITE_LOCKED if this or any other connection has an open
9517 ** transaction on the shared-cache the argument Btree is connected to.
9518 **
9519 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
9520 */
9521 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
9522   int rc = SQLITE_OK;
9523   if( p ){
9524     BtShared *pBt = p->pBt;
9525     sqlite3BtreeEnter(p);
9526     if( pBt->inTransaction!=TRANS_NONE ){
9527       rc = SQLITE_LOCKED;
9528     }else{
9529       rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
9530     }
9531     sqlite3BtreeLeave(p);
9532   }
9533   return rc;
9534 }
9535 #endif
9536 
9537 /*
9538 ** Return non-zero if a read (or write) transaction is active.
9539 */
9540 int sqlite3BtreeIsInReadTrans(Btree *p){
9541   assert( p );
9542   assert( sqlite3_mutex_held(p->db->mutex) );
9543   return p->inTrans!=TRANS_NONE;
9544 }
9545 
9546 int sqlite3BtreeIsInBackup(Btree *p){
9547   assert( p );
9548   assert( sqlite3_mutex_held(p->db->mutex) );
9549   return p->nBackup!=0;
9550 }
9551 
9552 /*
9553 ** This function returns a pointer to a blob of memory associated with
9554 ** a single shared-btree. The memory is used by client code for its own
9555 ** purposes (for example, to store a high-level schema associated with
9556 ** the shared-btree). The btree layer manages reference counting issues.
9557 **
9558 ** The first time this is called on a shared-btree, nBytes bytes of memory
9559 ** are allocated, zeroed, and returned to the caller. For each subsequent
9560 ** call the nBytes parameter is ignored and a pointer to the same blob
9561 ** of memory returned.
9562 **
9563 ** If the nBytes parameter is 0 and the blob of memory has not yet been
9564 ** allocated, a null pointer is returned. If the blob has already been
9565 ** allocated, it is returned as normal.
9566 **
9567 ** Just before the shared-btree is closed, the function passed as the
9568 ** xFree argument when the memory allocation was made is invoked on the
9569 ** blob of allocated memory. The xFree function should not call sqlite3_free()
9570 ** on the memory, the btree layer does that.
9571 */
9572 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
9573   BtShared *pBt = p->pBt;
9574   sqlite3BtreeEnter(p);
9575   if( !pBt->pSchema && nBytes ){
9576     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
9577     pBt->xFreeSchema = xFree;
9578   }
9579   sqlite3BtreeLeave(p);
9580   return pBt->pSchema;
9581 }
9582 
9583 /*
9584 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
9585 ** btree as the argument handle holds an exclusive lock on the
9586 ** sqlite_master table. Otherwise SQLITE_OK.
9587 */
9588 int sqlite3BtreeSchemaLocked(Btree *p){
9589   int rc;
9590   assert( sqlite3_mutex_held(p->db->mutex) );
9591   sqlite3BtreeEnter(p);
9592   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
9593   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
9594   sqlite3BtreeLeave(p);
9595   return rc;
9596 }
9597 
9598 
9599 #ifndef SQLITE_OMIT_SHARED_CACHE
9600 /*
9601 ** Obtain a lock on the table whose root page is iTab.  The
9602 ** lock is a write lock if isWritelock is true or a read lock
9603 ** if it is false.
9604 */
9605 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
9606   int rc = SQLITE_OK;
9607   assert( p->inTrans!=TRANS_NONE );
9608   if( p->sharable ){
9609     u8 lockType = READ_LOCK + isWriteLock;
9610     assert( READ_LOCK+1==WRITE_LOCK );
9611     assert( isWriteLock==0 || isWriteLock==1 );
9612 
9613     sqlite3BtreeEnter(p);
9614     rc = querySharedCacheTableLock(p, iTab, lockType);
9615     if( rc==SQLITE_OK ){
9616       rc = setSharedCacheTableLock(p, iTab, lockType);
9617     }
9618     sqlite3BtreeLeave(p);
9619   }
9620   return rc;
9621 }
9622 #endif
9623 
9624 #ifndef SQLITE_OMIT_INCRBLOB
9625 /*
9626 ** Argument pCsr must be a cursor opened for writing on an
9627 ** INTKEY table currently pointing at a valid table entry.
9628 ** This function modifies the data stored as part of that entry.
9629 **
9630 ** Only the data content may only be modified, it is not possible to
9631 ** change the length of the data stored. If this function is called with
9632 ** parameters that attempt to write past the end of the existing data,
9633 ** no modifications are made and SQLITE_CORRUPT is returned.
9634 */
9635 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
9636   int rc;
9637   assert( cursorOwnsBtShared(pCsr) );
9638   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
9639   assert( pCsr->curFlags & BTCF_Incrblob );
9640 
9641   rc = restoreCursorPosition(pCsr);
9642   if( rc!=SQLITE_OK ){
9643     return rc;
9644   }
9645   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
9646   if( pCsr->eState!=CURSOR_VALID ){
9647     return SQLITE_ABORT;
9648   }
9649 
9650   /* Save the positions of all other cursors open on this table. This is
9651   ** required in case any of them are holding references to an xFetch
9652   ** version of the b-tree page modified by the accessPayload call below.
9653   **
9654   ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
9655   ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
9656   ** saveAllCursors can only return SQLITE_OK.
9657   */
9658   VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
9659   assert( rc==SQLITE_OK );
9660 
9661   /* Check some assumptions:
9662   **   (a) the cursor is open for writing,
9663   **   (b) there is a read/write transaction open,
9664   **   (c) the connection holds a write-lock on the table (if required),
9665   **   (d) there are no conflicting read-locks, and
9666   **   (e) the cursor points at a valid row of an intKey table.
9667   */
9668   if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
9669     return SQLITE_READONLY;
9670   }
9671   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
9672               && pCsr->pBt->inTransaction==TRANS_WRITE );
9673   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
9674   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
9675   assert( pCsr->apPage[pCsr->iPage]->intKey );
9676 
9677   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
9678 }
9679 
9680 /*
9681 ** Mark this cursor as an incremental blob cursor.
9682 */
9683 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
9684   pCur->curFlags |= BTCF_Incrblob;
9685   pCur->pBtree->hasIncrblobCur = 1;
9686 }
9687 #endif
9688 
9689 /*
9690 ** Set both the "read version" (single byte at byte offset 18) and
9691 ** "write version" (single byte at byte offset 19) fields in the database
9692 ** header to iVersion.
9693 */
9694 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
9695   BtShared *pBt = pBtree->pBt;
9696   int rc;                         /* Return code */
9697 
9698   assert( iVersion==1 || iVersion==2 );
9699 
9700   /* If setting the version fields to 1, do not automatically open the
9701   ** WAL connection, even if the version fields are currently set to 2.
9702   */
9703   pBt->btsFlags &= ~BTS_NO_WAL;
9704   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
9705 
9706   rc = sqlite3BtreeBeginTrans(pBtree, 0);
9707   if( rc==SQLITE_OK ){
9708     u8 *aData = pBt->pPage1->aData;
9709     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
9710       rc = sqlite3BtreeBeginTrans(pBtree, 2);
9711       if( rc==SQLITE_OK ){
9712         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
9713         if( rc==SQLITE_OK ){
9714           aData[18] = (u8)iVersion;
9715           aData[19] = (u8)iVersion;
9716         }
9717       }
9718     }
9719   }
9720 
9721   pBt->btsFlags &= ~BTS_NO_WAL;
9722   return rc;
9723 }
9724 
9725 /*
9726 ** Return true if the cursor has a hint specified.  This routine is
9727 ** only used from within assert() statements
9728 */
9729 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
9730   return (pCsr->hints & mask)!=0;
9731 }
9732 
9733 /*
9734 ** Return true if the given Btree is read-only.
9735 */
9736 int sqlite3BtreeIsReadonly(Btree *p){
9737   return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
9738 }
9739 
9740 /*
9741 ** Return the size of the header added to each page by this module.
9742 */
9743 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
9744 
9745 #if !defined(SQLITE_OMIT_SHARED_CACHE)
9746 /*
9747 ** Return true if the Btree passed as the only argument is sharable.
9748 */
9749 int sqlite3BtreeSharable(Btree *p){
9750   return p->sharable;
9751 }
9752 
9753 /*
9754 ** Return the number of connections to the BtShared object accessed by
9755 ** the Btree handle passed as the only argument. For private caches
9756 ** this is always 1. For shared caches it may be 1 or greater.
9757 */
9758 int sqlite3BtreeConnectionCount(Btree *p){
9759   testcase( p->sharable );
9760   return p->pBt->nRef;
9761 }
9762 #endif
9763