xref: /sqlite-3.40.0/src/btree.c (revision 02267cc2)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements an external (disk-based) database using BTrees.
13 ** See the header comment on "btreeInt.h" for additional information.
14 ** Including a description of file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
18 /*
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
21 */
22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
24 /*
25 ** Set this global variable to 1 to enable tracing using the TRACE
26 ** macro.
27 */
28 #if 0
29 int sqlite3BtreeTrace=1;  /* True to enable tracing */
30 # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
31 #else
32 # define TRACE(X)
33 #endif
34 
/*
** Read the 2-byte big-endian integer found at X, except that a stored
** value of zero is reported as 65536.
**
** The "offset to cell content area" field of a btree page header is only
** two bytes wide.  On an empty page of a database with a 65536-byte page
** size the true offset is 65536, which is stored as zero.  Subtracting
** one, masking to 16 bits and adding one back maps 0 to 65536 and leaves
** every other value unchanged.  X is evaluated once.
*/
#define get2byteNotZero(X)  (1+(0xffff&(((int)get2byte(X))-1)))
45 
46 /*
47 ** Values passed as the 5th argument to allocateBtreePage()
48 */
49 #define BTALLOC_ANY   0           /* Allocate any page */
50 #define BTALLOC_EXACT 1           /* Allocate exact page if possible */
51 #define BTALLOC_LE    2           /* Allocate any page <= the parameter */
52 
/*
** IfNotOmitAV(expr) expands to (expr) in builds that include auto-vacuum
** support, and to the constant 0 when SQLITE_OMIT_AUTOVACUUM is defined.
** For example:
**
**   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
*/
#ifdef SQLITE_OMIT_AUTOVACUUM
# define IfNotOmitAV(expr) 0
#else
# define IfNotOmitAV(expr) (expr)
#endif
64 
65 #ifndef SQLITE_OMIT_SHARED_CACHE
66 /*
67 ** A list of BtShared objects that are eligible for participation
68 ** in shared cache.  This variable has file scope during normal builds,
69 ** but the test harness needs to access it so we make it global for
70 ** test builds.
71 **
72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
73 */
74 #ifdef SQLITE_TEST
75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
76 #else
77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
78 #endif
79 #endif /* SQLITE_OMIT_SHARED_CACHE */
80 
81 #ifndef SQLITE_OMIT_SHARED_CACHE
82 /*
83 ** Enable or disable the shared pager and schema features.
84 **
85 ** This routine has no effect on existing database connections.
86 ** The shared cache setting effects only future calls to
87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
88 */
89 int sqlite3_enable_shared_cache(int enable){
90   sqlite3GlobalConfig.sharedCacheEnabled = enable;
91   return SQLITE_OK;
92 }
93 #endif
94 
95 
96 
97 #ifdef SQLITE_OMIT_SHARED_CACHE
98   /*
99   ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
100   ** and clearAllSharedCacheTableLocks()
101   ** manipulate entries in the BtShared.pLock linked list used to store
102   ** shared-cache table level locks. If the library is compiled with the
103   ** shared-cache feature disabled, then there is only ever one user
104   ** of each BtShared structure and so this locking is not necessary.
105   ** So define the lock related functions as no-ops.
106   */
107   #define querySharedCacheTableLock(a,b,c) SQLITE_OK
108   #define setSharedCacheTableLock(a,b,c) SQLITE_OK
109   #define clearAllSharedCacheTableLocks(a)
110   #define downgradeAllSharedCacheTableLocks(a)
111   #define hasSharedCacheTableLock(a,b,c,d) 1
112   #define hasReadConflicts(a, b) 0
113 #endif
114 
115 #ifndef SQLITE_OMIT_SHARED_CACHE
116 
117 #ifdef SQLITE_DEBUG
118 /*
119 **** This function is only used as part of an assert() statement. ***
120 **
121 ** Check to see if pBtree holds the required locks to read or write to the
122 ** table with root page iRoot.   Return 1 if it does and 0 if not.
123 **
124 ** For example, when writing to a table with root-page iRoot via
125 ** Btree connection pBtree:
126 **
127 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
128 **
129 ** When writing to an index that resides in a sharable database, the
130 ** caller should have first obtained a lock specifying the root page of
131 ** the corresponding table. This makes things a bit more complicated,
132 ** as this module treats each table as a separate structure. To determine
133 ** the table corresponding to the index being written, this
134 ** function has to search through the database schema.
135 **
136 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
137 ** hold a write-lock on the schema table (root page 1). This is also
138 ** acceptable.
139 */
140 static int hasSharedCacheTableLock(
141   Btree *pBtree,         /* Handle that must hold lock */
142   Pgno iRoot,            /* Root page of b-tree */
143   int isIndex,           /* True if iRoot is the root of an index b-tree */
144   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
145 ){
146   Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
147   Pgno iTab = 0;
148   BtLock *pLock;
149 
150   /* If this database is not shareable, or if the client is reading
151   ** and has the read-uncommitted flag set, then no lock is required.
152   ** Return true immediately.
153   */
154   if( (pBtree->sharable==0)
155    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
156   ){
157     return 1;
158   }
159 
160   /* If the client is reading  or writing an index and the schema is
161   ** not loaded, then it is too difficult to actually check to see if
162   ** the correct locks are held.  So do not bother - just return true.
163   ** This case does not come up very often anyhow.
164   */
165   if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
166     return 1;
167   }
168 
169   /* Figure out the root-page that the lock should be held on. For table
170   ** b-trees, this is just the root page of the b-tree being read or
171   ** written. For index b-trees, it is the root page of the associated
172   ** table.  */
173   if( isIndex ){
174     HashElem *p;
175     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
176       Index *pIdx = (Index *)sqliteHashData(p);
177       if( pIdx->tnum==(int)iRoot ){
178         if( iTab ){
179           /* Two or more indexes share the same root page.  There must
180           ** be imposter tables.  So just return true.  The assert is not
181           ** useful in that case. */
182           return 1;
183         }
184         iTab = pIdx->pTable->tnum;
185       }
186     }
187   }else{
188     iTab = iRoot;
189   }
190 
191   /* Search for the required lock. Either a write-lock on root-page iTab, a
192   ** write-lock on the schema table, or (if the client is reading) a
193   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
194   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
195     if( pLock->pBtree==pBtree
196      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
197      && pLock->eLock>=eLockType
198     ){
199       return 1;
200     }
201   }
202 
203   /* Failed to find the required lock. */
204   return 0;
205 }
206 #endif /* SQLITE_DEBUG */
207 
208 #ifdef SQLITE_DEBUG
209 /*
210 **** This function may be used as part of assert() statements only. ****
211 **
212 ** Return true if it would be illegal for pBtree to write into the
213 ** table or index rooted at iRoot because other shared connections are
214 ** simultaneously reading that same table or index.
215 **
216 ** It is illegal for pBtree to write if some other Btree object that
217 ** shares the same BtShared object is currently reading or writing
218 ** the iRoot table.  Except, if the other Btree object has the
219 ** read-uncommitted flag set, then it is OK for the other object to
220 ** have a read cursor.
221 **
222 ** For example, before writing to any part of the table or index
223 ** rooted at page iRoot, one should call:
224 **
225 **    assert( !hasReadConflicts(pBtree, iRoot) );
226 */
227 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
228   BtCursor *p;
229   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
230     if( p->pgnoRoot==iRoot
231      && p->pBtree!=pBtree
232      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
233     ){
234       return 1;
235     }
236   }
237   return 0;
238 }
239 #endif    /* #ifdef SQLITE_DEBUG */
240 
241 /*
242 ** Query to see if Btree handle p may obtain a lock of type eLock
243 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
244 ** SQLITE_OK if the lock may be obtained (by calling
245 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
246 */
247 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
248   BtShared *pBt = p->pBt;
249   BtLock *pIter;
250 
251   assert( sqlite3BtreeHoldsMutex(p) );
252   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
253   assert( p->db!=0 );
254   assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
255 
256   /* If requesting a write-lock, then the Btree must have an open write
257   ** transaction on this file. And, obviously, for this to be so there
258   ** must be an open write transaction on the file itself.
259   */
260   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
261   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
262 
263   /* This routine is a no-op if the shared-cache is not enabled */
264   if( !p->sharable ){
265     return SQLITE_OK;
266   }
267 
268   /* If some other connection is holding an exclusive lock, the
269   ** requested lock may not be obtained.
270   */
271   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
272     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
273     return SQLITE_LOCKED_SHAREDCACHE;
274   }
275 
276   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
277     /* The condition (pIter->eLock!=eLock) in the following if(...)
278     ** statement is a simplification of:
279     **
280     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
281     **
282     ** since we know that if eLock==WRITE_LOCK, then no other connection
283     ** may hold a WRITE_LOCK on any table in this file (since there can
284     ** only be a single writer).
285     */
286     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
287     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
288     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
289       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
290       if( eLock==WRITE_LOCK ){
291         assert( p==pBt->pWriter );
292         pBt->btsFlags |= BTS_PENDING;
293       }
294       return SQLITE_LOCKED_SHAREDCACHE;
295     }
296   }
297   return SQLITE_OK;
298 }
299 #endif /* !SQLITE_OMIT_SHARED_CACHE */
300 
301 #ifndef SQLITE_OMIT_SHARED_CACHE
302 /*
303 ** Add a lock on the table with root-page iTable to the shared-btree used
304 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
305 ** WRITE_LOCK.
306 **
307 ** This function assumes the following:
308 **
309 **   (a) The specified Btree object p is connected to a sharable
310 **       database (one with the BtShared.sharable flag set), and
311 **
312 **   (b) No other Btree objects hold a lock that conflicts
313 **       with the requested lock (i.e. querySharedCacheTableLock() has
314 **       already been called and returned SQLITE_OK).
315 **
316 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
317 ** is returned if a malloc attempt fails.
318 */
319 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
320   BtShared *pBt = p->pBt;
321   BtLock *pLock = 0;
322   BtLock *pIter;
323 
324   assert( sqlite3BtreeHoldsMutex(p) );
325   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
326   assert( p->db!=0 );
327 
328   /* A connection with the read-uncommitted flag set will never try to
329   ** obtain a read-lock using this function. The only read-lock obtained
330   ** by a connection in read-uncommitted mode is on the sqlite_master
331   ** table, and that lock is obtained in BtreeBeginTrans().  */
332   assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
333 
334   /* This function should only be called on a sharable b-tree after it
335   ** has been determined that no other b-tree holds a conflicting lock.  */
336   assert( p->sharable );
337   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
338 
339   /* First search the list for an existing lock on this table. */
340   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
341     if( pIter->iTable==iTable && pIter->pBtree==p ){
342       pLock = pIter;
343       break;
344     }
345   }
346 
347   /* If the above search did not find a BtLock struct associating Btree p
348   ** with table iTable, allocate one and link it into the list.
349   */
350   if( !pLock ){
351     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
352     if( !pLock ){
353       return SQLITE_NOMEM_BKPT;
354     }
355     pLock->iTable = iTable;
356     pLock->pBtree = p;
357     pLock->pNext = pBt->pLock;
358     pBt->pLock = pLock;
359   }
360 
361   /* Set the BtLock.eLock variable to the maximum of the current lock
362   ** and the requested lock. This means if a write-lock was already held
363   ** and a read-lock requested, we don't incorrectly downgrade the lock.
364   */
365   assert( WRITE_LOCK>READ_LOCK );
366   if( eLock>pLock->eLock ){
367     pLock->eLock = eLock;
368   }
369 
370   return SQLITE_OK;
371 }
372 #endif /* !SQLITE_OMIT_SHARED_CACHE */
373 
374 #ifndef SQLITE_OMIT_SHARED_CACHE
375 /*
376 ** Release all the table locks (locks obtained via calls to
377 ** the setSharedCacheTableLock() procedure) held by Btree object p.
378 **
379 ** This function assumes that Btree p has an open read or write
380 ** transaction. If it does not, then the BTS_PENDING flag
381 ** may be incorrectly cleared.
382 */
383 static void clearAllSharedCacheTableLocks(Btree *p){
384   BtShared *pBt = p->pBt;
385   BtLock **ppIter = &pBt->pLock;
386 
387   assert( sqlite3BtreeHoldsMutex(p) );
388   assert( p->sharable || 0==*ppIter );
389   assert( p->inTrans>0 );
390 
391   while( *ppIter ){
392     BtLock *pLock = *ppIter;
393     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
394     assert( pLock->pBtree->inTrans>=pLock->eLock );
395     if( pLock->pBtree==p ){
396       *ppIter = pLock->pNext;
397       assert( pLock->iTable!=1 || pLock==&p->lock );
398       if( pLock->iTable!=1 ){
399         sqlite3_free(pLock);
400       }
401     }else{
402       ppIter = &pLock->pNext;
403     }
404   }
405 
406   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
407   if( pBt->pWriter==p ){
408     pBt->pWriter = 0;
409     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
410   }else if( pBt->nTransaction==2 ){
411     /* This function is called when Btree p is concluding its
412     ** transaction. If there currently exists a writer, and p is not
413     ** that writer, then the number of locks held by connections other
414     ** than the writer must be about to drop to zero. In this case
415     ** set the BTS_PENDING flag to 0.
416     **
417     ** If there is not currently a writer, then BTS_PENDING must
418     ** be zero already. So this next line is harmless in that case.
419     */
420     pBt->btsFlags &= ~BTS_PENDING;
421   }
422 }
423 
424 /*
425 ** This function changes all write-locks held by Btree p into read-locks.
426 */
427 static void downgradeAllSharedCacheTableLocks(Btree *p){
428   BtShared *pBt = p->pBt;
429   if( pBt->pWriter==p ){
430     BtLock *pLock;
431     pBt->pWriter = 0;
432     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
433     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
434       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
435       pLock->eLock = READ_LOCK;
436     }
437   }
438 }
439 
440 #endif /* SQLITE_OMIT_SHARED_CACHE */
441 
442 static void releasePage(MemPage *pPage);  /* Forward reference */
443 
444 /*
445 ***** This routine is used inside of assert() only ****
446 **
447 ** Verify that the cursor holds the mutex on its BtShared
448 */
449 #ifdef SQLITE_DEBUG
450 static int cursorHoldsMutex(BtCursor *p){
451   return sqlite3_mutex_held(p->pBt->mutex);
452 }
453 static int cursorOwnsBtShared(BtCursor *p){
454   assert( cursorHoldsMutex(p) );
455   return (p->pBtree->db==p->pBt->db);
456 }
457 #endif
458 
/*
** Invalidate the overflow page-list cache of the cursor passed as the
** only argument, by clearing the BTCF_ValidOvfl bit of its curFlags.
**
** The argument is parenthesized in the expansion so that any pointer
** expression (for example "pCur+1" or "&aCur[i]") may be passed.  It is
** evaluated exactly once.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
464 
465 /*
466 ** Invalidate the overflow page-list cache for all cursors opened
467 ** on the shared btree structure pBt.
468 */
469 static void invalidateAllOverflowCache(BtShared *pBt){
470   BtCursor *p;
471   assert( sqlite3_mutex_held(pBt->mutex) );
472   for(p=pBt->pCursor; p; p=p->pNext){
473     invalidateOverflowCache(p);
474   }
475 }
476 
477 #ifndef SQLITE_OMIT_INCRBLOB
478 /*
479 ** This function is called before modifying the contents of a table
480 ** to invalidate any incrblob cursors that are open on the
481 ** row or one of the rows being modified.
482 **
483 ** If argument isClearTable is true, then the entire contents of the
484 ** table is about to be deleted. In this case invalidate all incrblob
485 ** cursors open on any row within the table with root-page pgnoRoot.
486 **
487 ** Otherwise, if argument isClearTable is false, then the row with
488 ** rowid iRow is being replaced or deleted. In this case invalidate
489 ** only those incrblob cursors open on that specific row.
490 */
491 static void invalidateIncrblobCursors(
492   Btree *pBtree,          /* The database file to check */
493   i64 iRow,               /* The rowid that might be changing */
494   int isClearTable        /* True if all rows are being deleted */
495 ){
496   BtCursor *p;
497   if( pBtree->hasIncrblobCur==0 ) return;
498   assert( sqlite3BtreeHoldsMutex(pBtree) );
499   pBtree->hasIncrblobCur = 0;
500   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
501     if( (p->curFlags & BTCF_Incrblob)!=0 ){
502       pBtree->hasIncrblobCur = 1;
503       if( isClearTable || p->info.nKey==iRow ){
504         p->eState = CURSOR_INVALID;
505       }
506     }
507   }
508 }
509 
510 #else
511   /* Stub function when INCRBLOB is omitted */
512   #define invalidateIncrblobCursors(x,y,z)
513 #endif /* SQLITE_OMIT_INCRBLOB */
514 
515 /*
516 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
517 ** when a page that previously contained data becomes a free-list leaf
518 ** page.
519 **
520 ** The BtShared.pHasContent bitvec exists to work around an obscure
521 ** bug caused by the interaction of two useful IO optimizations surrounding
522 ** free-list leaf pages:
523 **
524 **   1) When all data is deleted from a page and the page becomes
525 **      a free-list leaf page, the page is not written to the database
526 **      (as free-list leaf pages contain no meaningful data). Sometimes
527 **      such a page is not even journalled (as it will not be modified,
528 **      why bother journalling it?).
529 **
530 **   2) When a free-list leaf page is reused, its content is not read
531 **      from the database or written to the journal file (why should it
532 **      be, if it is not at all meaningful?).
533 **
534 ** By themselves, these optimizations work fine and provide a handy
535 ** performance boost to bulk delete or insert operations. However, if
536 ** a page is moved to the free-list and then reused within the same
537 ** transaction, a problem comes up. If the page is not journalled when
538 ** it is moved to the free-list and it is also not journalled when it
539 ** is extracted from the free-list and reused, then the original data
540 ** may be lost. In the event of a rollback, it may not be possible
541 ** to restore the database to its original configuration.
542 **
543 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
544 ** moved to become a free-list leaf page, the corresponding bit is
545 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
546 ** optimization 2 above is omitted if the corresponding bit is already
547 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
548 ** at the end of every transaction.
549 */
550 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
551   int rc = SQLITE_OK;
552   if( !pBt->pHasContent ){
553     assert( pgno<=pBt->nPage );
554     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
555     if( !pBt->pHasContent ){
556       rc = SQLITE_NOMEM_BKPT;
557     }
558   }
559   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
560     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
561   }
562   return rc;
563 }
564 
565 /*
566 ** Query the BtShared.pHasContent vector.
567 **
568 ** This function is called when a free-list leaf page is removed from the
569 ** free-list for reuse. It returns false if it is safe to retrieve the
570 ** page from the pager layer with the 'no-content' flag set. True otherwise.
571 */
572 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
573   Bitvec *p = pBt->pHasContent;
574   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
575 }
576 
577 /*
578 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
579 ** invoked at the conclusion of each write-transaction.
580 */
581 static void btreeClearHasContent(BtShared *pBt){
582   sqlite3BitvecDestroy(pBt->pHasContent);
583   pBt->pHasContent = 0;
584 }
585 
586 /*
587 ** Release all of the apPage[] pages for a cursor.
588 */
589 static void btreeReleaseAllCursorPages(BtCursor *pCur){
590   int i;
591   for(i=0; i<=pCur->iPage; i++){
592     releasePage(pCur->apPage[i]);
593     pCur->apPage[i] = 0;
594   }
595   pCur->iPage = -1;
596 }
597 
598 /*
599 ** The cursor passed as the only argument must point to a valid entry
600 ** when this function is called (i.e. have eState==CURSOR_VALID). This
601 ** function saves the current cursor key in variables pCur->nKey and
602 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error
603 ** code otherwise.
604 **
605 ** If the cursor is open on an intkey table, then the integer key
606 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
607 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is
608 ** set to point to a malloced buffer pCur->nKey bytes in size containing
609 ** the key.
610 */
611 static int saveCursorKey(BtCursor *pCur){
612   int rc;
613   assert( CURSOR_VALID==pCur->eState );
614   assert( 0==pCur->pKey );
615   assert( cursorHoldsMutex(pCur) );
616 
617   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
618   assert( rc==SQLITE_OK );  /* KeySize() cannot fail */
619 
620   /* If this is an intKey table, then the above call to BtreeKeySize()
621   ** stores the integer key in pCur->nKey. In this case this value is
622   ** all that is required. Otherwise, if pCur is not open on an intKey
623   ** table, then malloc space for and store the pCur->nKey bytes of key
624   ** data.  */
625   if( 0==pCur->curIntKey ){
626     void *pKey = sqlite3Malloc( pCur->nKey );
627     if( pKey ){
628       rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
629       if( rc==SQLITE_OK ){
630         pCur->pKey = pKey;
631       }else{
632         sqlite3_free(pKey);
633       }
634     }else{
635       rc = SQLITE_NOMEM_BKPT;
636     }
637   }
638   assert( !pCur->curIntKey || !pCur->pKey );
639   return rc;
640 }
641 
642 /*
643 ** Save the current cursor position in the variables BtCursor.nKey
644 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
645 **
646 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
647 ** prior to calling this routine.
648 */
649 static int saveCursorPosition(BtCursor *pCur){
650   int rc;
651 
652   assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
653   assert( 0==pCur->pKey );
654   assert( cursorHoldsMutex(pCur) );
655 
656   if( pCur->eState==CURSOR_SKIPNEXT ){
657     pCur->eState = CURSOR_VALID;
658   }else{
659     pCur->skipNext = 0;
660   }
661 
662   rc = saveCursorKey(pCur);
663   if( rc==SQLITE_OK ){
664     btreeReleaseAllCursorPages(pCur);
665     pCur->eState = CURSOR_REQUIRESEEK;
666   }
667 
668   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
669   return rc;
670 }
671 
672 /* Forward reference */
673 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
674 
675 /*
676 ** Save the positions of all cursors (except pExcept) that are open on
677 ** the table with root-page iRoot.  "Saving the cursor position" means that
678 ** the location in the btree is remembered in such a way that it can be
679 ** moved back to the same spot after the btree has been modified.  This
680 ** routine is called just before cursor pExcept is used to modify the
681 ** table, for example in BtreeDelete() or BtreeInsert().
682 **
683 ** If there are two or more cursors on the same btree, then all such
684 ** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
685 ** routine enforces that rule.  This routine only needs to be called in
686 ** the uncommon case when pExpect has the BTCF_Multiple flag set.
687 **
688 ** If pExpect!=NULL and if no other cursors are found on the same root-page,
689 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another
690 ** pointless call to this routine.
691 **
692 ** Implementation note:  This routine merely checks to see if any cursors
693 ** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
694 ** event that cursors are in need to being saved.
695 */
696 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
697   BtCursor *p;
698   assert( sqlite3_mutex_held(pBt->mutex) );
699   assert( pExcept==0 || pExcept->pBt==pBt );
700   for(p=pBt->pCursor; p; p=p->pNext){
701     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
702   }
703   if( p ) return saveCursorsOnList(p, iRoot, pExcept);
704   if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
705   return SQLITE_OK;
706 }
707 
708 /* This helper routine to saveAllCursors does the actual work of saving
709 ** the cursors if and when a cursor is found that actually requires saving.
710 ** The common case is that no cursors need to be saved, so this routine is
711 ** broken out from its caller to avoid unnecessary stack pointer movement.
712 */
713 static int SQLITE_NOINLINE saveCursorsOnList(
714   BtCursor *p,         /* The first cursor that needs saving */
715   Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
716   BtCursor *pExcept    /* Do not save this cursor */
717 ){
718   do{
719     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
720       if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
721         int rc = saveCursorPosition(p);
722         if( SQLITE_OK!=rc ){
723           return rc;
724         }
725       }else{
726         testcase( p->iPage>0 );
727         btreeReleaseAllCursorPages(p);
728       }
729     }
730     p = p->pNext;
731   }while( p );
732   return SQLITE_OK;
733 }
734 
735 /*
736 ** Clear the current cursor position.
737 */
738 void sqlite3BtreeClearCursor(BtCursor *pCur){
739   assert( cursorHoldsMutex(pCur) );
740   sqlite3_free(pCur->pKey);
741   pCur->pKey = 0;
742   pCur->eState = CURSOR_INVALID;
743 }
744 
745 /*
746 ** In this version of BtreeMoveto, pKey is a packed index record
747 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
748 ** record and then call BtreeMovetoUnpacked() to do the work.
749 */
750 static int btreeMoveto(
751   BtCursor *pCur,     /* Cursor open on the btree to be searched */
752   const void *pKey,   /* Packed key if the btree is an index */
753   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
754   int bias,           /* Bias search to the high end */
755   int *pRes           /* Write search results here */
756 ){
757   int rc;                    /* Status code */
758   UnpackedRecord *pIdxKey;   /* Unpacked index key */
759   char aSpace[200];          /* Temp space for pIdxKey - to avoid a malloc */
760   char *pFree = 0;
761 
762   if( pKey ){
763     assert( nKey==(i64)(int)nKey );
764     pIdxKey = sqlite3VdbeAllocUnpackedRecord(
765         pCur->pKeyInfo, aSpace, sizeof(aSpace), &pFree
766     );
767     if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
768     sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
769     if( pIdxKey->nField==0 ){
770       sqlite3DbFree(pCur->pKeyInfo->db, pFree);
771       return SQLITE_CORRUPT_BKPT;
772     }
773   }else{
774     pIdxKey = 0;
775   }
776   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
777   if( pFree ){
778     sqlite3DbFree(pCur->pKeyInfo->db, pFree);
779   }
780   return rc;
781 }
782 
783 /*
784 ** Restore the cursor to the position it was in (or as close to as possible)
785 ** when saveCursorPosition() was called. Note that this call deletes the
786 ** saved position info stored by saveCursorPosition(), so there can be
787 ** at most one effective restoreCursorPosition() call after each
788 ** saveCursorPosition().
789 */
790 static int btreeRestoreCursorPosition(BtCursor *pCur){
791   int rc;
792   int skipNext;
793   assert( cursorOwnsBtShared(pCur) );
794   assert( pCur->eState>=CURSOR_REQUIRESEEK );
795   if( pCur->eState==CURSOR_FAULT ){
796     return pCur->skipNext;
797   }
798   pCur->eState = CURSOR_INVALID;
799   rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
800   if( rc==SQLITE_OK ){
801     sqlite3_free(pCur->pKey);
802     pCur->pKey = 0;
803     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
804     pCur->skipNext |= skipNext;
805     if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
806       pCur->eState = CURSOR_SKIPNEXT;
807     }
808   }
809   return rc;
810 }
811 
/*
** Restore cursor p to its saved position, but only if its state is
** CURSOR_REQUIRESEEK or greater.  Otherwise evaluate to SQLITE_OK without
** calling anything.
**
** The argument is parenthesized in the expansion so that any pointer
** expression may be passed.  It is evaluated twice when a restore is
** required, so it must not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
816 
817 /*
818 ** Determine whether or not a cursor has moved from the position where
819 ** it was last placed, or has been invalidated for any other reason.
820 ** Cursors can move when the row they are pointing at is deleted out
821 ** from under them, for example.  Cursor might also move if a btree
822 ** is rebalanced.
823 **
824 ** Calling this routine with a NULL cursor pointer returns false.
825 **
826 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
827 ** back to where it ought to be if this routine returns true.
828 */
829 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
830   return pCur->eState!=CURSOR_VALID;
831 }
832 
833 /*
834 ** This routine restores a cursor back to its original position after it
835 ** has been moved by some outside activity (such as a btree rebalance or
836 ** a row having been deleted out from under the cursor).
837 **
838 ** On success, the *pDifferentRow parameter is false if the cursor is left
839 ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
840 ** was pointing to has been deleted, forcing the cursor to point to some
841 ** nearby row.
842 **
843 ** This routine should only be called for a cursor that just returned
844 ** TRUE from sqlite3BtreeCursorHasMoved().
845 */
846 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
847   int rc;
848 
849   assert( pCur!=0 );
850   assert( pCur->eState!=CURSOR_VALID );
851   rc = restoreCursorPosition(pCur);
852   if( rc ){
853     *pDifferentRow = 1;
854     return rc;
855   }
856   if( pCur->eState!=CURSOR_VALID ){
857     *pDifferentRow = 1;
858   }else{
859     assert( pCur->skipNext==0 );
860     *pDifferentRow = 0;
861   }
862   return SQLITE_OK;
863 }
864 
#ifdef SQLITE_ENABLE_CURSOR_HINTS
/*
** Provide hints to the cursor.  The particular hint given (and the type
** and number of the varargs parameters) is determined by the eHintType
** parameter.  See the definitions of the BTREE_HINT_* macros for details.
**
** This implementation has an empty body: neither pCur, eHintType nor
** the variable arguments are examined.  The routine is compiled only
** when SQLITE_ENABLE_CURSOR_HINTS is defined.
*/
void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
  /* Used only by systems that substitute their own storage engine */
}
#endif
875 
876 /*
877 ** Provide flag hints to the cursor.
878 */
879 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
880   assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
881   pCur->hints = x;
882 }
883 
884 
885 #ifndef SQLITE_OMIT_AUTOVACUUM
886 /*
887 ** Given a page number of a regular database page, return the page
888 ** number for the pointer-map page that contains the entry for the
889 ** input page number.
890 **
891 ** Return 0 (not a valid page) for pgno==1 since there is
892 ** no pointer map associated with page 1.  The integrity_check logic
893 ** requires that ptrmapPageno(*,1)!=1.
894 */
895 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
896   int nPagesPerMapPage;
897   Pgno iPtrMap, ret;
898   assert( sqlite3_mutex_held(pBt->mutex) );
899   if( pgno<2 ) return 0;
900   nPagesPerMapPage = (pBt->usableSize/5)+1;
901   iPtrMap = (pgno-2)/nPagesPerMapPage;
902   ret = (iPtrMap*nPagesPerMapPage) + 2;
903   if( ret==PENDING_BYTE_PAGE(pBt) ){
904     ret++;
905   }
906   return ret;
907 }
908 
909 /*
910 ** Write an entry into the pointer map.
911 **
912 ** This routine updates the pointer map entry for page number 'key'
913 ** so that it maps to type 'eType' and parent page number 'pgno'.
914 **
915 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
916 ** a no-op.  If an error occurs, the appropriate error code is written
917 ** into *pRC.
918 */
919 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
920   DbPage *pDbPage;  /* The pointer map page */
921   u8 *pPtrmap;      /* The pointer map data */
922   Pgno iPtrmap;     /* The pointer map page number */
923   int offset;       /* Offset in pointer map page */
924   int rc;           /* Return code from subfunctions */
925 
926   if( *pRC ) return;
927 
928   assert( sqlite3_mutex_held(pBt->mutex) );
929   /* The master-journal page number must never be used as a pointer map page */
930   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
931 
932   assert( pBt->autoVacuum );
933   if( key==0 ){
934     *pRC = SQLITE_CORRUPT_BKPT;
935     return;
936   }
937   iPtrmap = PTRMAP_PAGENO(pBt, key);
938   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
939   if( rc!=SQLITE_OK ){
940     *pRC = rc;
941     return;
942   }
943   offset = PTRMAP_PTROFFSET(iPtrmap, key);
944   if( offset<0 ){
945     *pRC = SQLITE_CORRUPT_BKPT;
946     goto ptrmap_exit;
947   }
948   assert( offset <= (int)pBt->usableSize-5 );
949   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
950 
951   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
952     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
953     *pRC= rc = sqlite3PagerWrite(pDbPage);
954     if( rc==SQLITE_OK ){
955       pPtrmap[offset] = eType;
956       put4byte(&pPtrmap[offset+1], parent);
957     }
958   }
959 
960 ptrmap_exit:
961   sqlite3PagerUnref(pDbPage);
962 }
963 
964 /*
965 ** Read an entry from the pointer map.
966 **
967 ** This routine retrieves the pointer map entry for page 'key', writing
968 ** the type and parent page number to *pEType and *pPgno respectively.
969 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
970 */
971 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
972   DbPage *pDbPage;   /* The pointer map page */
973   int iPtrmap;       /* Pointer map page index */
974   u8 *pPtrmap;       /* Pointer map page data */
975   int offset;        /* Offset of entry in pointer map */
976   int rc;
977 
978   assert( sqlite3_mutex_held(pBt->mutex) );
979 
980   iPtrmap = PTRMAP_PAGENO(pBt, key);
981   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
982   if( rc!=0 ){
983     return rc;
984   }
985   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
986 
987   offset = PTRMAP_PTROFFSET(iPtrmap, key);
988   if( offset<0 ){
989     sqlite3PagerUnref(pDbPage);
990     return SQLITE_CORRUPT_BKPT;
991   }
992   assert( offset <= (int)pBt->usableSize-5 );
993   assert( pEType!=0 );
994   *pEType = pPtrmap[offset];
995   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
996 
997   sqlite3PagerUnref(pDbPage);
998   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
999   return SQLITE_OK;
1000 }
1001 
1002 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* Autovacuum is compiled out (SQLITE_OMIT_AUTOVACUUM is defined), so
  ** the pointer-map routines are replaced by macros.  ptrmapPut() and
  ** ptrmapPutOvflPtr() expand to nothing.  ptrmapGet() expands to the
  ** constant SQLITE_OK; its arguments are not evaluated and nothing is
  ** written through them. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, rc)
1006 #endif
1007 
/*
** Given a btree page and a cell index (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** Both macros read the 2-byte entry number I from the page's cell
** pointer array (P->aCellIdx), mask it with P->maskPage, and add the
** result to a base pointer.  findCell() uses P->aData as the base.
**
** findCellPastPtr() does the same except it uses P->aDataOfst as the
** base, so that the result skips past the initial 4-byte child pointer
** found on interior pages, if there is one.
**
** The arguments P and I are each expanded more than once, so they must
** not have side effects.
**
** This routine works only for pages that do not contain overflow cells.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
#define findCellPastPtr(P,I) \
  ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
1022 
1023 
1024 /*
1025 ** This is common tail processing for btreeParseCellPtr() and
1026 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
1027 ** on a single B-tree page.  Make necessary adjustments to the CellInfo
1028 ** structure.
1029 */
1030 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
1031   MemPage *pPage,         /* Page containing the cell */
1032   u8 *pCell,              /* Pointer to the cell text. */
1033   CellInfo *pInfo         /* Fill in this structure */
1034 ){
1035   /* If the payload will not fit completely on the local page, we have
1036   ** to decide how much to store locally and how much to spill onto
1037   ** overflow pages.  The strategy is to minimize the amount of unused
1038   ** space on overflow pages while keeping the amount of local storage
1039   ** in between minLocal and maxLocal.
1040   **
1041   ** Warning:  changing the way overflow payload is distributed in any
1042   ** way will result in an incompatible file format.
1043   */
1044   int minLocal;  /* Minimum amount of payload held locally */
1045   int maxLocal;  /* Maximum amount of payload held locally */
1046   int surplus;   /* Overflow payload available for local storage */
1047 
1048   minLocal = pPage->minLocal;
1049   maxLocal = pPage->maxLocal;
1050   surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
1051   testcase( surplus==maxLocal );
1052   testcase( surplus==maxLocal+1 );
1053   if( surplus <= maxLocal ){
1054     pInfo->nLocal = (u16)surplus;
1055   }else{
1056     pInfo->nLocal = (u16)minLocal;
1057   }
1058   pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
1059 }
1060 
1061 /*
1062 ** The following routines are implementations of the MemPage.xParseCell()
1063 ** method.
1064 **
1065 ** Parse a cell content block and fill in the CellInfo structure.
1066 **
1067 ** btreeParseCellPtr()        =>   table btree leaf nodes
1068 ** btreeParseCellNoPayload()  =>   table btree internal nodes
1069 ** btreeParseCellPtrIndex()   =>   index btree nodes
1070 **
1071 ** There is also a wrapper function btreeParseCell() that works for
1072 ** all MemPage types and that references the cell by index rather than
1073 ** by pointer.
1074 */
1075 static void btreeParseCellPtrNoPayload(
1076   MemPage *pPage,         /* Page containing the cell */
1077   u8 *pCell,              /* Pointer to the cell text. */
1078   CellInfo *pInfo         /* Fill in this structure */
1079 ){
1080   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1081   assert( pPage->leaf==0 );
1082   assert( pPage->childPtrSize==4 );
1083 #ifndef SQLITE_DEBUG
1084   UNUSED_PARAMETER(pPage);
1085 #endif
1086   pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
1087   pInfo->nPayload = 0;
1088   pInfo->nLocal = 0;
1089   pInfo->pPayload = 0;
1090   return;
1091 }
1092 static void btreeParseCellPtr(
1093   MemPage *pPage,         /* Page containing the cell */
1094   u8 *pCell,              /* Pointer to the cell text. */
1095   CellInfo *pInfo         /* Fill in this structure */
1096 ){
1097   u8 *pIter;              /* For scanning through pCell */
1098   u32 nPayload;           /* Number of bytes of cell payload */
1099   u64 iKey;               /* Extracted Key value */
1100 
1101   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1102   assert( pPage->leaf==0 || pPage->leaf==1 );
1103   assert( pPage->intKeyLeaf );
1104   assert( pPage->childPtrSize==0 );
1105   pIter = pCell;
1106 
1107   /* The next block of code is equivalent to:
1108   **
1109   **     pIter += getVarint32(pIter, nPayload);
1110   **
1111   ** The code is inlined to avoid a function call.
1112   */
1113   nPayload = *pIter;
1114   if( nPayload>=0x80 ){
1115     u8 *pEnd = &pIter[8];
1116     nPayload &= 0x7f;
1117     do{
1118       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1119     }while( (*pIter)>=0x80 && pIter<pEnd );
1120   }
1121   pIter++;
1122 
1123   /* The next block of code is equivalent to:
1124   **
1125   **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
1126   **
1127   ** The code is inlined to avoid a function call.
1128   */
1129   iKey = *pIter;
1130   if( iKey>=0x80 ){
1131     u8 *pEnd = &pIter[7];
1132     iKey &= 0x7f;
1133     while(1){
1134       iKey = (iKey<<7) | (*++pIter & 0x7f);
1135       if( (*pIter)<0x80 ) break;
1136       if( pIter>=pEnd ){
1137         iKey = (iKey<<8) | *++pIter;
1138         break;
1139       }
1140     }
1141   }
1142   pIter++;
1143 
1144   pInfo->nKey = *(i64*)&iKey;
1145   pInfo->nPayload = nPayload;
1146   pInfo->pPayload = pIter;
1147   testcase( nPayload==pPage->maxLocal );
1148   testcase( nPayload==pPage->maxLocal+1 );
1149   if( nPayload<=pPage->maxLocal ){
1150     /* This is the (easy) common case where the entire payload fits
1151     ** on the local page.  No overflow is required.
1152     */
1153     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1154     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1155     pInfo->nLocal = (u16)nPayload;
1156   }else{
1157     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1158   }
1159 }
1160 static void btreeParseCellPtrIndex(
1161   MemPage *pPage,         /* Page containing the cell */
1162   u8 *pCell,              /* Pointer to the cell text. */
1163   CellInfo *pInfo         /* Fill in this structure */
1164 ){
1165   u8 *pIter;              /* For scanning through pCell */
1166   u32 nPayload;           /* Number of bytes of cell payload */
1167 
1168   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1169   assert( pPage->leaf==0 || pPage->leaf==1 );
1170   assert( pPage->intKeyLeaf==0 );
1171   pIter = pCell + pPage->childPtrSize;
1172   nPayload = *pIter;
1173   if( nPayload>=0x80 ){
1174     u8 *pEnd = &pIter[8];
1175     nPayload &= 0x7f;
1176     do{
1177       nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1178     }while( *(pIter)>=0x80 && pIter<pEnd );
1179   }
1180   pIter++;
1181   pInfo->nKey = nPayload;
1182   pInfo->nPayload = nPayload;
1183   pInfo->pPayload = pIter;
1184   testcase( nPayload==pPage->maxLocal );
1185   testcase( nPayload==pPage->maxLocal+1 );
1186   if( nPayload<=pPage->maxLocal ){
1187     /* This is the (easy) common case where the entire payload fits
1188     ** on the local page.  No overflow is required.
1189     */
1190     pInfo->nSize = nPayload + (u16)(pIter - pCell);
1191     if( pInfo->nSize<4 ) pInfo->nSize = 4;
1192     pInfo->nLocal = (u16)nPayload;
1193   }else{
1194     btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1195   }
1196 }
1197 static void btreeParseCell(
1198   MemPage *pPage,         /* Page containing the cell */
1199   int iCell,              /* The cell index.  First cell is 0 */
1200   CellInfo *pInfo         /* Fill in this structure */
1201 ){
1202   pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
1203 }
1204 
1205 /*
1206 ** The following routines are implementations of the MemPage.xCellSize
1207 ** method.
1208 **
1209 ** Compute the total number of bytes that a Cell needs in the cell
1210 ** data area of the btree-page.  The return number includes the cell
1211 ** data header and the local payload, but not any overflow page or
1212 ** the space used by the cell pointer.
1213 **
1214 ** cellSizePtrNoPayload()    =>   table internal nodes
1215 ** cellSizePtr()             =>   all index nodes & table leaf nodes
1216 */
1217 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
1218   u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
1219   u8 *pEnd;                                /* End mark for a varint */
1220   u32 nSize;                               /* Size value to return */
1221 
1222 #ifdef SQLITE_DEBUG
1223   /* The value returned by this function should always be the same as
1224   ** the (CellInfo.nSize) value found by doing a full parse of the
1225   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1226   ** this function verifies that this invariant is not violated. */
1227   CellInfo debuginfo;
1228   pPage->xParseCell(pPage, pCell, &debuginfo);
1229 #endif
1230 
1231   nSize = *pIter;
1232   if( nSize>=0x80 ){
1233     pEnd = &pIter[8];
1234     nSize &= 0x7f;
1235     do{
1236       nSize = (nSize<<7) | (*++pIter & 0x7f);
1237     }while( *(pIter)>=0x80 && pIter<pEnd );
1238   }
1239   pIter++;
1240   if( pPage->intKey ){
1241     /* pIter now points at the 64-bit integer key value, a variable length
1242     ** integer. The following block moves pIter to point at the first byte
1243     ** past the end of the key value. */
1244     pEnd = &pIter[9];
1245     while( (*pIter++)&0x80 && pIter<pEnd );
1246   }
1247   testcase( nSize==pPage->maxLocal );
1248   testcase( nSize==pPage->maxLocal+1 );
1249   if( nSize<=pPage->maxLocal ){
1250     nSize += (u32)(pIter - pCell);
1251     if( nSize<4 ) nSize = 4;
1252   }else{
1253     int minLocal = pPage->minLocal;
1254     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1255     testcase( nSize==pPage->maxLocal );
1256     testcase( nSize==pPage->maxLocal+1 );
1257     if( nSize>pPage->maxLocal ){
1258       nSize = minLocal;
1259     }
1260     nSize += 4 + (u16)(pIter - pCell);
1261   }
1262   assert( nSize==debuginfo.nSize || CORRUPT_DB );
1263   return (u16)nSize;
1264 }
1265 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
1266   u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
1267   u8 *pEnd;              /* End mark for a varint */
1268 
1269 #ifdef SQLITE_DEBUG
1270   /* The value returned by this function should always be the same as
1271   ** the (CellInfo.nSize) value found by doing a full parse of the
1272   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1273   ** this function verifies that this invariant is not violated. */
1274   CellInfo debuginfo;
1275   pPage->xParseCell(pPage, pCell, &debuginfo);
1276 #else
1277   UNUSED_PARAMETER(pPage);
1278 #endif
1279 
1280   assert( pPage->childPtrSize==4 );
1281   pEnd = pIter + 9;
1282   while( (*pIter++)&0x80 && pIter<pEnd );
1283   assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
1284   return (u16)(pIter - pCell);
1285 }
1286 
1287 
#ifdef SQLITE_DEBUG
/* Return the size of the iCell-th cell on pPage by locating it with
** findCell() and invoking the page's xCellSize method.  This variant
** is compiled only when SQLITE_DEBUG is defined and is meant for use
** inside assert() statements. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return pPage->xCellSize(pPage, pCell);
}
#endif
1295 
1296 #ifndef SQLITE_OMIT_AUTOVACUUM
1297 /*
1298 ** If the cell pCell, part of page pPage contains a pointer
1299 ** to an overflow page, insert an entry into the pointer-map
1300 ** for the overflow page.
1301 */
1302 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1303   CellInfo info;
1304   if( *pRC ) return;
1305   assert( pCell!=0 );
1306   pPage->xParseCell(pPage, pCell, &info);
1307   if( info.nLocal<info.nPayload ){
1308     Pgno ovfl = get4byte(&pCell[info.nSize-4]);
1309     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1310   }
1311 }
1312 #endif
1313 
1314 
/*
** Defragment the page given.  All Cells are moved to the
** end of the page and all free space is collected into one
** big FreeBlk that occurs in between the header and cell
** pointer array and the cell content area.
**
** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
** b-tree page so that there are no freeblocks or fragment bytes, all
** unused bytes are contained in the unallocated space region, and all
** cells are packed tightly at the end of the page.
**
** Cells are visited in cell-pointer-array order and packed downward
** from the end of the usable area.  Each cell pointer is rewritten to
** the cell's new offset as it is processed.
**
** Returns SQLITE_OK on success.  Returns SQLITE_CORRUPT if a cell
** pointer lies outside the range [iCellFirst,iCellLast], if a cell
** extends past the usable area, if the packed cells would run into the
** cell pointer array, or if the resulting free space does not match
** pPage->nFree.  The page may already have been partly modified when a
** corruption error is returned.
*/
static int defragmentPage(MemPage *pPage){
  int i;                     /* Loop counter */
  int pc;                    /* Address of the i-th cell */
  int hdr;                   /* Offset to the page header */
  int size;                  /* Size of a cell */
  int usableSize;            /* Number of usable bytes on a page */
  int cellOffset;            /* Offset to the cell pointer array */
  int cbrk;                  /* Offset to the cell content area */
  int nCell;                 /* Number of cells on the page */
  unsigned char *data;       /* The page data */
  unsigned char *temp;       /* Temp area for cell content */
  unsigned char *src;        /* Source of content */
  int iCellFirst;            /* First allowable cell index */
  int iCellLast;             /* Last possible cell index */


  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  temp = 0;
  /* Until a cell actually has to move, cells are read directly from the
  ** page itself. */
  src = data = pPage->aData;
  hdr = pPage->hdrOffset;
  cellOffset = pPage->cellOffset;
  nCell = pPage->nCell;
  assert( nCell==get2byte(&data[hdr+3]) );
  usableSize = pPage->pBt->usableSize;
  cbrk = usableSize;
  /* iCellFirst is the first byte past the cell pointer array.  A cell
  ** may not start before it, nor after usableSize-4. */
  iCellFirst = cellOffset + 2*nCell;
  iCellLast = usableSize - 4;
  for(i=0; i<nCell; i++){
    u8 *pAddr;     /* The i-th cell pointer */
    pAddr = &data[cellOffset + i*2];
    pc = get2byte(pAddr);
    testcase( pc==iCellFirst );
    testcase( pc==iCellLast );
    /* These conditions have already been verified in btreeInitPage()
    ** if PRAGMA cell_size_check=ON.
    */
    if( pc<iCellFirst || pc>iCellLast ){
      return SQLITE_CORRUPT_BKPT;
    }
    assert( pc>=iCellFirst && pc<=iCellLast );
    size = pPage->xCellSize(pPage, &src[pc]);
    /* Reserve room for this cell immediately below the cells that have
    ** already been placed. */
    cbrk -= size;
    if( cbrk<iCellFirst || pc+size>usableSize ){
      return SQLITE_CORRUPT_BKPT;
    }
    assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
    testcase( cbrk+size==usableSize );
    testcase( pc+size==usableSize );
    put2byte(pAddr, cbrk);
    if( temp==0 ){
      int x;
      /* The cell is already where it belongs: nothing to copy, and no
      ** snapshot is needed yet. */
      if( cbrk==pc ) continue;
      /* First cell that has to move.  Copy the page content from the
      ** start of the cell content area (the offset stored at hdr+5)
      ** through the end of the slot just reserved into the pager's temp
      ** space, and read all remaining cells from that copy so that
      ** writes into data[] cannot clobber cells not yet processed. */
      temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
      x = get2byte(&data[hdr+5]);
      memcpy(&temp[x], &data[x], (cbrk+size) - x);
      src = temp;
    }
    memcpy(&data[cbrk], &src[pc], size);
  }
  assert( cbrk>=iCellFirst );
  /* Record the new start of the cell content area, clear the 2-byte
  ** first-freeblock pointer (hdr+1, hdr+2) and the fragmented-byte
  ** count (hdr+7), and zero the gap between the cell pointer array and
  ** the packed cells. */
  put2byte(&data[hdr+5], cbrk);
  data[hdr+1] = 0;
  data[hdr+2] = 0;
  data[hdr+7] = 0;
  memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  /* After defragmentation the whole gap is free space, so it must agree
  ** with the free-byte count tracked in pPage->nFree. */
  if( cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_BKPT;
  }
  return SQLITE_OK;
}
1401 
1402 /*
1403 ** Search the free-list on page pPg for space to store a cell nByte bytes in
1404 ** size. If one can be found, return a pointer to the space and remove it
1405 ** from the free-list.
1406 **
1407 ** If no suitable space can be found on the free-list, return NULL.
1408 **
1409 ** This function may detect corruption within pPg.  If corruption is
1410 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
1411 **
1412 ** Slots on the free list that are between 1 and 3 bytes larger than nByte
1413 ** will be ignored if adding the extra space to the fragmentation count
1414 ** causes the fragmentation count to exceed 60.
1415 */
1416 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
1417   const int hdr = pPg->hdrOffset;
1418   u8 * const aData = pPg->aData;
1419   int iAddr = hdr + 1;
1420   int pc = get2byte(&aData[iAddr]);
1421   int x;
1422   int usableSize = pPg->pBt->usableSize;
1423 
1424   assert( pc>0 );
1425   do{
1426     int size;            /* Size of the free slot */
1427     /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
1428     ** increasing offset. */
1429     if( pc>usableSize-4 || pc<iAddr+4 ){
1430       *pRc = SQLITE_CORRUPT_BKPT;
1431       return 0;
1432     }
1433     /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
1434     ** freeblock form a big-endian integer which is the size of the freeblock
1435     ** in bytes, including the 4-byte header. */
1436     size = get2byte(&aData[pc+2]);
1437     if( (x = size - nByte)>=0 ){
1438       testcase( x==4 );
1439       testcase( x==3 );
1440       if( pc < pPg->cellOffset+2*pPg->nCell || size+pc > usableSize ){
1441         *pRc = SQLITE_CORRUPT_BKPT;
1442         return 0;
1443       }else if( x<4 ){
1444         /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
1445         ** number of bytes in fragments may not exceed 60. */
1446         if( aData[hdr+7]>57 ) return 0;
1447 
1448         /* Remove the slot from the free-list. Update the number of
1449         ** fragmented bytes within the page. */
1450         memcpy(&aData[iAddr], &aData[pc], 2);
1451         aData[hdr+7] += (u8)x;
1452       }else{
1453         /* The slot remains on the free-list. Reduce its size to account
1454          ** for the portion used by the new allocation. */
1455         put2byte(&aData[pc+2], x);
1456       }
1457       return &aData[pc + x];
1458     }
1459     iAddr = pc;
1460     pc = get2byte(&aData[pc]);
1461   }while( pc );
1462 
1463   return 0;
1464 }
1465 
/*
** Allocate nByte bytes of space from within the B-Tree page passed
** as the first argument. Write into *pIdx the index into pPage->aData[]
** of the first byte of allocated space. Return either SQLITE_OK or
** an error code (usually SQLITE_CORRUPT).
**
** The caller guarantees that there is sufficient space to make the
** allocation.  This routine might need to defragment in order to bring
** all the space together, however.  This routine will avoid using
** the first two bytes past the cell pointer area since presumably this
** allocation is being made in order to insert a new cell, so we will
** also end up needing a new cell pointer.
**
** The allocation is attempted in this order: (1) from a freeblock via
** pageFindSlot(), if the freelist is non-empty and there is room for
** one more cell pointer; (2) from the gap between the cell pointer
** array and the cell content area, after defragmenting the page if the
** gap is too small.  *pIdx is left unchanged when an error is returned.
*/
static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
  const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
  u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
  int top;                             /* First byte of cell content area */
  int rc = SQLITE_OK;                  /* Integer return code */
  int gap;        /* First byte of gap between cell pointers and cell content */

  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  /* Minimum cell size is 4, although only non-negativity is asserted */
  assert( nByte>=0 );
  assert( pPage->nFree>=nByte );
  assert( pPage->nOverflow==0 );
  assert( nByte < (int)(pPage->pBt->usableSize-8) );

  assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
  gap = pPage->cellOffset + 2*pPage->nCell;
  assert( gap<=65536 );
  /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
  ** and the reserved space is zero (the usual value for reserved space)
  ** then the cell content offset of an empty page wants to be 65536.
  ** However, that integer is too large to be stored in a 2-byte unsigned
  ** integer, so a value of 0 is used in its place. */
  top = get2byte(&data[hdr+5]);
  assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */
  if( gap>top ){
    /* A content offset below the end of the cell pointer array is only
    ** legitimate as the 0-means-65536 encoding described above. */
    if( top==0 && pPage->pBt->usableSize==65536 ){
      top = 65536;
    }else{
      return SQLITE_CORRUPT_BKPT;
    }
  }

  /* If there is enough space between gap and top for one more cell pointer
  ** array entry offset, and if the freelist is not empty, then search the
  ** freelist looking for a free slot big enough to satisfy the request.
  */
  testcase( gap+2==top );
  testcase( gap+1==top );
  testcase( gap==top );
  if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
    u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
    if( pSpace ){
      assert( pSpace>=data && (pSpace - data)<65536 );
      *pIdx = (int)(pSpace - data);
      return SQLITE_OK;
    }else if( rc ){
      /* pageFindSlot() detected corruption */
      return rc;
    }
    /* Otherwise no suitable freeblock was found: fall through and
    ** allocate from the gap. */
  }

  /* The request could not be fulfilled using a freelist slot.  Check
  ** to see if defragmentation is necessary.
  */
  testcase( gap+2+nByte==top );
  if( gap+2+nByte>top ){
    assert( pPage->nCell>0 || CORRUPT_DB );
    rc = defragmentPage(pPage);
    if( rc ) return rc;
    /* Re-read the content offset written by defragmentPage().  The
    ** NotZero variant maps a stored 0 back to 65536. */
    top = get2byteNotZero(&data[hdr+5]);
    assert( gap+nByte<=top );
  }


  /* Allocate memory from the gap in between the cell pointer array
  ** and the cell content area.  The btreeInitPage() call has already
  ** validated the freelist.  Given that the freelist is valid, there
  ** is no way that the allocation can extend off the end of the page.
  ** The assert() below verifies the previous sentence.
  */
  top -= nByte;
  put2byte(&data[hdr+5], top);
  assert( top+nByte <= (int)pPage->pBt->usableSize );
  *pIdx = top;
  return SQLITE_OK;
}
1555 
/*
** Return a section of the pPage->aData to the freelist.
** The first byte of the new free block is pPage->aData[iStart]
** and the size of the block is iSize bytes.
**
** Adjacent freeblocks are coalesced.  A neighbouring freeblock is also
** absorbed when it is separated from the freed region by a gap of 1 to
** 3 bytes; those gap bytes are subtracted from the fragmented-byte
** count at data[hdr+7].
**
** Note that even though the freeblock list was checked by btreeInitPage(),
** that routine will not detect overlap between cells or freeblocks.  Nor
** does it detect cells or freeblocks that encroach into the reserved bytes
** at the end of the page.  So do additional corruption checks inside this
** routine and return SQLITE_CORRUPT if any problems are found.
**
** On success SQLITE_OK is returned and pPage->nFree is increased by the
** original value of iSize.
*/
static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
  u16 iPtr;                             /* Address of ptr to next freeblock */
  u16 iFreeBlk;                         /* Address of the next freeblock */
  u8 hdr;                               /* Page header size.  0 or 100 */
  u8 nFrag = 0;                         /* Reduction in fragmentation */
  u16 iOrigSize = iSize;                /* Original value of iSize */
  u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */
  u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
  unsigned char *data = pPage->aData;   /* Page content */

  assert( pPage->pBt!=0 );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
  assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( iSize>=4 );   /* Minimum cell size is 4 */
  assert( iStart<=iLast );

  /* Overwrite deleted information with zeros when the secure_delete
  ** option is enabled */
  if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
    memset(&data[iStart], 0, iSize);
  }

  /* The list of freeblocks must be in ascending order.  Find the
  ** spot on the list where iStart should be inserted.
  */
  hdr = pPage->hdrOffset;
  iPtr = hdr + 1;
  if( data[iPtr+1]==0 && data[iPtr]==0 ){
    iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
  }else{
    /* Advance until iFreeBlk is the first freeblock at or after iStart
    ** (or 0 at the end of the list).  Each freeblock must start at least
    ** 4 bytes after the link that points to it. */
    while( (iFreeBlk = get2byte(&data[iPtr]))>0 && iFreeBlk<iStart ){
      if( iFreeBlk<iPtr+4 ) return SQLITE_CORRUPT_BKPT;
      iPtr = iFreeBlk;
    }
    if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT;
    assert( iFreeBlk>iPtr || iFreeBlk==0 );

    /* At this point:
    **    iFreeBlk:   First freeblock after iStart, or zero if none
    **    iPtr:       The address of a pointer to iFreeBlk
    **
    ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
    */
    if( iFreeBlk && iEnd+3>=iFreeBlk ){
      /* nFrag is only meaningful when iEnd<=iFreeBlk; the overlapping
      ** case is rejected on the next line before nFrag is used. */
      nFrag = iFreeBlk - iEnd;
      if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT;
      iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
      if( iEnd > pPage->pBt->usableSize ) return SQLITE_CORRUPT_BKPT;
      iSize = iEnd - iStart;
      /* The absorbed freeblock's successor becomes our successor */
      iFreeBlk = get2byte(&data[iFreeBlk]);
    }

    /* If iPtr is another freeblock (that is, if iPtr is not the freelist
    ** pointer in the page header) then check to see if iStart should be
    ** coalesced onto the end of iPtr.
    */
    if( iPtr>hdr+1 ){
      int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
      if( iPtrEnd+3>=iStart ){
        if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT;
        nFrag += iStart - iPtrEnd;
        iSize = iEnd - iPtr;
        iStart = iPtr;
      }
    }
    /* The absorbed gap bytes were counted as fragments; the page cannot
    ** have fewer fragment bytes recorded than are being reclaimed. */
    if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT;
    data[hdr+7] -= nFrag;
  }
  if( iStart==get2byte(&data[hdr+5]) ){
    /* The new freeblock is at the beginning of the cell content area,
    ** so just extend the cell content area rather than create another
    ** freelist entry */
    if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT;
    put2byte(&data[hdr+1], iFreeBlk);
    put2byte(&data[hdr+5], iEnd);
  }else{
    /* Insert the new freeblock into the freelist */
    put2byte(&data[iPtr], iStart);
    put2byte(&data[iStart], iFreeBlk);
    put2byte(&data[iStart+2], iSize);
  }
  /* Only the bytes released by this call are added to nFree; any bytes
  ** gained by coalescing are not added here. */
  pPage->nFree += iOrigSize;
  return SQLITE_OK;
}
1655 
1656 /*
1657 ** Decode the flags byte (the first byte of the header) for a page
1658 ** and initialize fields of the MemPage structure accordingly.
1659 **
1660 ** Only the following combinations are supported.  Anything different
1661 ** indicates a corrupt database file:
1662 **
1663 **         PTF_ZERODATA
1664 **         PTF_ZERODATA | PTF_LEAF
1665 **         PTF_LEAFDATA | PTF_INTKEY
1666 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1667 */
1668 static int decodeFlags(MemPage *pPage, int flagByte){
1669   BtShared *pBt;     /* A copy of pPage->pBt */
1670 
1671   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1672   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1673   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1674   flagByte &= ~PTF_LEAF;
1675   pPage->childPtrSize = 4-4*pPage->leaf;
1676   pPage->xCellSize = cellSizePtr;
1677   pBt = pPage->pBt;
1678   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1679     /* EVIDENCE-OF: R-03640-13415 A value of 5 means the page is an interior
1680     ** table b-tree page. */
1681     assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
1682     /* EVIDENCE-OF: R-20501-61796 A value of 13 means the page is a leaf
1683     ** table b-tree page. */
1684     assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
1685     pPage->intKey = 1;
1686     if( pPage->leaf ){
1687       pPage->intKeyLeaf = 1;
1688       pPage->xParseCell = btreeParseCellPtr;
1689     }else{
1690       pPage->intKeyLeaf = 0;
1691       pPage->xCellSize = cellSizePtrNoPayload;
1692       pPage->xParseCell = btreeParseCellPtrNoPayload;
1693     }
1694     pPage->maxLocal = pBt->maxLeaf;
1695     pPage->minLocal = pBt->minLeaf;
1696   }else if( flagByte==PTF_ZERODATA ){
1697     /* EVIDENCE-OF: R-27225-53936 A value of 2 means the page is an interior
1698     ** index b-tree page. */
1699     assert( (PTF_ZERODATA)==2 );
1700     /* EVIDENCE-OF: R-16571-11615 A value of 10 means the page is a leaf
1701     ** index b-tree page. */
1702     assert( (PTF_ZERODATA|PTF_LEAF)==10 );
1703     pPage->intKey = 0;
1704     pPage->intKeyLeaf = 0;
1705     pPage->xParseCell = btreeParseCellPtrIndex;
1706     pPage->maxLocal = pBt->maxLocal;
1707     pPage->minLocal = pBt->minLocal;
1708   }else{
1709     /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
1710     ** an error. */
1711     return SQLITE_CORRUPT_BKPT;
1712   }
1713   pPage->max1bytePayload = pBt->max1bytePayload;
1714   return SQLITE_OK;
1715 }
1716 
1717 /*
1718 ** Initialize the auxiliary information for a disk block.
1719 **
1720 ** Return SQLITE_OK on success.  If we see that the page does
1721 ** not contain a well-formed database page, then return
1722 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1723 ** guarantee that the page is well-formed.  It only shows that
1724 ** we failed to detect any corruption.
1725 */
1726 static int btreeInitPage(MemPage *pPage){
1727 
1728   assert( pPage->pBt!=0 );
1729   assert( pPage->pBt->db!=0 );
1730   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1731   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1732   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1733   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1734 
1735   if( !pPage->isInit ){
1736     u16 pc;            /* Address of a freeblock within pPage->aData[] */
1737     u8 hdr;            /* Offset to beginning of page header */
1738     u8 *data;          /* Equal to pPage->aData */
1739     BtShared *pBt;        /* The main btree structure */
1740     int usableSize;    /* Amount of usable space on each page */
1741     u16 cellOffset;    /* Offset from start of page to first cell pointer */
1742     int nFree;         /* Number of unused bytes on the page */
1743     int top;           /* First byte of the cell content area */
1744     int iCellFirst;    /* First allowable cell or freeblock offset */
1745     int iCellLast;     /* Last possible cell or freeblock offset */
1746 
1747     pBt = pPage->pBt;
1748 
1749     hdr = pPage->hdrOffset;
1750     data = pPage->aData;
1751     /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
1752     ** the b-tree page type. */
1753     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1754     assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1755     pPage->maskPage = (u16)(pBt->pageSize - 1);
1756     pPage->nOverflow = 0;
1757     usableSize = pBt->usableSize;
1758     pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
1759     pPage->aDataEnd = &data[usableSize];
1760     pPage->aCellIdx = &data[cellOffset];
1761     pPage->aDataOfst = &data[pPage->childPtrSize];
1762     /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
1763     ** the start of the cell content area. A zero value for this integer is
1764     ** interpreted as 65536. */
1765     top = get2byteNotZero(&data[hdr+5]);
1766     /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
1767     ** number of cells on the page. */
1768     pPage->nCell = get2byte(&data[hdr+3]);
1769     if( pPage->nCell>MX_CELL(pBt) ){
1770       /* To many cells for a single page.  The page must be corrupt */
1771       return SQLITE_CORRUPT_BKPT;
1772     }
1773     testcase( pPage->nCell==MX_CELL(pBt) );
1774     /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
1775     ** possible for a root page of a table that contains no rows) then the
1776     ** offset to the cell content area will equal the page size minus the
1777     ** bytes of reserved space. */
1778     assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );
1779 
1780     /* A malformed database page might cause us to read past the end
1781     ** of page when parsing a cell.
1782     **
1783     ** The following block of code checks early to see if a cell extends
1784     ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1785     ** returned if it does.
1786     */
1787     iCellFirst = cellOffset + 2*pPage->nCell;
1788     iCellLast = usableSize - 4;
1789     if( pBt->db->flags & SQLITE_CellSizeCk ){
1790       int i;            /* Index into the cell pointer array */
1791       int sz;           /* Size of a cell */
1792 
1793       if( !pPage->leaf ) iCellLast--;
1794       for(i=0; i<pPage->nCell; i++){
1795         pc = get2byteAligned(&data[cellOffset+i*2]);
1796         testcase( pc==iCellFirst );
1797         testcase( pc==iCellLast );
1798         if( pc<iCellFirst || pc>iCellLast ){
1799           return SQLITE_CORRUPT_BKPT;
1800         }
1801         sz = pPage->xCellSize(pPage, &data[pc]);
1802         testcase( pc+sz==usableSize );
1803         if( pc+sz>usableSize ){
1804           return SQLITE_CORRUPT_BKPT;
1805         }
1806       }
1807       if( !pPage->leaf ) iCellLast++;
1808     }
1809 
1810     /* Compute the total free space on the page
1811     ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
1812     ** start of the first freeblock on the page, or is zero if there are no
1813     ** freeblocks. */
1814     pc = get2byte(&data[hdr+1]);
1815     nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
1816     while( pc>0 ){
1817       u16 next, size;
1818       if( pc<iCellFirst || pc>iCellLast ){
1819         /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
1820         ** always be at least one cell before the first freeblock.
1821         **
1822         ** Or, the freeblock is off the end of the page
1823         */
1824         return SQLITE_CORRUPT_BKPT;
1825       }
1826       next = get2byte(&data[pc]);
1827       size = get2byte(&data[pc+2]);
1828       if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
1829         /* Free blocks must be in ascending order. And the last byte of
1830         ** the free-block must lie on the database page.  */
1831         return SQLITE_CORRUPT_BKPT;
1832       }
1833       nFree = nFree + size;
1834       pc = next;
1835     }
1836 
1837     /* At this point, nFree contains the sum of the offset to the start
1838     ** of the cell-content area plus the number of free bytes within
1839     ** the cell-content area. If this is greater than the usable-size
1840     ** of the page, then the page must be corrupted. This check also
1841     ** serves to verify that the offset to the start of the cell-content
1842     ** area, according to the page header, lies within the page.
1843     */
1844     if( nFree>usableSize ){
1845       return SQLITE_CORRUPT_BKPT;
1846     }
1847     pPage->nFree = (u16)(nFree - iCellFirst);
1848     pPage->isInit = 1;
1849   }
1850   return SQLITE_OK;
1851 }
1852 
1853 /*
1854 ** Set up a raw page so that it looks like a database page holding
1855 ** no entries.
1856 */
1857 static void zeroPage(MemPage *pPage, int flags){
1858   unsigned char *data = pPage->aData;
1859   BtShared *pBt = pPage->pBt;
1860   u8 hdr = pPage->hdrOffset;
1861   u16 first;
1862 
1863   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1864   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1865   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1866   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1867   assert( sqlite3_mutex_held(pBt->mutex) );
1868   if( pBt->btsFlags & BTS_SECURE_DELETE ){
1869     memset(&data[hdr], 0, pBt->usableSize - hdr);
1870   }
1871   data[hdr] = (char)flags;
1872   first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
1873   memset(&data[hdr+1], 0, 4);
1874   data[hdr+7] = 0;
1875   put2byte(&data[hdr+5], pBt->usableSize);
1876   pPage->nFree = (u16)(pBt->usableSize - first);
1877   decodeFlags(pPage, flags);
1878   pPage->cellOffset = first;
1879   pPage->aDataEnd = &data[pBt->usableSize];
1880   pPage->aCellIdx = &data[first];
1881   pPage->aDataOfst = &data[pPage->childPtrSize];
1882   pPage->nOverflow = 0;
1883   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1884   pPage->maskPage = (u16)(pBt->pageSize - 1);
1885   pPage->nCell = 0;
1886   pPage->isInit = 1;
1887 }
1888 
1889 
1890 /*
1891 ** Convert a DbPage obtained from the pager into a MemPage used by
1892 ** the btree layer.
1893 */
1894 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1895   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1896   if( pgno!=pPage->pgno ){
1897     pPage->aData = sqlite3PagerGetData(pDbPage);
1898     pPage->pDbPage = pDbPage;
1899     pPage->pBt = pBt;
1900     pPage->pgno = pgno;
1901     pPage->hdrOffset = pgno==1 ? 100 : 0;
1902   }
1903   assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
1904   return pPage;
1905 }
1906 
1907 /*
1908 ** Get a page from the pager.  Initialize the MemPage.pBt and
1909 ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
1910 **
1911 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
1912 ** about the content of the page at this time.  So do not go to the disk
1913 ** to fetch the content.  Just fill in the content with zeros for now.
1914 ** If in the future we call sqlite3PagerWrite() on this page, that
1915 ** means we have started to be concerned about content and the disk
1916 ** read should occur at that point.
1917 */
1918 static int btreeGetPage(
1919   BtShared *pBt,       /* The btree */
1920   Pgno pgno,           /* Number of the page to fetch */
1921   MemPage **ppPage,    /* Return the page in this parameter */
1922   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
1923 ){
1924   int rc;
1925   DbPage *pDbPage;
1926 
1927   assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
1928   assert( sqlite3_mutex_held(pBt->mutex) );
1929   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
1930   if( rc ) return rc;
1931   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1932   return SQLITE_OK;
1933 }
1934 
1935 /*
1936 ** Retrieve a page from the pager cache. If the requested page is not
1937 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1938 ** MemPage.aData elements if needed.
1939 */
1940 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1941   DbPage *pDbPage;
1942   assert( sqlite3_mutex_held(pBt->mutex) );
1943   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1944   if( pDbPage ){
1945     return btreePageFromDbPage(pDbPage, pgno, pBt);
1946   }
1947   return 0;
1948 }
1949 
1950 /*
1951 ** Return the size of the database file in pages, as currently recorded
1952 ** in BtShared.nPage.
1953 */
1954 static Pgno btreePagecount(BtShared *pBt){
1955   return pBt->nPage;
1956 }
1957 u32 sqlite3BtreeLastPage(Btree *p){
1958   assert( sqlite3BtreeHoldsMutex(p) );
1959   assert( ((p->pBt->nPage)&0x8000000)==0 );
1960   return btreePagecount(p->pBt);
1961 }
1962 
1963 /*
1964 ** Get a page from the pager and initialize it.
1965 **
1966 ** If pCur!=0 then the page is being fetched as part of a moveToChild()
1967 ** call.  Do additional sanity checking on the page in this case.
1968 ** And if the fetch fails, this routine must decrement pCur->iPage.
1969 **
1970 ** The page is fetched as read-write unless pCur is not NULL and is
1971 ** a read-only cursor.
1972 **
1973 ** If an error occurs, then *ppPage is undefined. It
1974 ** may remain unchanged, or it may be set to an invalid value.
1975 */
1976 static int getAndInitPage(
1977   BtShared *pBt,                  /* The database file */
1978   Pgno pgno,                      /* Number of the page to get */
1979   MemPage **ppPage,               /* Write the page pointer here */
1980   BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
1981   int bReadOnly                   /* True for a read-only page */
1982 ){
1983   int rc;
1984   DbPage *pDbPage;
1985   assert( sqlite3_mutex_held(pBt->mutex) );
1986   assert( pCur==0 || ppPage==&pCur->apPage[pCur->iPage] );
1987   assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
1988   assert( pCur==0 || pCur->iPage>0 );
1989 
1990   if( pgno>btreePagecount(pBt) ){
1991     rc = SQLITE_CORRUPT_BKPT;
1992     goto getAndInitPage_error;
1993   }
1994   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
1995   if( rc ){
1996     goto getAndInitPage_error;
1997   }
1998   *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1999   if( (*ppPage)->isInit==0 ){
2000     btreePageFromDbPage(pDbPage, pgno, pBt);
2001     rc = btreeInitPage(*ppPage);
2002     if( rc!=SQLITE_OK ){
2003       releasePage(*ppPage);
2004       goto getAndInitPage_error;
2005     }
2006   }
2007   assert( (*ppPage)->pgno==pgno );
2008   assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
2009 
2010   /* If obtaining a child page for a cursor, we must verify that the page is
2011   ** compatible with the root page. */
2012   if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
2013     rc = SQLITE_CORRUPT_BKPT;
2014     releasePage(*ppPage);
2015     goto getAndInitPage_error;
2016   }
2017   return SQLITE_OK;
2018 
2019 getAndInitPage_error:
2020   if( pCur ) pCur->iPage--;
2021   testcase( pgno==0 );
2022   assert( pgno!=0 || rc==SQLITE_CORRUPT );
2023   return rc;
2024 }
2025 
2026 /*
2027 ** Release a MemPage.  This should be called once for each prior
2028 ** call to btreeGetPage.
2029 */
2030 static void releasePageNotNull(MemPage *pPage){
2031   assert( pPage->aData );
2032   assert( pPage->pBt );
2033   assert( pPage->pDbPage!=0 );
2034   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2035   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2036   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2037   sqlite3PagerUnrefNotNull(pPage->pDbPage);
2038 }
2039 static void releasePage(MemPage *pPage){
2040   if( pPage ) releasePageNotNull(pPage);
2041 }
2042 
2043 /*
2044 ** Get an unused page.
2045 **
2046 ** This works just like btreeGetPage() with the addition:
2047 **
2048 **   *  If the page is already in use for some other purpose, immediately
2049 **      release it and return an SQLITE_CORRUPT error.
2050 **   *  Make sure the isInit flag is clear
2051 */
2052 static int btreeGetUnusedPage(
2053   BtShared *pBt,       /* The btree */
2054   Pgno pgno,           /* Number of the page to fetch */
2055   MemPage **ppPage,    /* Return the page in this parameter */
2056   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2057 ){
2058   int rc = btreeGetPage(pBt, pgno, ppPage, flags);
2059   if( rc==SQLITE_OK ){
2060     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
2061       releasePage(*ppPage);
2062       *ppPage = 0;
2063       return SQLITE_CORRUPT_BKPT;
2064     }
2065     (*ppPage)->isInit = 0;
2066   }else{
2067     *ppPage = 0;
2068   }
2069   return rc;
2070 }
2071 
2072 
2073 /*
2074 ** During a rollback, when the pager reloads information into the cache
2075 ** so that the cache is restored to its original state at the start of
2076 ** the transaction, for each page restored this routine is called.
2077 **
2078 ** This routine needs to reset the extra data section at the end of the
2079 ** page to agree with the restored data.
2080 */
2081 static void pageReinit(DbPage *pData){
2082   MemPage *pPage;
2083   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
2084   assert( sqlite3PagerPageRefcount(pData)>0 );
2085   if( pPage->isInit ){
2086     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2087     pPage->isInit = 0;
2088     if( sqlite3PagerPageRefcount(pData)>1 ){
2089       /* pPage might not be a btree page;  it might be an overflow page
2090       ** or ptrmap page or a free page.  In those cases, the following
2091       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
2092       ** But no harm is done by this.  And it is very important that
2093       ** btreeInitPage() be called on every btree page so we make
2094       ** the call for every page that comes in for re-initing. */
2095       btreeInitPage(pPage);
2096     }
2097   }
2098 }
2099 
2100 /*
2101 ** Invoke the busy handler for a btree.
2102 */
2103 static int btreeInvokeBusyHandler(void *pArg){
2104   BtShared *pBt = (BtShared*)pArg;
2105   assert( pBt->db );
2106   assert( sqlite3_mutex_held(pBt->db->mutex) );
2107   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
2108 }
2109 
2110 /*
2111 ** Open a database file.
2112 **
2113 ** zFilename is the name of the database file.  If zFilename is NULL
2114 ** then an ephemeral database is created.  The ephemeral database might
2115 ** be exclusively in memory, or it might use a disk-based memory cache.
2116 ** Either way, the ephemeral database will be automatically deleted
2117 ** when sqlite3BtreeClose() is called.
2118 **
2119 ** If zFilename is ":memory:" then an in-memory database is created
2120 ** that is automatically destroyed when it is closed.
2121 **
2122 ** The "flags" parameter is a bitmask that might contain bits like
2123 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
2124 **
2125 ** If the database is already opened in the same database connection
2126 ** and we are in shared cache mode, then the open will fail with an
2127 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
2128 ** objects in the same database connection since doing so will lead
2129 ** to problems with locking.
2130 */
2131 int sqlite3BtreeOpen(
2132   sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
2133   const char *zFilename,  /* Name of the file containing the BTree database */
2134   sqlite3 *db,            /* Associated database handle */
2135   Btree **ppBtree,        /* Pointer to new Btree object written here */
2136   int flags,              /* Options */
2137   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
2138 ){
2139   BtShared *pBt = 0;             /* Shared part of btree structure */
2140   Btree *p;                      /* Handle to return */
2141   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
2142   int rc = SQLITE_OK;            /* Result code from this function */
2143   u8 nReserve;                   /* Byte of unused space on each page */
2144   unsigned char zDbHeader[100];  /* Database header content */
2145 
2146   /* True if opening an ephemeral, temporary database */
2147   const int isTempDb = zFilename==0 || zFilename[0]==0;
2148 
2149   /* Set the variable isMemdb to true for an in-memory database, or
2150   ** false for a file-based database.
2151   */
2152 #ifdef SQLITE_OMIT_MEMORYDB
2153   const int isMemdb = 0;
2154 #else
2155   const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
2156                        || (isTempDb && sqlite3TempInMemory(db))
2157                        || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
2158 #endif
2159 
2160   assert( db!=0 );
2161   assert( pVfs!=0 );
2162   assert( sqlite3_mutex_held(db->mutex) );
2163   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
2164 
2165   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
2166   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
2167 
2168   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
2169   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
2170 
2171   if( isMemdb ){
2172     flags |= BTREE_MEMORY;
2173   }
2174   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
2175     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
2176   }
2177   p = sqlite3MallocZero(sizeof(Btree));
2178   if( !p ){
2179     return SQLITE_NOMEM_BKPT;
2180   }
2181   p->inTrans = TRANS_NONE;
2182   p->db = db;
2183 #ifndef SQLITE_OMIT_SHARED_CACHE
2184   p->lock.pBtree = p;
2185   p->lock.iTable = 1;
2186 #endif
2187 
2188 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2189   /*
2190   ** If this Btree is a candidate for shared cache, try to find an
2191   ** existing BtShared object that we can share with
2192   */
2193   if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
2194     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
2195       int nFilename = sqlite3Strlen30(zFilename)+1;
2196       int nFullPathname = pVfs->mxPathname+1;
2197       char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
2198       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2199 
2200       p->sharable = 1;
2201       if( !zFullPathname ){
2202         sqlite3_free(p);
2203         return SQLITE_NOMEM_BKPT;
2204       }
2205       if( isMemdb ){
2206         memcpy(zFullPathname, zFilename, nFilename);
2207       }else{
2208         rc = sqlite3OsFullPathname(pVfs, zFilename,
2209                                    nFullPathname, zFullPathname);
2210         if( rc ){
2211           sqlite3_free(zFullPathname);
2212           sqlite3_free(p);
2213           return rc;
2214         }
2215       }
2216 #if SQLITE_THREADSAFE
2217       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
2218       sqlite3_mutex_enter(mutexOpen);
2219       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
2220       sqlite3_mutex_enter(mutexShared);
2221 #endif
2222       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
2223         assert( pBt->nRef>0 );
2224         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
2225                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
2226           int iDb;
2227           for(iDb=db->nDb-1; iDb>=0; iDb--){
2228             Btree *pExisting = db->aDb[iDb].pBt;
2229             if( pExisting && pExisting->pBt==pBt ){
2230               sqlite3_mutex_leave(mutexShared);
2231               sqlite3_mutex_leave(mutexOpen);
2232               sqlite3_free(zFullPathname);
2233               sqlite3_free(p);
2234               return SQLITE_CONSTRAINT;
2235             }
2236           }
2237           p->pBt = pBt;
2238           pBt->nRef++;
2239           break;
2240         }
2241       }
2242       sqlite3_mutex_leave(mutexShared);
2243       sqlite3_free(zFullPathname);
2244     }
2245 #ifdef SQLITE_DEBUG
2246     else{
2247       /* In debug mode, we mark all persistent databases as sharable
2248       ** even when they are not.  This exercises the locking code and
2249       ** gives more opportunity for asserts(sqlite3_mutex_held())
2250       ** statements to find locking problems.
2251       */
2252       p->sharable = 1;
2253     }
2254 #endif
2255   }
2256 #endif
2257   if( pBt==0 ){
2258     /*
2259     ** The following asserts make sure that structures used by the btree are
2260     ** the right size.  This is to guard against size changes that result
2261     ** when compiling on a different architecture.
2262     */
2263     assert( sizeof(i64)==8 );
2264     assert( sizeof(u64)==8 );
2265     assert( sizeof(u32)==4 );
2266     assert( sizeof(u16)==2 );
2267     assert( sizeof(Pgno)==4 );
2268 
2269     pBt = sqlite3MallocZero( sizeof(*pBt) );
2270     if( pBt==0 ){
2271       rc = SQLITE_NOMEM_BKPT;
2272       goto btree_open_out;
2273     }
2274     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
2275                           EXTRA_SIZE, flags, vfsFlags, pageReinit);
2276     if( rc==SQLITE_OK ){
2277       sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
2278       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
2279     }
2280     if( rc!=SQLITE_OK ){
2281       goto btree_open_out;
2282     }
2283     pBt->openFlags = (u8)flags;
2284     pBt->db = db;
2285     sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
2286     p->pBt = pBt;
2287 
2288     pBt->pCursor = 0;
2289     pBt->pPage1 = 0;
2290     if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
2291 #ifdef SQLITE_SECURE_DELETE
2292     pBt->btsFlags |= BTS_SECURE_DELETE;
2293 #endif
2294     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2295     ** determined by the 2-byte integer located at an offset of 16 bytes from
2296     ** the beginning of the database file. */
2297     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
2298     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
2299          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
2300       pBt->pageSize = 0;
2301 #ifndef SQLITE_OMIT_AUTOVACUUM
2302       /* If the magic name ":memory:" will create an in-memory database, then
2303       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
2304       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
2305       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
2306       ** regular file-name. In this case the auto-vacuum applies as per normal.
2307       */
2308       if( zFilename && !isMemdb ){
2309         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
2310         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
2311       }
2312 #endif
2313       nReserve = 0;
2314     }else{
2315       /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2316       ** determined by the one-byte unsigned integer found at an offset of 20
2317       ** into the database file header. */
2318       nReserve = zDbHeader[20];
2319       pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2320 #ifndef SQLITE_OMIT_AUTOVACUUM
2321       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
2322       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
2323 #endif
2324     }
2325     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2326     if( rc ) goto btree_open_out;
2327     pBt->usableSize = pBt->pageSize - nReserve;
2328     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
2329 
2330 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2331     /* Add the new BtShared object to the linked list sharable BtShareds.
2332     */
2333     if( p->sharable ){
2334       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2335       pBt->nRef = 1;
2336       MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
2337       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
2338         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
2339         if( pBt->mutex==0 ){
2340           rc = SQLITE_NOMEM_BKPT;
2341           goto btree_open_out;
2342         }
2343       }
2344       sqlite3_mutex_enter(mutexShared);
2345       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
2346       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
2347       sqlite3_mutex_leave(mutexShared);
2348     }
2349 #endif
2350   }
2351 
2352 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2353   /* If the new Btree uses a sharable pBtShared, then link the new
2354   ** Btree into the list of all sharable Btrees for the same connection.
2355   ** The list is kept in ascending order by pBt address.
2356   */
2357   if( p->sharable ){
2358     int i;
2359     Btree *pSib;
2360     for(i=0; i<db->nDb; i++){
2361       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
2362         while( pSib->pPrev ){ pSib = pSib->pPrev; }
2363         if( (uptr)p->pBt<(uptr)pSib->pBt ){
2364           p->pNext = pSib;
2365           p->pPrev = 0;
2366           pSib->pPrev = p;
2367         }else{
2368           while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
2369             pSib = pSib->pNext;
2370           }
2371           p->pNext = pSib->pNext;
2372           p->pPrev = pSib;
2373           if( p->pNext ){
2374             p->pNext->pPrev = p;
2375           }
2376           pSib->pNext = p;
2377         }
2378         break;
2379       }
2380     }
2381   }
2382 #endif
2383   *ppBtree = p;
2384 
2385 btree_open_out:
2386   if( rc!=SQLITE_OK ){
2387     if( pBt && pBt->pPager ){
2388       sqlite3PagerClose(pBt->pPager);
2389     }
2390     sqlite3_free(pBt);
2391     sqlite3_free(p);
2392     *ppBtree = 0;
2393   }else{
2394     /* If the B-Tree was successfully opened, set the pager-cache size to the
2395     ** default value. Except, when opening on an existing shared pager-cache,
2396     ** do not change the pager-cache size.
2397     */
2398     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
2399       sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
2400     }
2401   }
2402   if( mutexOpen ){
2403     assert( sqlite3_mutex_held(mutexOpen) );
2404     sqlite3_mutex_leave(mutexOpen);
2405   }
2406   return rc;
2407 }
2408 
2409 /*
2410 ** Decrement the BtShared.nRef counter.  When it reaches zero,
2411 ** remove the BtShared structure from the sharing list.  Return
2412 ** true if the BtShared.nRef counter reaches zero and return
2413 ** false if it is still positive.
2414 */
2415 static int removeFromSharingList(BtShared *pBt){
2416 #ifndef SQLITE_OMIT_SHARED_CACHE
2417   MUTEX_LOGIC( sqlite3_mutex *pMaster; )
2418   BtShared *pList;
2419   int removed = 0;
2420 
2421   assert( sqlite3_mutex_notheld(pBt->mutex) );
2422   MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
2423   sqlite3_mutex_enter(pMaster);
2424   pBt->nRef--;
2425   if( pBt->nRef<=0 ){
2426     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
2427       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
2428     }else{
2429       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
2430       while( ALWAYS(pList) && pList->pNext!=pBt ){
2431         pList=pList->pNext;
2432       }
2433       if( ALWAYS(pList) ){
2434         pList->pNext = pBt->pNext;
2435       }
2436     }
2437     if( SQLITE_THREADSAFE ){
2438       sqlite3_mutex_free(pBt->mutex);
2439     }
2440     removed = 1;
2441   }
2442   sqlite3_mutex_leave(pMaster);
2443   return removed;
2444 #else
2445   return 1;
2446 #endif
2447 }
2448 
2449 /*
2450 ** Make sure pBt->pTmpSpace points to an allocation of
2451 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
2452 ** pointer.
2453 */
2454 static void allocateTempSpace(BtShared *pBt){
2455   if( !pBt->pTmpSpace ){
2456     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2457 
2458     /* One of the uses of pBt->pTmpSpace is to format cells before
2459     ** inserting them into a leaf page (function fillInCell()). If
2460     ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
2461     ** by the various routines that manipulate binary cells. Which
2462     ** can mean that fillInCell() only initializes the first 2 or 3
2463     ** bytes of pTmpSpace, but that the first 4 bytes are copied from
2464     ** it into a database page. This is not actually a problem, but it
2465     ** does cause a valgrind error when the 1 or 2 bytes of unitialized
2466     ** data is passed to system call write(). So to avoid this error,
2467     ** zero the first 4 bytes of temp space here.
2468     **
2469     ** Also:  Provide four bytes of initialized space before the
2470     ** beginning of pTmpSpace as an area available to prepend the
2471     ** left-child pointer to the beginning of a cell.
2472     */
2473     if( pBt->pTmpSpace ){
2474       memset(pBt->pTmpSpace, 0, 8);
2475       pBt->pTmpSpace += 4;
2476     }
2477   }
2478 }
2479 
2480 /*
2481 ** Free the pBt->pTmpSpace allocation
2482 */
2483 static void freeTempSpace(BtShared *pBt){
2484   if( pBt->pTmpSpace ){
2485     pBt->pTmpSpace -= 4;
2486     sqlite3PageFree(pBt->pTmpSpace);
2487     pBt->pTmpSpace = 0;
2488   }
2489 }
2490 
2491 /*
2492 ** Close an open database and invalidate all cursors.
2493 */
2494 int sqlite3BtreeClose(Btree *p){
2495   BtShared *pBt = p->pBt;
2496   BtCursor *pCur;
2497 
2498   /* Close all cursors opened via this handle.  */
2499   assert( sqlite3_mutex_held(p->db->mutex) );
2500   sqlite3BtreeEnter(p);
2501   pCur = pBt->pCursor;
2502   while( pCur ){
2503     BtCursor *pTmp = pCur;
2504     pCur = pCur->pNext;
2505     if( pTmp->pBtree==p ){
2506       sqlite3BtreeCloseCursor(pTmp);
2507     }
2508   }
2509 
2510   /* Rollback any active transaction and free the handle structure.
2511   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2512   ** this handle.
2513   */
2514   sqlite3BtreeRollback(p, SQLITE_OK, 0);
2515   sqlite3BtreeLeave(p);
2516 
2517   /* If there are still other outstanding references to the shared-btree
2518   ** structure, return now. The remainder of this procedure cleans
2519   ** up the shared-btree.
2520   */
2521   assert( p->wantToLock==0 && p->locked==0 );
2522   if( !p->sharable || removeFromSharingList(pBt) ){
2523     /* The pBt is no longer on the sharing list, so we can access
2524     ** it without having to hold the mutex.
2525     **
2526     ** Clean out and delete the BtShared object.
2527     */
2528     assert( !pBt->pCursor );
2529     sqlite3PagerClose(pBt->pPager);
2530     if( pBt->xFreeSchema && pBt->pSchema ){
2531       pBt->xFreeSchema(pBt->pSchema);
2532     }
2533     sqlite3DbFree(0, pBt->pSchema);
2534     freeTempSpace(pBt);
2535     sqlite3_free(pBt);
2536   }
2537 
2538 #ifndef SQLITE_OMIT_SHARED_CACHE
2539   assert( p->wantToLock==0 );
2540   assert( p->locked==0 );
2541   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2542   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2543 #endif
2544 
2545   sqlite3_free(p);
2546   return SQLITE_OK;
2547 }
2548 
2549 /*
2550 ** Change the "soft" limit on the number of pages in the cache.
2551 ** Unused and unmodified pages will be recycled when the number of
2552 ** pages in the cache exceeds this soft limit.  But the size of the
2553 ** cache is allowed to grow larger than this limit if it contains
2554 ** dirty pages or pages still in active use.
2555 */
2556 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2557   BtShared *pBt = p->pBt;
2558   assert( sqlite3_mutex_held(p->db->mutex) );
2559   sqlite3BtreeEnter(p);
2560   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2561   sqlite3BtreeLeave(p);
2562   return SQLITE_OK;
2563 }
2564 
2565 /*
2566 ** Change the "spill" limit on the number of pages in the cache.
2567 ** If the number of pages exceeds this limit during a write transaction,
2568 ** the pager might attempt to "spill" pages to the journal early in
2569 ** order to free up memory.
2570 **
2571 ** The value returned is the current spill size.  If zero is passed
2572 ** as an argument, no changes are made to the spill size setting, so
2573 ** using mxPage of 0 is a way to query the current spill size.
2574 */
2575 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
2576   BtShared *pBt = p->pBt;
2577   int res;
2578   assert( sqlite3_mutex_held(p->db->mutex) );
2579   sqlite3BtreeEnter(p);
2580   res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
2581   sqlite3BtreeLeave(p);
2582   return res;
2583 }
2584 
#if SQLITE_MAX_MMAP_SIZE>0
/*
** Set the upper bound, in bytes, on how much of the database file may
** be memory mapped.  The value szMmap is handed to
** sqlite3PagerSetMmapLimit() with the b-tree mutex held.
**
** Always returns SQLITE_OK.  Only compiled when SQLITE_MAX_MMAP_SIZE
** is greater than zero.
*/
int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
  assert( sqlite3_mutex_held(p->db->mutex) );
  sqlite3BtreeEnter(p);
  sqlite3PagerSetMmapLimit(p->pBt->pPager, szMmap);
  sqlite3BtreeLeave(p);
  return SQLITE_OK;
}
#endif /* SQLITE_MAX_MMAP_SIZE>0 */
2599 
2600 /*
2601 ** Change the way data is synced to disk in order to increase or decrease
2602 ** how well the database resists damage due to OS crashes and power
2603 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2604 ** there is a high probability of damage)  Level 2 is the default.  There
2605 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2606 ** probability of damage to near zero but with a write performance reduction.
2607 */
2608 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2609 int sqlite3BtreeSetPagerFlags(
2610   Btree *p,              /* The btree to set the safety level on */
2611   unsigned pgFlags       /* Various PAGER_* flags */
2612 ){
2613   BtShared *pBt = p->pBt;
2614   assert( sqlite3_mutex_held(p->db->mutex) );
2615   sqlite3BtreeEnter(p);
2616   sqlite3PagerSetFlags(pBt->pPager, pgFlags);
2617   sqlite3BtreeLeave(p);
2618   return SQLITE_OK;
2619 }
2620 #endif
2621 
2622 /*
2623 ** Change the default pages size and the number of reserved bytes per page.
2624 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2625 ** without changing anything.
2626 **
2627 ** The page size must be a power of 2 between 512 and 65536.  If the page
2628 ** size supplied does not meet this constraint then the page size is not
2629 ** changed.
2630 **
2631 ** Page sizes are constrained to be a power of two so that the region
2632 ** of the database file used for locking (beginning at PENDING_BYTE,
2633 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2634 ** at the beginning of a page.
2635 **
2636 ** If parameter nReserve is less than zero, then the number of reserved
2637 ** bytes per page is left unchanged.
2638 **
2639 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2640 ** and autovacuum mode can no longer be changed.
2641 */
2642 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2643   int rc = SQLITE_OK;
2644   BtShared *pBt = p->pBt;
2645   assert( nReserve>=-1 && nReserve<=255 );
2646   sqlite3BtreeEnter(p);
2647 #if SQLITE_HAS_CODEC
2648   if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;
2649 #endif
2650   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2651     sqlite3BtreeLeave(p);
2652     return SQLITE_READONLY;
2653   }
2654   if( nReserve<0 ){
2655     nReserve = pBt->pageSize - pBt->usableSize;
2656   }
2657   assert( nReserve>=0 && nReserve<=255 );
2658   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2659         ((pageSize-1)&pageSize)==0 ){
2660     assert( (pageSize & 7)==0 );
2661     assert( !pBt->pCursor );
2662     pBt->pageSize = (u32)pageSize;
2663     freeTempSpace(pBt);
2664   }
2665   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2666   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2667   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2668   sqlite3BtreeLeave(p);
2669   return rc;
2670 }
2671 
2672 /*
2673 ** Return the currently defined page size
2674 */
2675 int sqlite3BtreeGetPageSize(Btree *p){
2676   return p->pBt->pageSize;
2677 }
2678 
2679 /*
2680 ** This function is similar to sqlite3BtreeGetReserve(), except that it
2681 ** may only be called if it is guaranteed that the b-tree mutex is already
2682 ** held.
2683 **
2684 ** This is useful in one special case in the backup API code where it is
2685 ** known that the shared b-tree mutex is held, but the mutex on the
2686 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
2687 ** were to be called, it might collide with some other operation on the
2688 ** database handle that owns *p, causing undefined behavior.
2689 */
2690 int sqlite3BtreeGetReserveNoMutex(Btree *p){
2691   int n;
2692   assert( sqlite3_mutex_held(p->pBt->mutex) );
2693   n = p->pBt->pageSize - p->pBt->usableSize;
2694   return n;
2695 }
2696 
2697 /*
2698 ** Return the number of bytes of space at the end of every page that
2699 ** are intentually left unused.  This is the "reserved" space that is
2700 ** sometimes used by extensions.
2701 **
2702 ** If SQLITE_HAS_MUTEX is defined then the number returned is the
2703 ** greater of the current reserved space and the maximum requested
2704 ** reserve space.
2705 */
2706 int sqlite3BtreeGetOptimalReserve(Btree *p){
2707   int n;
2708   sqlite3BtreeEnter(p);
2709   n = sqlite3BtreeGetReserveNoMutex(p);
2710 #ifdef SQLITE_HAS_CODEC
2711   if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;
2712 #endif
2713   sqlite3BtreeLeave(p);
2714   return n;
2715 }
2716 
2717 
2718 /*
2719 ** Set the maximum page count for a database if mxPage is positive.
2720 ** No changes are made if mxPage is 0 or negative.
2721 ** Regardless of the value of mxPage, return the maximum page count.
2722 */
2723 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2724   int n;
2725   sqlite3BtreeEnter(p);
2726   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2727   sqlite3BtreeLeave(p);
2728   return n;
2729 }
2730 
2731 /*
2732 ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1.  If newFlag is -1,
2733 ** then make no changes.  Always return the value of the BTS_SECURE_DELETE
2734 ** setting after the change.
2735 */
2736 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2737   int b;
2738   if( p==0 ) return 0;
2739   sqlite3BtreeEnter(p);
2740   if( newFlag>=0 ){
2741     p->pBt->btsFlags &= ~BTS_SECURE_DELETE;
2742     if( newFlag ) p->pBt->btsFlags |= BTS_SECURE_DELETE;
2743   }
2744   b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;
2745   sqlite3BtreeLeave(p);
2746   return b;
2747 }
2748 
2749 /*
2750 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2751 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2752 ** is disabled. The default value for the auto-vacuum property is
2753 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2754 */
2755 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2756 #ifdef SQLITE_OMIT_AUTOVACUUM
2757   return SQLITE_READONLY;
2758 #else
2759   BtShared *pBt = p->pBt;
2760   int rc = SQLITE_OK;
2761   u8 av = (u8)autoVacuum;
2762 
2763   sqlite3BtreeEnter(p);
2764   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
2765     rc = SQLITE_READONLY;
2766   }else{
2767     pBt->autoVacuum = av ?1:0;
2768     pBt->incrVacuum = av==2 ?1:0;
2769   }
2770   sqlite3BtreeLeave(p);
2771   return rc;
2772 #endif
2773 }
2774 
2775 /*
2776 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2777 ** enabled 1 is returned. Otherwise 0.
2778 */
2779 int sqlite3BtreeGetAutoVacuum(Btree *p){
2780 #ifdef SQLITE_OMIT_AUTOVACUUM
2781   return BTREE_AUTOVACUUM_NONE;
2782 #else
2783   int rc;
2784   sqlite3BtreeEnter(p);
2785   rc = (
2786     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2787     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2788     BTREE_AUTOVACUUM_INCR
2789   );
2790   sqlite3BtreeLeave(p);
2791   return rc;
2792 #endif
2793 }
2794 
2795 
2796 /*
2797 ** Get a reference to pPage1 of the database file.  This will
2798 ** also acquire a readlock on that file.
2799 **
2800 ** SQLITE_OK is returned on success.  If the file is not a
2801 ** well-formed database file, then SQLITE_CORRUPT is returned.
2802 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
2803 ** is returned if we run out of memory.
2804 */
2805 static int lockBtree(BtShared *pBt){
2806   int rc;              /* Result code from subfunctions */
2807   MemPage *pPage1;     /* Page 1 of the database file */
2808   int nPage;           /* Number of pages in the database */
2809   int nPageFile = 0;   /* Number of pages in the database file */
2810   int nPageHeader;     /* Number of pages in the database according to hdr */
2811 
2812   assert( sqlite3_mutex_held(pBt->mutex) );
2813   assert( pBt->pPage1==0 );
2814   rc = sqlite3PagerSharedLock(pBt->pPager);
2815   if( rc!=SQLITE_OK ) return rc;
2816   rc = btreeGetPage(pBt, 1, &pPage1, 0);
2817   if( rc!=SQLITE_OK ) return rc;
2818 
2819   /* Do some checking to help insure the file we opened really is
2820   ** a valid database file.
2821   */
2822   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
2823   sqlite3PagerPagecount(pBt->pPager, &nPageFile);
2824   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
2825     nPage = nPageFile;
2826   }
2827   if( nPage>0 ){
2828     u32 pageSize;
2829     u32 usableSize;
2830     u8 *page1 = pPage1->aData;
2831     rc = SQLITE_NOTADB;
2832     /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
2833     ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
2834     ** 61 74 20 33 00. */
2835     if( memcmp(page1, zMagicHeader, 16)!=0 ){
2836       goto page1_init_failed;
2837     }
2838 
2839 #ifdef SQLITE_OMIT_WAL
2840     if( page1[18]>1 ){
2841       pBt->btsFlags |= BTS_READ_ONLY;
2842     }
2843     if( page1[19]>1 ){
2844       goto page1_init_failed;
2845     }
2846 #else
2847     if( page1[18]>2 ){
2848       pBt->btsFlags |= BTS_READ_ONLY;
2849     }
2850     if( page1[19]>2 ){
2851       goto page1_init_failed;
2852     }
2853 
2854     /* If the write version is set to 2, this database should be accessed
2855     ** in WAL mode. If the log is not already open, open it now. Then
2856     ** return SQLITE_OK and return without populating BtShared.pPage1.
2857     ** The caller detects this and calls this function again. This is
2858     ** required as the version of page 1 currently in the page1 buffer
2859     ** may not be the latest version - there may be a newer one in the log
2860     ** file.
2861     */
2862     if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
2863       int isOpen = 0;
2864       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
2865       if( rc!=SQLITE_OK ){
2866         goto page1_init_failed;
2867       }else{
2868 #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS
2869         sqlite3 *db;
2870         Db *pDb;
2871         if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){
2872           while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; }
2873           if( pDb->bSyncSet==0
2874            && pDb->safety_level==SQLITE_DEFAULT_SYNCHRONOUS+1
2875           ){
2876             pDb->safety_level = SQLITE_DEFAULT_WAL_SYNCHRONOUS+1;
2877             sqlite3PagerSetFlags(pBt->pPager,
2878                pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
2879           }
2880         }
2881 #endif
2882         if( isOpen==0 ){
2883           releasePage(pPage1);
2884           return SQLITE_OK;
2885         }
2886       }
2887       rc = SQLITE_NOTADB;
2888     }
2889 #endif
2890 
2891     /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
2892     ** fractions and the leaf payload fraction values must be 64, 32, and 32.
2893     **
2894     ** The original design allowed these amounts to vary, but as of
2895     ** version 3.6.0, we require them to be fixed.
2896     */
2897     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2898       goto page1_init_failed;
2899     }
2900     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2901     ** determined by the 2-byte integer located at an offset of 16 bytes from
2902     ** the beginning of the database file. */
2903     pageSize = (page1[16]<<8) | (page1[17]<<16);
2904     /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
2905     ** between 512 and 65536 inclusive. */
2906     if( ((pageSize-1)&pageSize)!=0
2907      || pageSize>SQLITE_MAX_PAGE_SIZE
2908      || pageSize<=256
2909     ){
2910       goto page1_init_failed;
2911     }
2912     assert( (pageSize & 7)==0 );
2913     /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
2914     ** integer at offset 20 is the number of bytes of space at the end of
2915     ** each page to reserve for extensions.
2916     **
2917     ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2918     ** determined by the one-byte unsigned integer found at an offset of 20
2919     ** into the database file header. */
2920     usableSize = pageSize - page1[20];
2921     if( (u32)pageSize!=pBt->pageSize ){
2922       /* After reading the first page of the database assuming a page size
2923       ** of BtShared.pageSize, we have discovered that the page-size is
2924       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2925       ** zero and return SQLITE_OK. The caller will call this function
2926       ** again with the correct page-size.
2927       */
2928       releasePage(pPage1);
2929       pBt->usableSize = usableSize;
2930       pBt->pageSize = pageSize;
2931       freeTempSpace(pBt);
2932       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2933                                    pageSize-usableSize);
2934       return rc;
2935     }
2936     if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
2937       rc = SQLITE_CORRUPT_BKPT;
2938       goto page1_init_failed;
2939     }
2940     /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
2941     ** be less than 480. In other words, if the page size is 512, then the
2942     ** reserved space size cannot exceed 32. */
2943     if( usableSize<480 ){
2944       goto page1_init_failed;
2945     }
2946     pBt->pageSize = pageSize;
2947     pBt->usableSize = usableSize;
2948 #ifndef SQLITE_OMIT_AUTOVACUUM
2949     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2950     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2951 #endif
2952   }
2953 
2954   /* maxLocal is the maximum amount of payload to store locally for
2955   ** a cell.  Make sure it is small enough so that at least minFanout
2956   ** cells can will fit on one page.  We assume a 10-byte page header.
2957   ** Besides the payload, the cell must store:
2958   **     2-byte pointer to the cell
2959   **     4-byte child pointer
2960   **     9-byte nKey value
2961   **     4-byte nData value
2962   **     4-byte overflow page pointer
2963   ** So a cell consists of a 2-byte pointer, a header which is as much as
2964   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2965   ** page pointer.
2966   */
2967   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
2968   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
2969   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
2970   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
2971   if( pBt->maxLocal>127 ){
2972     pBt->max1bytePayload = 127;
2973   }else{
2974     pBt->max1bytePayload = (u8)pBt->maxLocal;
2975   }
2976   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2977   pBt->pPage1 = pPage1;
2978   pBt->nPage = nPage;
2979   return SQLITE_OK;
2980 
2981 page1_init_failed:
2982   releasePage(pPage1);
2983   pBt->pPage1 = 0;
2984   return rc;
2985 }
2986 
2987 #ifndef NDEBUG
2988 /*
2989 ** Return the number of cursors open on pBt. This is for use
2990 ** in assert() expressions, so it is only compiled if NDEBUG is not
2991 ** defined.
2992 **
2993 ** Only write cursors are counted if wrOnly is true.  If wrOnly is
2994 ** false then all cursors are counted.
2995 **
2996 ** For the purposes of this routine, a cursor is any cursor that
2997 ** is capable of reading or writing to the database.  Cursors that
2998 ** have been tripped into the CURSOR_FAULT state are not counted.
2999 */
3000 static int countValidCursors(BtShared *pBt, int wrOnly){
3001   BtCursor *pCur;
3002   int r = 0;
3003   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3004     if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
3005      && pCur->eState!=CURSOR_FAULT ) r++;
3006   }
3007   return r;
3008 }
3009 #endif
3010 
3011 /*
3012 ** If there are no outstanding cursors and we are not in the middle
3013 ** of a transaction but there is a read lock on the database, then
3014 ** this routine unrefs the first page of the database file which
3015 ** has the effect of releasing the read lock.
3016 **
3017 ** If there is a transaction in progress, this routine is a no-op.
3018 */
3019 static void unlockBtreeIfUnused(BtShared *pBt){
3020   assert( sqlite3_mutex_held(pBt->mutex) );
3021   assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
3022   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
3023     MemPage *pPage1 = pBt->pPage1;
3024     assert( pPage1->aData );
3025     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
3026     pBt->pPage1 = 0;
3027     releasePageNotNull(pPage1);
3028   }
3029 }
3030 
3031 /*
3032 ** If pBt points to an empty file then convert that empty file
3033 ** into a new empty database by initializing the first page of
3034 ** the database.
3035 */
3036 static int newDatabase(BtShared *pBt){
3037   MemPage *pP1;
3038   unsigned char *data;
3039   int rc;
3040 
3041   assert( sqlite3_mutex_held(pBt->mutex) );
3042   if( pBt->nPage>0 ){
3043     return SQLITE_OK;
3044   }
3045   pP1 = pBt->pPage1;
3046   assert( pP1!=0 );
3047   data = pP1->aData;
3048   rc = sqlite3PagerWrite(pP1->pDbPage);
3049   if( rc ) return rc;
3050   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
3051   assert( sizeof(zMagicHeader)==16 );
3052   data[16] = (u8)((pBt->pageSize>>8)&0xff);
3053   data[17] = (u8)((pBt->pageSize>>16)&0xff);
3054   data[18] = 1;
3055   data[19] = 1;
3056   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
3057   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
3058   data[21] = 64;
3059   data[22] = 32;
3060   data[23] = 32;
3061   memset(&data[24], 0, 100-24);
3062   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
3063   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3064 #ifndef SQLITE_OMIT_AUTOVACUUM
3065   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
3066   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
3067   put4byte(&data[36 + 4*4], pBt->autoVacuum);
3068   put4byte(&data[36 + 7*4], pBt->incrVacuum);
3069 #endif
3070   pBt->nPage = 1;
3071   data[31] = 1;
3072   return SQLITE_OK;
3073 }
3074 
3075 /*
3076 ** Initialize the first page of the database file (creating a database
3077 ** consisting of a single page and no schema objects). Return SQLITE_OK
3078 ** if successful, or an SQLite error code otherwise.
3079 */
3080 int sqlite3BtreeNewDb(Btree *p){
3081   int rc;
3082   sqlite3BtreeEnter(p);
3083   p->pBt->nPage = 0;
3084   rc = newDatabase(p->pBt);
3085   sqlite3BtreeLeave(p);
3086   return rc;
3087 }
3088 
3089 /*
3090 ** Attempt to start a new transaction. A write-transaction
3091 ** is started if the second argument is nonzero, otherwise a read-
3092 ** transaction.  If the second argument is 2 or more and exclusive
3093 ** transaction is started, meaning that no other process is allowed
3094 ** to access the database.  A preexisting transaction may not be
3095 ** upgraded to exclusive by calling this routine a second time - the
3096 ** exclusivity flag only works for a new transaction.
3097 **
3098 ** A write-transaction must be started before attempting any
3099 ** changes to the database.  None of the following routines
3100 ** will work unless a transaction is started first:
3101 **
3102 **      sqlite3BtreeCreateTable()
3103 **      sqlite3BtreeCreateIndex()
3104 **      sqlite3BtreeClearTable()
3105 **      sqlite3BtreeDropTable()
3106 **      sqlite3BtreeInsert()
3107 **      sqlite3BtreeDelete()
3108 **      sqlite3BtreeUpdateMeta()
3109 **
3110 ** If an initial attempt to acquire the lock fails because of lock contention
3111 ** and the database was previously unlocked, then invoke the busy handler
3112 ** if there is one.  But if there was previously a read-lock, do not
3113 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
3114 ** returned when there is already a read-lock in order to avoid a deadlock.
3115 **
3116 ** Suppose there are two processes A and B.  A has a read lock and B has
3117 ** a reserved lock.  B tries to promote to exclusive but is blocked because
3118 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
3119 ** One or the other of the two processes must give way or there can be
3120 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
3121 ** when A already has a read lock, we encourage A to give up and let B
3122 ** proceed.
3123 */
3124 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
3125   BtShared *pBt = p->pBt;
3126   int rc = SQLITE_OK;
3127 
3128   sqlite3BtreeEnter(p);
3129   btreeIntegrity(p);
3130 
3131   /* If the btree is already in a write-transaction, or it
3132   ** is already in a read-transaction and a read-transaction
3133   ** is requested, this is a no-op.
3134   */
3135   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
3136     goto trans_begun;
3137   }
3138   assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );
3139 
3140   /* Write transactions are not possible on a read-only database */
3141   if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
3142     rc = SQLITE_READONLY;
3143     goto trans_begun;
3144   }
3145 
3146 #ifndef SQLITE_OMIT_SHARED_CACHE
3147   {
3148     sqlite3 *pBlock = 0;
3149     /* If another database handle has already opened a write transaction
3150     ** on this shared-btree structure and a second write transaction is
3151     ** requested, return SQLITE_LOCKED.
3152     */
3153     if( (wrflag && pBt->inTransaction==TRANS_WRITE)
3154      || (pBt->btsFlags & BTS_PENDING)!=0
3155     ){
3156       pBlock = pBt->pWriter->db;
3157     }else if( wrflag>1 ){
3158       BtLock *pIter;
3159       for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
3160         if( pIter->pBtree!=p ){
3161           pBlock = pIter->pBtree->db;
3162           break;
3163         }
3164       }
3165     }
3166     if( pBlock ){
3167       sqlite3ConnectionBlocked(p->db, pBlock);
3168       rc = SQLITE_LOCKED_SHAREDCACHE;
3169       goto trans_begun;
3170     }
3171   }
3172 #endif
3173 
3174   /* Any read-only or read-write transaction implies a read-lock on
3175   ** page 1. So if some other shared-cache client already has a write-lock
3176   ** on page 1, the transaction cannot be opened. */
3177   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
3178   if( SQLITE_OK!=rc ) goto trans_begun;
3179 
3180   pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
3181   if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
3182   do {
3183     /* Call lockBtree() until either pBt->pPage1 is populated or
3184     ** lockBtree() returns something other than SQLITE_OK. lockBtree()
3185     ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
3186     ** reading page 1 it discovers that the page-size of the database
3187     ** file is not pBt->pageSize. In this case lockBtree() will update
3188     ** pBt->pageSize to the page-size of the file on disk.
3189     */
3190     while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
3191 
3192     if( rc==SQLITE_OK && wrflag ){
3193       if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
3194         rc = SQLITE_READONLY;
3195       }else{
3196         rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
3197         if( rc==SQLITE_OK ){
3198           rc = newDatabase(pBt);
3199         }
3200       }
3201     }
3202 
3203     if( rc!=SQLITE_OK ){
3204       unlockBtreeIfUnused(pBt);
3205     }
3206   }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
3207           btreeInvokeBusyHandler(pBt) );
3208 
3209   if( rc==SQLITE_OK ){
3210     if( p->inTrans==TRANS_NONE ){
3211       pBt->nTransaction++;
3212 #ifndef SQLITE_OMIT_SHARED_CACHE
3213       if( p->sharable ){
3214         assert( p->lock.pBtree==p && p->lock.iTable==1 );
3215         p->lock.eLock = READ_LOCK;
3216         p->lock.pNext = pBt->pLock;
3217         pBt->pLock = &p->lock;
3218       }
3219 #endif
3220     }
3221     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
3222     if( p->inTrans>pBt->inTransaction ){
3223       pBt->inTransaction = p->inTrans;
3224     }
3225     if( wrflag ){
3226       MemPage *pPage1 = pBt->pPage1;
3227 #ifndef SQLITE_OMIT_SHARED_CACHE
3228       assert( !pBt->pWriter );
3229       pBt->pWriter = p;
3230       pBt->btsFlags &= ~BTS_EXCLUSIVE;
3231       if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
3232 #endif
3233 
3234       /* If the db-size header field is incorrect (as it may be if an old
3235       ** client has been writing the database file), update it now. Doing
3236       ** this sooner rather than later means the database size can safely
3237       ** re-read the database size from page 1 if a savepoint or transaction
3238       ** rollback occurs within the transaction.
3239       */
3240       if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
3241         rc = sqlite3PagerWrite(pPage1->pDbPage);
3242         if( rc==SQLITE_OK ){
3243           put4byte(&pPage1->aData[28], pBt->nPage);
3244         }
3245       }
3246     }
3247   }
3248 
3249 
3250 trans_begun:
3251   if( rc==SQLITE_OK && wrflag ){
3252     /* This call makes sure that the pager has the correct number of
3253     ** open savepoints. If the second parameter is greater than 0 and
3254     ** the sub-journal is not already open, then it will be opened here.
3255     */
3256     rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
3257   }
3258 
3259   btreeIntegrity(p);
3260   sqlite3BtreeLeave(p);
3261   return rc;
3262 }
3263 
3264 #ifndef SQLITE_OMIT_AUTOVACUUM
3265 
3266 /*
3267 ** Set the pointer-map entries for all children of page pPage. Also, if
3268 ** pPage contains cells that point to overflow pages, set the pointer
3269 ** map entries for the overflow pages as well.
3270 */
3271 static int setChildPtrmaps(MemPage *pPage){
3272   int i;                             /* Counter variable */
3273   int nCell;                         /* Number of cells in page pPage */
3274   int rc;                            /* Return code */
3275   BtShared *pBt = pPage->pBt;
3276   u8 isInitOrig = pPage->isInit;
3277   Pgno pgno = pPage->pgno;
3278 
3279   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3280   rc = btreeInitPage(pPage);
3281   if( rc!=SQLITE_OK ){
3282     goto set_child_ptrmaps_out;
3283   }
3284   nCell = pPage->nCell;
3285 
3286   for(i=0; i<nCell; i++){
3287     u8 *pCell = findCell(pPage, i);
3288 
3289     ptrmapPutOvflPtr(pPage, pCell, &rc);
3290 
3291     if( !pPage->leaf ){
3292       Pgno childPgno = get4byte(pCell);
3293       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3294     }
3295   }
3296 
3297   if( !pPage->leaf ){
3298     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3299     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3300   }
3301 
3302 set_child_ptrmaps_out:
3303   pPage->isInit = isInitOrig;
3304   return rc;
3305 }
3306 
3307 /*
3308 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
3309 ** that it points to iTo. Parameter eType describes the type of pointer to
3310 ** be modified, as  follows:
3311 **
3312 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
3313 **                   page of pPage.
3314 **
3315 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
3316 **                   page pointed to by one of the cells on pPage.
3317 **
3318 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
3319 **                   overflow page in the list.
3320 */
3321 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
3322   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3323   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
3324   if( eType==PTRMAP_OVERFLOW2 ){
3325     /* The pointer is always the first 4 bytes of the page in this case.  */
3326     if( get4byte(pPage->aData)!=iFrom ){
3327       return SQLITE_CORRUPT_BKPT;
3328     }
3329     put4byte(pPage->aData, iTo);
3330   }else{
3331     u8 isInitOrig = pPage->isInit;
3332     int i;
3333     int nCell;
3334     int rc;
3335 
3336     rc = btreeInitPage(pPage);
3337     if( rc ) return rc;
3338     nCell = pPage->nCell;
3339 
3340     for(i=0; i<nCell; i++){
3341       u8 *pCell = findCell(pPage, i);
3342       if( eType==PTRMAP_OVERFLOW1 ){
3343         CellInfo info;
3344         pPage->xParseCell(pPage, pCell, &info);
3345         if( info.nLocal<info.nPayload
3346          && pCell+info.nSize-1<=pPage->aData+pPage->maskPage
3347          && iFrom==get4byte(pCell+info.nSize-4)
3348         ){
3349           put4byte(pCell+info.nSize-4, iTo);
3350           break;
3351         }
3352       }else{
3353         if( get4byte(pCell)==iFrom ){
3354           put4byte(pCell, iTo);
3355           break;
3356         }
3357       }
3358     }
3359 
3360     if( i==nCell ){
3361       if( eType!=PTRMAP_BTREE ||
3362           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
3363         return SQLITE_CORRUPT_BKPT;
3364       }
3365       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
3366     }
3367 
3368     pPage->isInit = isInitOrig;
3369   }
3370   return SQLITE_OK;
3371 }
3372 
3373 
3374 /*
3375 ** Move the open database page pDbPage to location iFreePage in the
3376 ** database. The pDbPage reference remains valid.
3377 **
3378 ** The isCommit flag indicates that there is no need to remember that
3379 ** the journal needs to be sync()ed before database page pDbPage->pgno
3380 ** can be written to. The caller has already promised not to write to that
3381 ** page.
3382 */
3383 static int relocatePage(
3384   BtShared *pBt,           /* Btree */
3385   MemPage *pDbPage,        /* Open page to move */
3386   u8 eType,                /* Pointer map 'type' entry for pDbPage */
3387   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
3388   Pgno iFreePage,          /* The location to move pDbPage to */
3389   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
3390 ){
3391   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
3392   Pgno iDbPage = pDbPage->pgno;
3393   Pager *pPager = pBt->pPager;
3394   int rc;
3395 
3396   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
3397       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
3398   assert( sqlite3_mutex_held(pBt->mutex) );
3399   assert( pDbPage->pBt==pBt );
3400 
3401   /* Move page iDbPage from its current location to page number iFreePage */
3402   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
3403       iDbPage, iFreePage, iPtrPage, eType));
3404   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
3405   if( rc!=SQLITE_OK ){
3406     return rc;
3407   }
3408   pDbPage->pgno = iFreePage;
3409 
3410   /* If pDbPage was a btree-page, then it may have child pages and/or cells
3411   ** that point to overflow pages. The pointer map entries for all these
3412   ** pages need to be changed.
3413   **
3414   ** If pDbPage is an overflow page, then the first 4 bytes may store a
3415   ** pointer to a subsequent overflow page. If this is the case, then
3416   ** the pointer map needs to be updated for the subsequent overflow page.
3417   */
3418   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
3419     rc = setChildPtrmaps(pDbPage);
3420     if( rc!=SQLITE_OK ){
3421       return rc;
3422     }
3423   }else{
3424     Pgno nextOvfl = get4byte(pDbPage->aData);
3425     if( nextOvfl!=0 ){
3426       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
3427       if( rc!=SQLITE_OK ){
3428         return rc;
3429       }
3430     }
3431   }
3432 
3433   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
3434   ** that it points at iFreePage. Also fix the pointer map entry for
3435   ** iPtrPage.
3436   */
3437   if( eType!=PTRMAP_ROOTPAGE ){
3438     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
3439     if( rc!=SQLITE_OK ){
3440       return rc;
3441     }
3442     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
3443     if( rc!=SQLITE_OK ){
3444       releasePage(pPtrPage);
3445       return rc;
3446     }
3447     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
3448     releasePage(pPtrPage);
3449     if( rc==SQLITE_OK ){
3450       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
3451     }
3452   }
3453   return rc;
3454 }
3455 
3456 /* Forward declaration required by incrVacuumStep(). */
3457 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
3458 
3459 /*
3460 ** Perform a single step of an incremental-vacuum. If successful, return
3461 ** SQLITE_OK. If there is no work to do (and therefore no point in
3462 ** calling this function again), return SQLITE_DONE. Or, if an error
3463 ** occurs, return some other error code.
3464 **
3465 ** More specifically, this function attempts to re-organize the database so
3466 ** that the last page of the file currently in use is no longer in use.
3467 **
3468 ** Parameter nFin is the number of pages that this database would contain
3469 ** were this function called until it returns SQLITE_DONE.
3470 **
3471 ** If the bCommit parameter is non-zero, this function assumes that the
3472 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
3473 ** or an error. bCommit is passed true for an auto-vacuum-on-commit
3474 ** operation, or false for an incremental vacuum.
3475 */
3476 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
3477   Pgno nFreeList;           /* Number of pages still on the free-list */
3478   int rc;
3479 
3480   assert( sqlite3_mutex_held(pBt->mutex) );
3481   assert( iLastPg>nFin );
3482 
3483   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
3484     u8 eType;
3485     Pgno iPtrPage;
3486 
3487     nFreeList = get4byte(&pBt->pPage1->aData[36]);
3488     if( nFreeList==0 ){
3489       return SQLITE_DONE;
3490     }
3491 
3492     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
3493     if( rc!=SQLITE_OK ){
3494       return rc;
3495     }
3496     if( eType==PTRMAP_ROOTPAGE ){
3497       return SQLITE_CORRUPT_BKPT;
3498     }
3499 
3500     if( eType==PTRMAP_FREEPAGE ){
3501       if( bCommit==0 ){
3502         /* Remove the page from the files free-list. This is not required
3503         ** if bCommit is non-zero. In that case, the free-list will be
3504         ** truncated to zero after this function returns, so it doesn't
3505         ** matter if it still contains some garbage entries.
3506         */
3507         Pgno iFreePg;
3508         MemPage *pFreePg;
3509         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
3510         if( rc!=SQLITE_OK ){
3511           return rc;
3512         }
3513         assert( iFreePg==iLastPg );
3514         releasePage(pFreePg);
3515       }
3516     } else {
3517       Pgno iFreePg;             /* Index of free page to move pLastPg to */
3518       MemPage *pLastPg;
3519       u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
3520       Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
3521 
3522       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
3523       if( rc!=SQLITE_OK ){
3524         return rc;
3525       }
3526 
3527       /* If bCommit is zero, this loop runs exactly once and page pLastPg
3528       ** is swapped with the first free page pulled off the free list.
3529       **
3530       ** On the other hand, if bCommit is greater than zero, then keep
3531       ** looping until a free-page located within the first nFin pages
3532       ** of the file is found.
3533       */
3534       if( bCommit==0 ){
3535         eMode = BTALLOC_LE;
3536         iNear = nFin;
3537       }
3538       do {
3539         MemPage *pFreePg;
3540         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
3541         if( rc!=SQLITE_OK ){
3542           releasePage(pLastPg);
3543           return rc;
3544         }
3545         releasePage(pFreePg);
3546       }while( bCommit && iFreePg>nFin );
3547       assert( iFreePg<iLastPg );
3548 
3549       rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
3550       releasePage(pLastPg);
3551       if( rc!=SQLITE_OK ){
3552         return rc;
3553       }
3554     }
3555   }
3556 
3557   if( bCommit==0 ){
3558     do {
3559       iLastPg--;
3560     }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
3561     pBt->bDoTruncate = 1;
3562     pBt->nPage = iLastPg;
3563   }
3564   return SQLITE_OK;
3565 }
3566 
3567 /*
3568 ** The database opened by the first argument is an auto-vacuum database
3569 ** nOrig pages in size containing nFree free pages. Return the expected
3570 ** size of the database in pages following an auto-vacuum operation.
3571 */
3572 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
3573   int nEntry;                     /* Number of entries on one ptrmap page */
3574   Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
3575   Pgno nFin;                      /* Return value */
3576 
3577   nEntry = pBt->usableSize/5;
3578   nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3579   nFin = nOrig - nFree - nPtrmap;
3580   if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3581     nFin--;
3582   }
3583   while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3584     nFin--;
3585   }
3586 
3587   return nFin;
3588 }
3589 
3590 /*
3591 ** A write-transaction must be opened before calling this function.
3592 ** It performs a single unit of work towards an incremental vacuum.
3593 **
3594 ** If the incremental vacuum is finished after this function has run,
3595 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3596 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3597 */
3598 int sqlite3BtreeIncrVacuum(Btree *p){
3599   int rc;
3600   BtShared *pBt = p->pBt;
3601 
3602   sqlite3BtreeEnter(p);
3603   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3604   if( !pBt->autoVacuum ){
3605     rc = SQLITE_DONE;
3606   }else{
3607     Pgno nOrig = btreePagecount(pBt);
3608     Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
3609     Pgno nFin = finalDbSize(pBt, nOrig, nFree);
3610 
3611     if( nOrig<nFin ){
3612       rc = SQLITE_CORRUPT_BKPT;
3613     }else if( nFree>0 ){
3614       rc = saveAllCursors(pBt, 0, 0);
3615       if( rc==SQLITE_OK ){
3616         invalidateAllOverflowCache(pBt);
3617         rc = incrVacuumStep(pBt, nFin, nOrig, 0);
3618       }
3619       if( rc==SQLITE_OK ){
3620         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3621         put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3622       }
3623     }else{
3624       rc = SQLITE_DONE;
3625     }
3626   }
3627   sqlite3BtreeLeave(p);
3628   return rc;
3629 }
3630 
3631 /*
3632 ** This routine is called prior to sqlite3PagerCommit when a transaction
3633 ** is committed for an auto-vacuum database.
3634 **
3635 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3636 ** the database file should be truncated to during the commit process.
3637 ** i.e. the database has been reorganized so that only the first *pnTrunc
3638 ** pages are in use.
3639 */
3640 static int autoVacuumCommit(BtShared *pBt){
3641   int rc = SQLITE_OK;
3642   Pager *pPager = pBt->pPager;
3643   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
3644 
3645   assert( sqlite3_mutex_held(pBt->mutex) );
3646   invalidateAllOverflowCache(pBt);
3647   assert(pBt->autoVacuum);
3648   if( !pBt->incrVacuum ){
3649     Pgno nFin;         /* Number of pages in database after autovacuuming */
3650     Pgno nFree;        /* Number of pages on the freelist initially */
3651     Pgno iFree;        /* The next page to be freed */
3652     Pgno nOrig;        /* Database size before freeing */
3653 
3654     nOrig = btreePagecount(pBt);
3655     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3656       /* It is not possible to create a database for which the final page
3657       ** is either a pointer-map page or the pending-byte page. If one
3658       ** is encountered, this indicates corruption.
3659       */
3660       return SQLITE_CORRUPT_BKPT;
3661     }
3662 
3663     nFree = get4byte(&pBt->pPage1->aData[36]);
3664     nFin = finalDbSize(pBt, nOrig, nFree);
3665     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3666     if( nFin<nOrig ){
3667       rc = saveAllCursors(pBt, 0, 0);
3668     }
3669     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3670       rc = incrVacuumStep(pBt, nFin, iFree, 1);
3671     }
3672     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3673       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3674       put4byte(&pBt->pPage1->aData[32], 0);
3675       put4byte(&pBt->pPage1->aData[36], 0);
3676       put4byte(&pBt->pPage1->aData[28], nFin);
3677       pBt->bDoTruncate = 1;
3678       pBt->nPage = nFin;
3679     }
3680     if( rc!=SQLITE_OK ){
3681       sqlite3PagerRollback(pPager);
3682     }
3683   }
3684 
3685   assert( nRef>=sqlite3PagerRefcount(pPager) );
3686   return rc;
3687 }
3688 
3689 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3690 # define setChildPtrmaps(x) SQLITE_OK
3691 #endif
3692 
3693 /*
3694 ** This routine does the first phase of a two-phase commit.  This routine
3695 ** causes a rollback journal to be created (if it does not already exist)
3696 ** and populated with enough information so that if a power loss occurs
3697 ** the database can be restored to its original state by playing back
3698 ** the journal.  Then the contents of the journal are flushed out to
3699 ** the disk.  After the journal is safely on oxide, the changes to the
3700 ** database are written into the database file and flushed to oxide.
3701 ** At the end of this call, the rollback journal still exists on the
3702 ** disk and we are still holding all locks, so the transaction has not
3703 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3704 ** commit process.
3705 **
3706 ** This call is a no-op if no write-transaction is currently active on pBt.
3707 **
3708 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3709 ** the name of a master journal file that should be written into the
3710 ** individual journal file, or is NULL, indicating no master journal file
3711 ** (single database transaction).
3712 **
3713 ** When this is called, the master journal should already have been
3714 ** created, populated with this journal pointer and synced to disk.
3715 **
3716 ** Once this is routine has returned, the only thing required to commit
3717 ** the write-transaction for this database file is to delete the journal.
3718 */
3719 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3720   int rc = SQLITE_OK;
3721   if( p->inTrans==TRANS_WRITE ){
3722     BtShared *pBt = p->pBt;
3723     sqlite3BtreeEnter(p);
3724 #ifndef SQLITE_OMIT_AUTOVACUUM
3725     if( pBt->autoVacuum ){
3726       rc = autoVacuumCommit(pBt);
3727       if( rc!=SQLITE_OK ){
3728         sqlite3BtreeLeave(p);
3729         return rc;
3730       }
3731     }
3732     if( pBt->bDoTruncate ){
3733       sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
3734     }
3735 #endif
3736     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3737     sqlite3BtreeLeave(p);
3738   }
3739   return rc;
3740 }
3741 
3742 /*
3743 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3744 ** at the conclusion of a transaction.
3745 */
3746 static void btreeEndTransaction(Btree *p){
3747   BtShared *pBt = p->pBt;
3748   sqlite3 *db = p->db;
3749   assert( sqlite3BtreeHoldsMutex(p) );
3750 
3751 #ifndef SQLITE_OMIT_AUTOVACUUM
3752   pBt->bDoTruncate = 0;
3753 #endif
3754   if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
3755     /* If there are other active statements that belong to this database
3756     ** handle, downgrade to a read-only transaction. The other statements
3757     ** may still be reading from the database.  */
3758     downgradeAllSharedCacheTableLocks(p);
3759     p->inTrans = TRANS_READ;
3760   }else{
3761     /* If the handle had any kind of transaction open, decrement the
3762     ** transaction count of the shared btree. If the transaction count
3763     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
3764     ** call below will unlock the pager.  */
3765     if( p->inTrans!=TRANS_NONE ){
3766       clearAllSharedCacheTableLocks(p);
3767       pBt->nTransaction--;
3768       if( 0==pBt->nTransaction ){
3769         pBt->inTransaction = TRANS_NONE;
3770       }
3771     }
3772 
3773     /* Set the current transaction state to TRANS_NONE and unlock the
3774     ** pager if this call closed the only read or write transaction.  */
3775     p->inTrans = TRANS_NONE;
3776     unlockBtreeIfUnused(pBt);
3777   }
3778 
3779   btreeIntegrity(p);
3780 }
3781 
3782 /*
3783 ** Commit the transaction currently in progress.
3784 **
3785 ** This routine implements the second phase of a 2-phase commit.  The
3786 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3787 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
3788 ** routine did all the work of writing information out to disk and flushing the
3789 ** contents so that they are written onto the disk platter.  All this
3790 ** routine has to do is delete or truncate or zero the header in the
3791 ** the rollback journal (which causes the transaction to commit) and
3792 ** drop locks.
3793 **
3794 ** Normally, if an error occurs while the pager layer is attempting to
3795 ** finalize the underlying journal file, this function returns an error and
3796 ** the upper layer will attempt a rollback. However, if the second argument
3797 ** is non-zero then this b-tree transaction is part of a multi-file
3798 ** transaction. In this case, the transaction has already been committed
3799 ** (by deleting a master journal file) and the caller will ignore this
3800 ** functions return code. So, even if an error occurs in the pager layer,
3801 ** reset the b-tree objects internal state to indicate that the write
3802 ** transaction has been closed. This is quite safe, as the pager will have
3803 ** transitioned to the error state.
3804 **
3805 ** This will release the write lock on the database file.  If there
3806 ** are no active cursors, it also releases the read lock.
3807 */
3808 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
3809 
3810   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
3811   sqlite3BtreeEnter(p);
3812   btreeIntegrity(p);
3813 
3814   /* If the handle has a write-transaction open, commit the shared-btrees
3815   ** transaction and set the shared state to TRANS_READ.
3816   */
3817   if( p->inTrans==TRANS_WRITE ){
3818     int rc;
3819     BtShared *pBt = p->pBt;
3820     assert( pBt->inTransaction==TRANS_WRITE );
3821     assert( pBt->nTransaction>0 );
3822     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3823     if( rc!=SQLITE_OK && bCleanup==0 ){
3824       sqlite3BtreeLeave(p);
3825       return rc;
3826     }
3827     p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
3828     pBt->inTransaction = TRANS_READ;
3829     btreeClearHasContent(pBt);
3830   }
3831 
3832   btreeEndTransaction(p);
3833   sqlite3BtreeLeave(p);
3834   return SQLITE_OK;
3835 }
3836 
3837 /*
3838 ** Do both phases of a commit.
3839 */
3840 int sqlite3BtreeCommit(Btree *p){
3841   int rc;
3842   sqlite3BtreeEnter(p);
3843   rc = sqlite3BtreeCommitPhaseOne(p, 0);
3844   if( rc==SQLITE_OK ){
3845     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
3846   }
3847   sqlite3BtreeLeave(p);
3848   return rc;
3849 }
3850 
3851 /*
3852 ** This routine sets the state to CURSOR_FAULT and the error
3853 ** code to errCode for every cursor on any BtShared that pBtree
3854 ** references.  Or if the writeOnly flag is set to 1, then only
3855 ** trip write cursors and leave read cursors unchanged.
3856 **
3857 ** Every cursor is a candidate to be tripped, including cursors
3858 ** that belong to other database connections that happen to be
3859 ** sharing the cache with pBtree.
3860 **
3861 ** This routine gets called when a rollback occurs. If the writeOnly
3862 ** flag is true, then only write-cursors need be tripped - read-only
3863 ** cursors save their current positions so that they may continue
3864 ** following the rollback. Or, if writeOnly is false, all cursors are
3865 ** tripped. In general, writeOnly is false if the transaction being
3866 ** rolled back modified the database schema. In this case b-tree root
3867 ** pages may be moved or deleted from the database altogether, making
3868 ** it unsafe for read cursors to continue.
3869 **
3870 ** If the writeOnly flag is true and an error is encountered while
3871 ** saving the current position of a read-only cursor, all cursors,
3872 ** including all read-cursors are tripped.
3873 **
3874 ** SQLITE_OK is returned if successful, or if an error occurs while
3875 ** saving a cursor position, an SQLite error code.
3876 */
3877 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
3878   BtCursor *p;
3879   int rc = SQLITE_OK;
3880 
3881   assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
3882   if( pBtree ){
3883     sqlite3BtreeEnter(pBtree);
3884     for(p=pBtree->pBt->pCursor; p; p=p->pNext){
3885       int i;
3886       if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
3887         if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
3888           rc = saveCursorPosition(p);
3889           if( rc!=SQLITE_OK ){
3890             (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
3891             break;
3892           }
3893         }
3894       }else{
3895         sqlite3BtreeClearCursor(p);
3896         p->eState = CURSOR_FAULT;
3897         p->skipNext = errCode;
3898       }
3899       for(i=0; i<=p->iPage; i++){
3900         releasePage(p->apPage[i]);
3901         p->apPage[i] = 0;
3902       }
3903     }
3904     sqlite3BtreeLeave(pBtree);
3905   }
3906   return rc;
3907 }
3908 
3909 /*
3910 ** Rollback the transaction in progress.
3911 **
3912 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
3913 ** Only write cursors are tripped if writeOnly is true but all cursors are
3914 ** tripped if writeOnly is false.  Any attempt to use
3915 ** a tripped cursor will result in an error.
3916 **
3917 ** This will release the write lock on the database file.  If there
3918 ** are no active cursors, it also releases the read lock.
3919 */
3920 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
3921   int rc;
3922   BtShared *pBt = p->pBt;
3923   MemPage *pPage1;
3924 
3925   assert( writeOnly==1 || writeOnly==0 );
3926   assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
3927   sqlite3BtreeEnter(p);
3928   if( tripCode==SQLITE_OK ){
3929     rc = tripCode = saveAllCursors(pBt, 0, 0);
3930     if( rc ) writeOnly = 0;
3931   }else{
3932     rc = SQLITE_OK;
3933   }
3934   if( tripCode ){
3935     int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
3936     assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
3937     if( rc2!=SQLITE_OK ) rc = rc2;
3938   }
3939   btreeIntegrity(p);
3940 
3941   if( p->inTrans==TRANS_WRITE ){
3942     int rc2;
3943 
3944     assert( TRANS_WRITE==pBt->inTransaction );
3945     rc2 = sqlite3PagerRollback(pBt->pPager);
3946     if( rc2!=SQLITE_OK ){
3947       rc = rc2;
3948     }
3949 
3950     /* The rollback may have destroyed the pPage1->aData value.  So
3951     ** call btreeGetPage() on page 1 again to make
3952     ** sure pPage1->aData is set correctly. */
3953     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
3954       int nPage = get4byte(28+(u8*)pPage1->aData);
3955       testcase( nPage==0 );
3956       if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
3957       testcase( pBt->nPage!=nPage );
3958       pBt->nPage = nPage;
3959       releasePage(pPage1);
3960     }
3961     assert( countValidCursors(pBt, 1)==0 );
3962     pBt->inTransaction = TRANS_READ;
3963     btreeClearHasContent(pBt);
3964   }
3965 
3966   btreeEndTransaction(p);
3967   sqlite3BtreeLeave(p);
3968   return rc;
3969 }
3970 
3971 /*
3972 ** Start a statement subtransaction. The subtransaction can be rolled
3973 ** back independently of the main transaction. You must start a transaction
3974 ** before starting a subtransaction. The subtransaction is ended automatically
3975 ** if the main transaction commits or rolls back.
3976 **
3977 ** Statement subtransactions are used around individual SQL statements
3978 ** that are contained within a BEGIN...COMMIT block.  If a constraint
3979 ** error occurs within the statement, the effect of that one statement
3980 ** can be rolled back without having to rollback the entire transaction.
3981 **
3982 ** A statement sub-transaction is implemented as an anonymous savepoint. The
3983 ** value passed as the second parameter is the total number of savepoints,
3984 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3985 ** are no active savepoints and no other statement-transactions open,
3986 ** iStatement is 1. This anonymous savepoint can be released or rolled back
3987 ** using the sqlite3BtreeSavepoint() function.
3988 */
3989 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
3990   int rc;
3991   BtShared *pBt = p->pBt;
3992   sqlite3BtreeEnter(p);
3993   assert( p->inTrans==TRANS_WRITE );
3994   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
3995   assert( iStatement>0 );
3996   assert( iStatement>p->db->nSavepoint );
3997   assert( pBt->inTransaction==TRANS_WRITE );
3998   /* At the pager level, a statement transaction is a savepoint with
3999   ** an index greater than all savepoints created explicitly using
4000   ** SQL statements. It is illegal to open, release or rollback any
4001   ** such savepoints while the statement transaction savepoint is active.
4002   */
4003   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
4004   sqlite3BtreeLeave(p);
4005   return rc;
4006 }
4007 
4008 /*
4009 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
4010 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
4011 ** savepoint identified by parameter iSavepoint, depending on the value
4012 ** of op.
4013 **
4014 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
4015 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
4016 ** contents of the entire transaction are rolled back. This is different
4017 ** from a normal transaction rollback, as no locks are released and the
4018 ** transaction remains open.
4019 */
4020 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
4021   int rc = SQLITE_OK;
4022   if( p && p->inTrans==TRANS_WRITE ){
4023     BtShared *pBt = p->pBt;
4024     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
4025     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
4026     sqlite3BtreeEnter(p);
4027     rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
4028     if( rc==SQLITE_OK ){
4029       if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
4030         pBt->nPage = 0;
4031       }
4032       rc = newDatabase(pBt);
4033       pBt->nPage = get4byte(28 + pBt->pPage1->aData);
4034 
4035       /* The database size was written into the offset 28 of the header
4036       ** when the transaction started, so we know that the value at offset
4037       ** 28 is nonzero. */
4038       assert( pBt->nPage>0 );
4039     }
4040     sqlite3BtreeLeave(p);
4041   }
4042   return rc;
4043 }
4044 
4045 /*
4046 ** Create a new cursor for the BTree whose root is on the page
4047 ** iTable. If a read-only cursor is requested, it is assumed that
4048 ** the caller already has at least a read-only transaction open
4049 ** on the database already. If a write-cursor is requested, then
4050 ** the caller is assumed to have an open write transaction.
4051 **
4052 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
4053 ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
4054 ** can be used for reading or for writing if other conditions for writing
4055 ** are also met.  These are the conditions that must be met in order
4056 ** for writing to be allowed:
4057 **
4058 ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
4059 **
4060 ** 2:  Other database connections that share the same pager cache
4061 **     but which are not in the READ_UNCOMMITTED state may not have
4062 **     cursors open with wrFlag==0 on the same table.  Otherwise
4063 **     the changes made by this write cursor would be visible to
4064 **     the read cursors in the other database connection.
4065 **
4066 ** 3:  The database must be writable (not on read-only media)
4067 **
4068 ** 4:  There must be an active transaction.
4069 **
4070 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
4071 ** is set.  If FORDELETE is set, that is a hint to the implementation that
4072 ** this cursor will only be used to seek to and delete entries of an index
4073 ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
4074 ** this implementation.  But in a hypothetical alternative storage engine
4075 ** in which index entries are automatically deleted when corresponding table
4076 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
4077 ** operations on this cursor can be no-ops and all READ operations can
4078 ** return a null row (2-bytes: 0x01 0x00).
4079 **
4080 ** No checking is done to make sure that page iTable really is the
4081 ** root page of a b-tree.  If it is not, then the cursor acquired
4082 ** will not work correctly.
4083 **
4084 ** It is assumed that the sqlite3BtreeCursorZero() has been called
4085 ** on pCur to initialize the memory space prior to invoking this routine.
4086 */
4087 static int btreeCursor(
4088   Btree *p,                              /* The btree */
4089   int iTable,                            /* Root page of table to open */
4090   int wrFlag,                            /* 1 to write. 0 read-only */
4091   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
4092   BtCursor *pCur                         /* Space for new cursor */
4093 ){
4094   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
4095   BtCursor *pX;                          /* Looping over other all cursors */
4096 
4097   assert( sqlite3BtreeHoldsMutex(p) );
4098   assert( wrFlag==0
4099        || wrFlag==BTREE_WRCSR
4100        || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE)
4101   );
4102 
4103   /* The following assert statements verify that if this is a sharable
4104   ** b-tree database, the connection is holding the required table locks,
4105   ** and that no other connection has any open cursor that conflicts with
4106   ** this lock.  */
4107   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) );
4108   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
4109 
4110   /* Assert that the caller has opened the required transaction. */
4111   assert( p->inTrans>TRANS_NONE );
4112   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
4113   assert( pBt->pPage1 && pBt->pPage1->aData );
4114   assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
4115 
4116   if( wrFlag ){
4117     allocateTempSpace(pBt);
4118     if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
4119   }
4120   if( iTable==1 && btreePagecount(pBt)==0 ){
4121     assert( wrFlag==0 );
4122     iTable = 0;
4123   }
4124 
4125   /* Now that no other errors can occur, finish filling in the BtCursor
4126   ** variables and link the cursor into the BtShared list.  */
4127   pCur->pgnoRoot = (Pgno)iTable;
4128   pCur->iPage = -1;
4129   pCur->pKeyInfo = pKeyInfo;
4130   pCur->pBtree = p;
4131   pCur->pBt = pBt;
4132   pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;
4133   pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
4134   /* If there are two or more cursors on the same btree, then all such
4135   ** cursors *must* have the BTCF_Multiple flag set. */
4136   for(pX=pBt->pCursor; pX; pX=pX->pNext){
4137     if( pX->pgnoRoot==(Pgno)iTable ){
4138       pX->curFlags |= BTCF_Multiple;
4139       pCur->curFlags |= BTCF_Multiple;
4140     }
4141   }
4142   pCur->pNext = pBt->pCursor;
4143   pBt->pCursor = pCur;
4144   pCur->eState = CURSOR_INVALID;
4145   return SQLITE_OK;
4146 }
4147 int sqlite3BtreeCursor(
4148   Btree *p,                                   /* The btree */
4149   int iTable,                                 /* Root page of table to open */
4150   int wrFlag,                                 /* 1 to write. 0 read-only */
4151   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
4152   BtCursor *pCur                              /* Write new cursor here */
4153 ){
4154   int rc;
4155   if( iTable<1 ){
4156     rc = SQLITE_CORRUPT_BKPT;
4157   }else{
4158     sqlite3BtreeEnter(p);
4159     rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4160     sqlite3BtreeLeave(p);
4161   }
4162   return rc;
4163 }
4164 
4165 /*
4166 ** Return the size of a BtCursor object in bytes.
4167 **
4168 ** This interfaces is needed so that users of cursors can preallocate
4169 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
4170 ** to users so they cannot do the sizeof() themselves - they must call
4171 ** this routine.
4172 */
4173 int sqlite3BtreeCursorSize(void){
4174   return ROUND8(sizeof(BtCursor));
4175 }
4176 
4177 /*
4178 ** Initialize memory that will be converted into a BtCursor object.
4179 **
4180 ** The simple approach here would be to memset() the entire object
4181 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
4182 ** do not need to be zeroed and they are large, so we can save a lot
4183 ** of run-time by skipping the initialization of those elements.
4184 */
4185 void sqlite3BtreeCursorZero(BtCursor *p){
4186   memset(p, 0, offsetof(BtCursor, iPage));
4187 }
4188 
4189 /*
4190 ** Close a cursor.  The read lock on the database file is released
4191 ** when the last cursor is closed.
4192 */
4193 int sqlite3BtreeCloseCursor(BtCursor *pCur){
4194   Btree *pBtree = pCur->pBtree;
4195   if( pBtree ){
4196     int i;
4197     BtShared *pBt = pCur->pBt;
4198     sqlite3BtreeEnter(pBtree);
4199     sqlite3BtreeClearCursor(pCur);
4200     assert( pBt->pCursor!=0 );
4201     if( pBt->pCursor==pCur ){
4202       pBt->pCursor = pCur->pNext;
4203     }else{
4204       BtCursor *pPrev = pBt->pCursor;
4205       do{
4206         if( pPrev->pNext==pCur ){
4207           pPrev->pNext = pCur->pNext;
4208           break;
4209         }
4210         pPrev = pPrev->pNext;
4211       }while( ALWAYS(pPrev) );
4212     }
4213     for(i=0; i<=pCur->iPage; i++){
4214       releasePage(pCur->apPage[i]);
4215     }
4216     unlockBtreeIfUnused(pBt);
4217     sqlite3_free(pCur->aOverflow);
4218     /* sqlite3_free(pCur); */
4219     sqlite3BtreeLeave(pBtree);
4220   }
4221   return SQLITE_OK;
4222 }
4223 
4224 /*
4225 ** Make sure the BtCursor* given in the argument has a valid
4226 ** BtCursor.info structure.  If it is not already valid, call
4227 ** btreeParseCell() to fill it in.
4228 **
4229 ** BtCursor.info is a cache of the information in the current cell.
4230 ** Using this cache reduces the number of calls to btreeParseCell().
4231 */
4232 #ifndef NDEBUG
4233   static void assertCellInfo(BtCursor *pCur){
4234     CellInfo info;
4235     int iPage = pCur->iPage;
4236     memset(&info, 0, sizeof(info));
4237     btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
4238     assert( CORRUPT_DB || memcmp(&info, &pCur->info, sizeof(info))==0 );
4239   }
4240 #else
4241   #define assertCellInfo(x)
4242 #endif
4243 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
4244   if( pCur->info.nSize==0 ){
4245     int iPage = pCur->iPage;
4246     pCur->curFlags |= BTCF_ValidNKey;
4247     btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
4248   }else{
4249     assertCellInfo(pCur);
4250   }
4251 }
4252 
4253 #ifndef NDEBUG  /* The next routine used only within assert() statements */
4254 /*
4255 ** Return true if the given BtCursor is valid.  A valid cursor is one
4256 ** that is currently pointing to a row in a (non-empty) table.
4257 ** This is a verification routine is used only within assert() statements.
4258 */
4259 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
4260   return pCur && pCur->eState==CURSOR_VALID;
4261 }
4262 #endif /* NDEBUG */
4263 
4264 /*
4265 ** Set *pSize to the size of the buffer needed to hold the value of
4266 ** the key for the current entry.  If the cursor is not pointing
4267 ** to a valid entry, *pSize is set to 0.
4268 **
4269 ** For a table with the INTKEY flag set, this routine returns the key
4270 ** itself, not the number of bytes in the key.
4271 **
4272 ** The caller must position the cursor prior to invoking this routine.
4273 **
4274 ** This routine cannot fail.  It always returns SQLITE_OK.
4275 */
4276 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
4277   assert( cursorHoldsMutex(pCur) );
4278   assert( pCur->eState==CURSOR_VALID );
4279   getCellInfo(pCur);
4280   *pSize = pCur->info.nKey;
4281   return SQLITE_OK;
4282 }
4283 
4284 /*
4285 ** Set *pSize to the number of bytes of data in the entry the
4286 ** cursor currently points to.
4287 **
4288 ** The caller must guarantee that the cursor is pointing to a non-NULL
4289 ** valid entry.  In other words, the calling procedure must guarantee
4290 ** that the cursor has Cursor.eState==CURSOR_VALID.
4291 **
4292 ** Failure is not possible.  This function always returns SQLITE_OK.
4293 ** It might just as well be a procedure (returning void) but we continue
4294 ** to return an integer result code for historical reasons.
4295 */
4296 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
4297   assert( cursorOwnsBtShared(pCur) );
4298   assert( pCur->eState==CURSOR_VALID );
4299   assert( pCur->iPage>=0 );
4300   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4301   assert( pCur->apPage[pCur->iPage]->intKeyLeaf==1 );
4302   getCellInfo(pCur);
4303   *pSize = pCur->info.nPayload;
4304   return SQLITE_OK;
4305 }
4306 
4307 /*
4308 ** Given the page number of an overflow page in the database (parameter
4309 ** ovfl), this function finds the page number of the next page in the
4310 ** linked list of overflow pages. If possible, it uses the auto-vacuum
4311 ** pointer-map data instead of reading the content of page ovfl to do so.
4312 **
4313 ** If an error occurs an SQLite error code is returned. Otherwise:
4314 **
4315 ** The page number of the next overflow page in the linked list is
4316 ** written to *pPgnoNext. If page ovfl is the last page in its linked
4317 ** list, *pPgnoNext is set to zero.
4318 **
4319 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
4320 ** to page number pOvfl was obtained, then *ppPage is set to point to that
4321 ** reference. It is the responsibility of the caller to call releasePage()
4322 ** on *ppPage to free the reference. In no reference was obtained (because
4323 ** the pointer-map was used to obtain the value for *pPgnoNext), then
4324 ** *ppPage is set to zero.
4325 */
4326 static int getOverflowPage(
4327   BtShared *pBt,               /* The database file */
4328   Pgno ovfl,                   /* Current overflow page number */
4329   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
4330   Pgno *pPgnoNext              /* OUT: Next overflow page number */
4331 ){
4332   Pgno next = 0;
4333   MemPage *pPage = 0;
4334   int rc = SQLITE_OK;
4335 
4336   assert( sqlite3_mutex_held(pBt->mutex) );
4337   assert(pPgnoNext);
4338 
4339 #ifndef SQLITE_OMIT_AUTOVACUUM
4340   /* Try to find the next page in the overflow list using the
4341   ** autovacuum pointer-map pages. Guess that the next page in
4342   ** the overflow list is page number (ovfl+1). If that guess turns
4343   ** out to be wrong, fall back to loading the data of page
4344   ** number ovfl to determine the next page number.
4345   */
4346   if( pBt->autoVacuum ){
4347     Pgno pgno;
4348     Pgno iGuess = ovfl+1;
4349     u8 eType;
4350 
4351     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
4352       iGuess++;
4353     }
4354 
4355     if( iGuess<=btreePagecount(pBt) ){
4356       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
4357       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
4358         next = iGuess;
4359         rc = SQLITE_DONE;
4360       }
4361     }
4362   }
4363 #endif
4364 
4365   assert( next==0 || rc==SQLITE_DONE );
4366   if( rc==SQLITE_OK ){
4367     rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
4368     assert( rc==SQLITE_OK || pPage==0 );
4369     if( rc==SQLITE_OK ){
4370       next = get4byte(pPage->aData);
4371     }
4372   }
4373 
4374   *pPgnoNext = next;
4375   if( ppPage ){
4376     *ppPage = pPage;
4377   }else{
4378     releasePage(pPage);
4379   }
4380   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
4381 }
4382 
4383 /*
4384 ** Copy data from a buffer to a page, or from a page to a buffer.
4385 **
4386 ** pPayload is a pointer to data stored on database page pDbPage.
4387 ** If argument eOp is false, then nByte bytes of data are copied
4388 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
4389 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
4390 ** of data are copied from the buffer pBuf to pPayload.
4391 **
4392 ** SQLITE_OK is returned on success, otherwise an error code.
4393 */
4394 static int copyPayload(
4395   void *pPayload,           /* Pointer to page data */
4396   void *pBuf,               /* Pointer to buffer */
4397   int nByte,                /* Number of bytes to copy */
4398   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
4399   DbPage *pDbPage           /* Page containing pPayload */
4400 ){
4401   if( eOp ){
4402     /* Copy data from buffer to page (a write operation) */
4403     int rc = sqlite3PagerWrite(pDbPage);
4404     if( rc!=SQLITE_OK ){
4405       return rc;
4406     }
4407     memcpy(pPayload, pBuf, nByte);
4408   }else{
4409     /* Copy data from page to buffer (a read operation) */
4410     memcpy(pBuf, pPayload, nByte);
4411   }
4412   return SQLITE_OK;
4413 }
4414 
4415 /*
4416 ** This function is used to read or overwrite payload information
4417 ** for the entry that the pCur cursor is pointing to. The eOp
4418 ** argument is interpreted as follows:
4419 **
4420 **   0: The operation is a read. Populate the overflow cache.
4421 **   1: The operation is a write. Populate the overflow cache.
4422 **   2: The operation is a read. Do not populate the overflow cache.
4423 **
4424 ** A total of "amt" bytes are read or written beginning at "offset".
4425 ** Data is read to or from the buffer pBuf.
4426 **
4427 ** The content being read or written might appear on the main page
4428 ** or be scattered out on multiple overflow pages.
4429 **
4430 ** If the current cursor entry uses one or more overflow pages and the
4431 ** eOp argument is not 2, this function may allocate space for and lazily
4432 ** populates the overflow page-list cache array (BtCursor.aOverflow).
4433 ** Subsequent calls use this cache to make seeking to the supplied offset
4434 ** more efficient.
4435 **
4436 ** Once an overflow page-list cache has been allocated, it may be
4437 ** invalidated if some other cursor writes to the same table, or if
4438 ** the cursor is moved to a different row. Additionally, in auto-vacuum
4439 ** mode, the following events may invalidate an overflow page-list cache.
4440 **
4441 **   * An incremental vacuum,
4442 **   * A commit in auto_vacuum="full" mode,
4443 **   * Creating a table (may require moving an overflow page).
4444 */
4445 static int accessPayload(
4446   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4447   u32 offset,          /* Begin reading this far into payload */
4448   u32 amt,             /* Read this many bytes */
4449   unsigned char *pBuf, /* Write the bytes into this buffer */
4450   int eOp              /* zero to read. non-zero to write. */
4451 ){
4452   unsigned char *aPayload;
4453   int rc = SQLITE_OK;
4454   int iIdx = 0;
4455   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
4456   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
4457 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4458   unsigned char * const pBufStart = pBuf;
4459   int bEnd;                                 /* True if reading to end of data */
4460 #endif
4461 
4462   assert( pPage );
4463   assert( pCur->eState==CURSOR_VALID );
4464   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4465   assert( cursorHoldsMutex(pCur) );
4466   assert( eOp!=2 || offset==0 );    /* Always start from beginning for eOp==2 */
4467 
4468   getCellInfo(pCur);
4469   aPayload = pCur->info.pPayload;
4470 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4471   bEnd = offset+amt==pCur->info.nPayload;
4472 #endif
4473   assert( offset+amt <= pCur->info.nPayload );
4474 
4475   assert( aPayload > pPage->aData );
4476   if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
4477     /* Trying to read or write past the end of the data is an error.  The
4478     ** conditional above is really:
4479     **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
4480     ** but is recast into its current form to avoid integer overflow problems
4481     */
4482     return SQLITE_CORRUPT_BKPT;
4483   }
4484 
4485   /* Check if data must be read/written to/from the btree page itself. */
4486   if( offset<pCur->info.nLocal ){
4487     int a = amt;
4488     if( a+offset>pCur->info.nLocal ){
4489       a = pCur->info.nLocal - offset;
4490     }
4491     rc = copyPayload(&aPayload[offset], pBuf, a, (eOp & 0x01), pPage->pDbPage);
4492     offset = 0;
4493     pBuf += a;
4494     amt -= a;
4495   }else{
4496     offset -= pCur->info.nLocal;
4497   }
4498 
4499 
4500   if( rc==SQLITE_OK && amt>0 ){
4501     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
4502     Pgno nextPage;
4503 
4504     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
4505 
4506     /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
4507     ** Except, do not allocate aOverflow[] for eOp==2.
4508     **
4509     ** The aOverflow[] array is sized at one entry for each overflow page
4510     ** in the overflow chain. The page number of the first overflow page is
4511     ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
4512     ** means "not yet known" (the cache is lazily populated).
4513     */
4514     if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){
4515       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
4516       if( nOvfl>pCur->nOvflAlloc ){
4517         Pgno *aNew = (Pgno*)sqlite3Realloc(
4518             pCur->aOverflow, nOvfl*2*sizeof(Pgno)
4519         );
4520         if( aNew==0 ){
4521           rc = SQLITE_NOMEM_BKPT;
4522         }else{
4523           pCur->nOvflAlloc = nOvfl*2;
4524           pCur->aOverflow = aNew;
4525         }
4526       }
4527       if( rc==SQLITE_OK ){
4528         memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
4529         pCur->curFlags |= BTCF_ValidOvfl;
4530       }
4531     }
4532 
4533     /* If the overflow page-list cache has been allocated and the
4534     ** entry for the first required overflow page is valid, skip
4535     ** directly to it.
4536     */
4537     if( (pCur->curFlags & BTCF_ValidOvfl)!=0
4538      && pCur->aOverflow[offset/ovflSize]
4539     ){
4540       iIdx = (offset/ovflSize);
4541       nextPage = pCur->aOverflow[iIdx];
4542       offset = (offset%ovflSize);
4543     }
4544 
4545     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
4546 
4547       /* If required, populate the overflow page-list cache. */
4548       if( (pCur->curFlags & BTCF_ValidOvfl)!=0 ){
4549         assert( pCur->aOverflow[iIdx]==0
4550                 || pCur->aOverflow[iIdx]==nextPage
4551                 || CORRUPT_DB );
4552         pCur->aOverflow[iIdx] = nextPage;
4553       }
4554 
4555       if( offset>=ovflSize ){
4556         /* The only reason to read this page is to obtain the page
4557         ** number for the next page in the overflow chain. The page
4558         ** data is not required. So first try to lookup the overflow
4559         ** page-list cache, if any, then fall back to the getOverflowPage()
4560         ** function.
4561         **
4562         ** Note that the aOverflow[] array must be allocated because eOp!=2
4563         ** here.  If eOp==2, then offset==0 and this branch is never taken.
4564         */
4565         assert( eOp!=2 );
4566         assert( pCur->curFlags & BTCF_ValidOvfl );
4567         assert( pCur->pBtree->db==pBt->db );
4568         if( pCur->aOverflow[iIdx+1] ){
4569           nextPage = pCur->aOverflow[iIdx+1];
4570         }else{
4571           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
4572         }
4573         offset -= ovflSize;
4574       }else{
4575         /* Need to read this page properly. It contains some of the
4576         ** range of data that is being read (eOp==0) or written (eOp!=0).
4577         */
4578 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4579         sqlite3_file *fd;
4580 #endif
4581         int a = amt;
4582         if( a + offset > ovflSize ){
4583           a = ovflSize - offset;
4584         }
4585 
4586 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4587         /* If all the following are true:
4588         **
4589         **   1) this is a read operation, and
4590         **   2) data is required from the start of this overflow page, and
4591         **   3) the database is file-backed, and
4592         **   4) there is no open write-transaction, and
4593         **   5) the database is not a WAL database,
4594         **   6) all data from the page is being read.
4595         **   7) at least 4 bytes have already been read into the output buffer
4596         **
4597         ** then data can be read directly from the database file into the
4598         ** output buffer, bypassing the page-cache altogether. This speeds
4599         ** up loading large records that span many overflow pages.
4600         */
4601         if( (eOp&0x01)==0                                      /* (1) */
4602          && offset==0                                          /* (2) */
4603          && (bEnd || a==ovflSize)                              /* (6) */
4604          && pBt->inTransaction==TRANS_READ                     /* (4) */
4605          && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (3) */
4606          && pBt->pPage1->aData[19]==0x01                       /* (5) */
4607          && &pBuf[-4]>=pBufStart                               /* (7) */
4608         ){
4609           u8 aSave[4];
4610           u8 *aWrite = &pBuf[-4];
4611           assert( aWrite>=pBufStart );                         /* hence (7) */
4612           memcpy(aSave, aWrite, 4);
4613           rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
4614           nextPage = get4byte(aWrite);
4615           memcpy(aWrite, aSave, 4);
4616         }else
4617 #endif
4618 
4619         {
4620           DbPage *pDbPage;
4621           rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
4622               ((eOp&0x01)==0 ? PAGER_GET_READONLY : 0)
4623           );
4624           if( rc==SQLITE_OK ){
4625             aPayload = sqlite3PagerGetData(pDbPage);
4626             nextPage = get4byte(aPayload);
4627             rc = copyPayload(&aPayload[offset+4], pBuf, a, (eOp&0x01), pDbPage);
4628             sqlite3PagerUnref(pDbPage);
4629             offset = 0;
4630           }
4631         }
4632         amt -= a;
4633         pBuf += a;
4634       }
4635     }
4636   }
4637 
4638   if( rc==SQLITE_OK && amt>0 ){
4639     return SQLITE_CORRUPT_BKPT;
4640   }
4641   return rc;
4642 }
4643 
4644 /*
4645 ** Read part of the key associated with cursor pCur.  Exactly
4646 ** "amt" bytes will be transferred into pBuf[].  The transfer
4647 ** begins at "offset".
4648 **
4649 ** The caller must ensure that pCur is pointing to a valid row
4650 ** in the table.
4651 **
4652 ** Return SQLITE_OK on success or an error code if anything goes
4653 ** wrong.  An error is returned if "offset+amt" is larger than
4654 ** the available payload.
4655 */
4656 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4657   assert( cursorHoldsMutex(pCur) );
4658   assert( pCur->eState==CURSOR_VALID );
4659   assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4660   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4661   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
4662 }
4663 
4664 /*
4665 ** Read part of the data associated with cursor pCur.  Exactly
4666 ** "amt" bytes will be transfered into pBuf[].  The transfer
4667 ** begins at "offset".
4668 **
4669 ** Return SQLITE_OK on success or an error code if anything goes
4670 ** wrong.  An error is returned if "offset+amt" is larger than
4671 ** the available payload.
4672 */
4673 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4674   int rc;
4675 
4676 #ifndef SQLITE_OMIT_INCRBLOB
4677   if ( pCur->eState==CURSOR_INVALID ){
4678     return SQLITE_ABORT;
4679   }
4680 #endif
4681 
4682   assert( cursorOwnsBtShared(pCur) );
4683   rc = restoreCursorPosition(pCur);
4684   if( rc==SQLITE_OK ){
4685     assert( pCur->eState==CURSOR_VALID );
4686     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4687     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4688     rc = accessPayload(pCur, offset, amt, pBuf, 0);
4689   }
4690   return rc;
4691 }
4692 
4693 /*
4694 ** Return a pointer to payload information from the entry that the
4695 ** pCur cursor is pointing to.  The pointer is to the beginning of
4696 ** the key if index btrees (pPage->intKey==0) and is the data for
4697 ** table btrees (pPage->intKey==1). The number of bytes of available
4698 ** key/data is written into *pAmt.  If *pAmt==0, then the value
4699 ** returned will not be a valid pointer.
4700 **
4701 ** This routine is an optimization.  It is common for the entire key
4702 ** and data to fit on the local page and for there to be no overflow
4703 ** pages.  When that is so, this routine can be used to access the
4704 ** key and data without making a copy.  If the key and/or data spills
4705 ** onto overflow pages, then accessPayload() must be used to reassemble
4706 ** the key/data and copy it into a preallocated buffer.
4707 **
4708 ** The pointer returned by this routine looks directly into the cached
4709 ** page of the database.  The data might change or move the next time
4710 ** any btree routine is called.
4711 */
4712 static const void *fetchPayload(
4713   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4714   u32 *pAmt            /* Write the number of available bytes here */
4715 ){
4716   u32 amt;
4717   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
4718   assert( pCur->eState==CURSOR_VALID );
4719   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4720   assert( cursorOwnsBtShared(pCur) );
4721   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4722   assert( pCur->info.nSize>0 );
4723   assert( pCur->info.pPayload>pCur->apPage[pCur->iPage]->aData || CORRUPT_DB );
4724   assert( pCur->info.pPayload<pCur->apPage[pCur->iPage]->aDataEnd ||CORRUPT_DB);
4725   amt = (int)(pCur->apPage[pCur->iPage]->aDataEnd - pCur->info.pPayload);
4726   if( pCur->info.nLocal<amt ) amt = pCur->info.nLocal;
4727   *pAmt = amt;
4728   return (void*)pCur->info.pPayload;
4729 }
4730 
4731 
4732 /*
4733 ** For the entry that cursor pCur is point to, return as
4734 ** many bytes of the key or data as are available on the local
4735 ** b-tree page.  Write the number of available bytes into *pAmt.
4736 **
4737 ** The pointer returned is ephemeral.  The key/data may move
4738 ** or be destroyed on the next call to any Btree routine,
4739 ** including calls from other threads against the same cache.
4740 ** Hence, a mutex on the BtShared should be held prior to calling
4741 ** this routine.
4742 **
4743 ** These routines is used to get quick access to key and data
4744 ** in the common case where no overflow pages are used.
4745 */
4746 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, u32 *pAmt){
4747   return fetchPayload(pCur, pAmt);
4748 }
4749 const void *sqlite3BtreeDataFetch(BtCursor *pCur, u32 *pAmt){
4750   return fetchPayload(pCur, pAmt);
4751 }
4752 
4753 
4754 /*
4755 ** Move the cursor down to a new child page.  The newPgno argument is the
4756 ** page number of the child page to move to.
4757 **
4758 ** This function returns SQLITE_CORRUPT if the page-header flags field of
4759 ** the new child page does not match the flags field of the parent (i.e.
4760 ** if an intkey page appears to be the parent of a non-intkey page, or
4761 ** vice-versa).
4762 */
4763 static int moveToChild(BtCursor *pCur, u32 newPgno){
4764   BtShared *pBt = pCur->pBt;
4765 
4766   assert( cursorOwnsBtShared(pCur) );
4767   assert( pCur->eState==CURSOR_VALID );
4768   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4769   assert( pCur->iPage>=0 );
4770   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
4771     return SQLITE_CORRUPT_BKPT;
4772   }
4773   pCur->info.nSize = 0;
4774   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4775   pCur->iPage++;
4776   pCur->aiIdx[pCur->iPage] = 0;
4777   return getAndInitPage(pBt, newPgno, &pCur->apPage[pCur->iPage],
4778                         pCur, pCur->curPagerFlags);
4779 }
4780 
#if SQLITE_DEBUG
/*
** Page pParent is an internal (non-leaf) tree page.  This function
** asserts that page number iChild is the left-child of the iIdx'th
** cell in page pParent.  Or, if iIdx is equal to the total number of
** cells in pParent, that page number iChild is the right-child of
** the page (the 4-byte value at offset hdrOffset+8).
**
** Nothing is checked when CORRUPT_DB is true.  In builds without
** SQLITE_DEBUG this is a macro that expands to nothing.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  if( CORRUPT_DB ){
    /* The conditions tested below might not be true in a corrupt
    ** database */
    return;
  }
  assert( iIdx<=pParent->nCell );
  if( iIdx!=pParent->nCell ){
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z)
#endif
4802 
4803 /*
4804 ** Move the cursor up to the parent page.
4805 **
4806 ** pCur->idx is set to the cell index that contains the pointer
4807 ** to the page we are coming from.  If we are coming from the
4808 ** right-most child page then pCur->idx is set to one more than
4809 ** the largest cell index.
4810 */
4811 static void moveToParent(BtCursor *pCur){
4812   assert( cursorOwnsBtShared(pCur) );
4813   assert( pCur->eState==CURSOR_VALID );
4814   assert( pCur->iPage>0 );
4815   assert( pCur->apPage[pCur->iPage] );
4816   assertParentIndex(
4817     pCur->apPage[pCur->iPage-1],
4818     pCur->aiIdx[pCur->iPage-1],
4819     pCur->apPage[pCur->iPage]->pgno
4820   );
4821   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
4822   pCur->info.nSize = 0;
4823   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4824   releasePageNotNull(pCur->apPage[pCur->iPage--]);
4825 }
4826 
4827 /*
4828 ** Move the cursor to point to the root page of its b-tree structure.
4829 **
4830 ** If the table has a virtual root page, then the cursor is moved to point
4831 ** to the virtual root page instead of the actual root page. A table has a
4832 ** virtual root page when the actual root page contains no cells and a
4833 ** single child page. This can only happen with the table rooted at page 1.
4834 **
4835 ** If the b-tree structure is empty, the cursor state is set to
4836 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
4837 ** cell located on the root (or virtual root) page and the cursor state
4838 ** is set to CURSOR_VALID.
4839 **
4840 ** If this function returns successfully, it may be assumed that the
4841 ** page-header flags indicate that the [virtual] root-page is the expected
4842 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
4843 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4844 ** indicating a table b-tree, or if the caller did specify a KeyInfo
4845 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4846 ** b-tree).
4847 */
4848 static int moveToRoot(BtCursor *pCur){
4849   MemPage *pRoot;
4850   int rc = SQLITE_OK;
4851 
4852   assert( cursorOwnsBtShared(pCur) );
4853   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4854   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
4855   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
4856   if( pCur->eState>=CURSOR_REQUIRESEEK ){
4857     if( pCur->eState==CURSOR_FAULT ){
4858       assert( pCur->skipNext!=SQLITE_OK );
4859       return pCur->skipNext;
4860     }
4861     sqlite3BtreeClearCursor(pCur);
4862   }
4863 
4864   if( pCur->iPage>=0 ){
4865     while( pCur->iPage ){
4866       assert( pCur->apPage[pCur->iPage]!=0 );
4867       releasePageNotNull(pCur->apPage[pCur->iPage--]);
4868     }
4869   }else if( pCur->pgnoRoot==0 ){
4870     pCur->eState = CURSOR_INVALID;
4871     return SQLITE_OK;
4872   }else{
4873     assert( pCur->iPage==(-1) );
4874     rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0],
4875                         0, pCur->curPagerFlags);
4876     if( rc!=SQLITE_OK ){
4877       pCur->eState = CURSOR_INVALID;
4878       return rc;
4879     }
4880     pCur->iPage = 0;
4881     pCur->curIntKey = pCur->apPage[0]->intKey;
4882   }
4883   pRoot = pCur->apPage[0];
4884   assert( pRoot->pgno==pCur->pgnoRoot );
4885 
4886   /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4887   ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4888   ** NULL, the caller expects a table b-tree. If this is not the case,
4889   ** return an SQLITE_CORRUPT error.
4890   **
4891   ** Earlier versions of SQLite assumed that this test could not fail
4892   ** if the root page was already loaded when this function was called (i.e.
4893   ** if pCur->iPage>=0). But this is not so if the database is corrupted
4894   ** in such a way that page pRoot is linked into a second b-tree table
4895   ** (or the freelist).  */
4896   assert( pRoot->intKey==1 || pRoot->intKey==0 );
4897   if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
4898     return SQLITE_CORRUPT_BKPT;
4899   }
4900 
4901   pCur->aiIdx[0] = 0;
4902   pCur->info.nSize = 0;
4903   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
4904 
4905   if( pRoot->nCell>0 ){
4906     pCur->eState = CURSOR_VALID;
4907   }else if( !pRoot->leaf ){
4908     Pgno subpage;
4909     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
4910     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
4911     pCur->eState = CURSOR_VALID;
4912     rc = moveToChild(pCur, subpage);
4913   }else{
4914     pCur->eState = CURSOR_INVALID;
4915   }
4916   return rc;
4917 }
4918 
4919 /*
4920 ** Move the cursor down to the left-most leaf entry beneath the
4921 ** entry to which it is currently pointing.
4922 **
4923 ** The left-most leaf is the one with the smallest key - the first
4924 ** in ascending order.
4925 */
4926 static int moveToLeftmost(BtCursor *pCur){
4927   Pgno pgno;
4928   int rc = SQLITE_OK;
4929   MemPage *pPage;
4930 
4931   assert( cursorOwnsBtShared(pCur) );
4932   assert( pCur->eState==CURSOR_VALID );
4933   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4934     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4935     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
4936     rc = moveToChild(pCur, pgno);
4937   }
4938   return rc;
4939 }
4940 
4941 /*
4942 ** Move the cursor down to the right-most leaf entry beneath the
4943 ** page to which it is currently pointing.  Notice the difference
4944 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
4945 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4946 ** finds the right-most entry beneath the *page*.
4947 **
4948 ** The right-most entry is the one with the largest key - the last
4949 ** key in ascending order.
4950 */
4951 static int moveToRightmost(BtCursor *pCur){
4952   Pgno pgno;
4953   int rc = SQLITE_OK;
4954   MemPage *pPage = 0;
4955 
4956   assert( cursorOwnsBtShared(pCur) );
4957   assert( pCur->eState==CURSOR_VALID );
4958   while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4959     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4960     pCur->aiIdx[pCur->iPage] = pPage->nCell;
4961     rc = moveToChild(pCur, pgno);
4962     if( rc ) return rc;
4963   }
4964   pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
4965   assert( pCur->info.nSize==0 );
4966   assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
4967   return SQLITE_OK;
4968 }
4969 
4970 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
4971 ** on success.  Set *pRes to 0 if the cursor actually points to something
4972 ** or set *pRes to 1 if the table is empty.
4973 */
4974 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
4975   int rc;
4976 
4977   assert( cursorOwnsBtShared(pCur) );
4978   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4979   rc = moveToRoot(pCur);
4980   if( rc==SQLITE_OK ){
4981     if( pCur->eState==CURSOR_INVALID ){
4982       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4983       *pRes = 1;
4984     }else{
4985       assert( pCur->apPage[pCur->iPage]->nCell>0 );
4986       *pRes = 0;
4987       rc = moveToLeftmost(pCur);
4988     }
4989   }
4990   return rc;
4991 }
4992 
4993 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
4994 ** on success.  Set *pRes to 0 if the cursor actually points to something
4995 ** or set *pRes to 1 if the table is empty.
4996 */
4997 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
4998   int rc;
4999 
5000   assert( cursorOwnsBtShared(pCur) );
5001   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5002 
5003   /* If the cursor already points to the last entry, this is a no-op. */
5004   if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
5005 #ifdef SQLITE_DEBUG
5006     /* This block serves to assert() that the cursor really does point
5007     ** to the last entry in the b-tree. */
5008     int ii;
5009     for(ii=0; ii<pCur->iPage; ii++){
5010       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
5011     }
5012     assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
5013     assert( pCur->apPage[pCur->iPage]->leaf );
5014 #endif
5015     return SQLITE_OK;
5016   }
5017 
5018   rc = moveToRoot(pCur);
5019   if( rc==SQLITE_OK ){
5020     if( CURSOR_INVALID==pCur->eState ){
5021       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
5022       *pRes = 1;
5023     }else{
5024       assert( pCur->eState==CURSOR_VALID );
5025       *pRes = 0;
5026       rc = moveToRightmost(pCur);
5027       if( rc==SQLITE_OK ){
5028         pCur->curFlags |= BTCF_AtLast;
5029       }else{
5030         pCur->curFlags &= ~BTCF_AtLast;
5031       }
5032 
5033     }
5034   }
5035   return rc;
5036 }
5037 
5038 /* Move the cursor so that it points to an entry near the key
5039 ** specified by pIdxKey or intKey.   Return a success code.
5040 **
5041 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
5042 ** must be NULL.  For index tables, pIdxKey is used and intKey
5043 ** is ignored.
5044 **
5045 ** If an exact match is not found, then the cursor is always
5046 ** left pointing at a leaf page which would hold the entry if it
5047 ** were present.  The cursor might point to an entry that comes
5048 ** before or after the key.
5049 **
5050 ** An integer is written into *pRes which is the result of
5051 ** comparing the key with the entry to which the cursor is
5052 ** pointing.  The meaning of the integer written into
5053 ** *pRes is as follows:
5054 **
5055 **     *pRes<0      The cursor is left pointing at an entry that
5056 **                  is smaller than intKey/pIdxKey or if the table is empty
5057 **                  and the cursor is therefore left point to nothing.
5058 **
5059 **     *pRes==0     The cursor is left pointing at an entry that
5060 **                  exactly matches intKey/pIdxKey.
5061 **
5062 **     *pRes>0      The cursor is left pointing at an entry that
5063 **                  is larger than intKey/pIdxKey.
5064 **
5065 ** For index tables, the pIdxKey->eqSeen field is set to 1 if there
5066 ** exists an entry in the table that exactly matches pIdxKey.
5067 */
5068 int sqlite3BtreeMovetoUnpacked(
5069   BtCursor *pCur,          /* The cursor to be moved */
5070   UnpackedRecord *pIdxKey, /* Unpacked index key */
5071   i64 intKey,              /* The table key */
5072   int biasRight,           /* If true, bias the search to the high end */
5073   int *pRes                /* Write search results here */
5074 ){
5075   int rc;
5076   RecordCompare xRecordCompare;
5077 
5078   assert( cursorOwnsBtShared(pCur) );
5079   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5080   assert( pRes );
5081   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
5082 
5083   /* If the cursor is already positioned at the point we are trying
5084   ** to move to, then just return without doing any work */
5085   if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
5086    && pCur->curIntKey
5087   ){
5088     if( pCur->info.nKey==intKey ){
5089       *pRes = 0;
5090       return SQLITE_OK;
5091     }
5092     if( (pCur->curFlags & BTCF_AtLast)!=0 && pCur->info.nKey<intKey ){
5093       *pRes = -1;
5094       return SQLITE_OK;
5095     }
5096   }
5097 
5098   if( pIdxKey ){
5099     xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
5100     pIdxKey->errCode = 0;
5101     assert( pIdxKey->default_rc==1
5102          || pIdxKey->default_rc==0
5103          || pIdxKey->default_rc==-1
5104     );
5105   }else{
5106     xRecordCompare = 0; /* All keys are integers */
5107   }
5108 
5109   rc = moveToRoot(pCur);
5110   if( rc ){
5111     return rc;
5112   }
5113   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage] );
5114   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->isInit );
5115   assert( pCur->eState==CURSOR_INVALID || pCur->apPage[pCur->iPage]->nCell>0 );
5116   if( pCur->eState==CURSOR_INVALID ){
5117     *pRes = -1;
5118     assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
5119     return SQLITE_OK;
5120   }
5121   assert( pCur->apPage[0]->intKey==pCur->curIntKey );
5122   assert( pCur->curIntKey || pIdxKey );
5123   for(;;){
5124     int lwr, upr, idx, c;
5125     Pgno chldPg;
5126     MemPage *pPage = pCur->apPage[pCur->iPage];
5127     u8 *pCell;                          /* Pointer to current cell in pPage */
5128 
5129     /* pPage->nCell must be greater than zero. If this is the root-page
5130     ** the cursor would have been INVALID above and this for(;;) loop
5131     ** not run. If this is not the root-page, then the moveToChild() routine
5132     ** would have already detected db corruption. Similarly, pPage must
5133     ** be the right kind (index or table) of b-tree page. Otherwise
5134     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
5135     assert( pPage->nCell>0 );
5136     assert( pPage->intKey==(pIdxKey==0) );
5137     lwr = 0;
5138     upr = pPage->nCell-1;
5139     assert( biasRight==0 || biasRight==1 );
5140     idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
5141     pCur->aiIdx[pCur->iPage] = (u16)idx;
5142     if( xRecordCompare==0 ){
5143       for(;;){
5144         i64 nCellKey;
5145         pCell = findCellPastPtr(pPage, idx);
5146         if( pPage->intKeyLeaf ){
5147           while( 0x80 <= *(pCell++) ){
5148             if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
5149           }
5150         }
5151         getVarint(pCell, (u64*)&nCellKey);
5152         if( nCellKey<intKey ){
5153           lwr = idx+1;
5154           if( lwr>upr ){ c = -1; break; }
5155         }else if( nCellKey>intKey ){
5156           upr = idx-1;
5157           if( lwr>upr ){ c = +1; break; }
5158         }else{
5159           assert( nCellKey==intKey );
5160           pCur->curFlags |= BTCF_ValidNKey;
5161           pCur->info.nKey = nCellKey;
5162           pCur->aiIdx[pCur->iPage] = (u16)idx;
5163           if( !pPage->leaf ){
5164             lwr = idx;
5165             goto moveto_next_layer;
5166           }else{
5167             *pRes = 0;
5168             rc = SQLITE_OK;
5169             goto moveto_finish;
5170           }
5171         }
5172         assert( lwr+upr>=0 );
5173         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
5174       }
5175     }else{
5176       for(;;){
5177         int nCell;  /* Size of the pCell cell in bytes */
5178         pCell = findCellPastPtr(pPage, idx);
5179 
5180         /* The maximum supported page-size is 65536 bytes. This means that
5181         ** the maximum number of record bytes stored on an index B-Tree
5182         ** page is less than 16384 bytes and may be stored as a 2-byte
5183         ** varint. This information is used to attempt to avoid parsing
5184         ** the entire cell by checking for the cases where the record is
5185         ** stored entirely within the b-tree page by inspecting the first
5186         ** 2 bytes of the cell.
5187         */
5188         nCell = pCell[0];
5189         if( nCell<=pPage->max1bytePayload ){
5190           /* This branch runs if the record-size field of the cell is a
5191           ** single byte varint and the record fits entirely on the main
5192           ** b-tree page.  */
5193           testcase( pCell+nCell+1==pPage->aDataEnd );
5194           c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
5195         }else if( !(pCell[1] & 0x80)
5196           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
5197         ){
5198           /* The record-size field is a 2 byte varint and the record
5199           ** fits entirely on the main b-tree page.  */
5200           testcase( pCell+nCell+2==pPage->aDataEnd );
5201           c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
5202         }else{
5203           /* The record flows over onto one or more overflow pages. In
5204           ** this case the whole cell needs to be parsed, a buffer allocated
5205           ** and accessPayload() used to retrieve the record into the
5206           ** buffer before VdbeRecordCompare() can be called.
5207           **
5208           ** If the record is corrupt, the xRecordCompare routine may read
5209           ** up to two varints past the end of the buffer. An extra 18
5210           ** bytes of padding is allocated at the end of the buffer in
5211           ** case this happens.  */
5212           void *pCellKey;
5213           u8 * const pCellBody = pCell - pPage->childPtrSize;
5214           pPage->xParseCell(pPage, pCellBody, &pCur->info);
5215           nCell = (int)pCur->info.nKey;
5216           testcase( nCell<0 );   /* True if key size is 2^32 or more */
5217           testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
5218           testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
5219           testcase( nCell==2 );  /* Minimum legal index key size */
5220           if( nCell<2 ){
5221             rc = SQLITE_CORRUPT_BKPT;
5222             goto moveto_finish;
5223           }
5224           pCellKey = sqlite3Malloc( nCell+18 );
5225           if( pCellKey==0 ){
5226             rc = SQLITE_NOMEM_BKPT;
5227             goto moveto_finish;
5228           }
5229           pCur->aiIdx[pCur->iPage] = (u16)idx;
5230           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 2);
5231           if( rc ){
5232             sqlite3_free(pCellKey);
5233             goto moveto_finish;
5234           }
5235           c = xRecordCompare(nCell, pCellKey, pIdxKey);
5236           sqlite3_free(pCellKey);
5237         }
5238         assert(
5239             (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
5240          && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
5241         );
5242         if( c<0 ){
5243           lwr = idx+1;
5244         }else if( c>0 ){
5245           upr = idx-1;
5246         }else{
5247           assert( c==0 );
5248           *pRes = 0;
5249           rc = SQLITE_OK;
5250           pCur->aiIdx[pCur->iPage] = (u16)idx;
5251           if( pIdxKey->errCode ) rc = SQLITE_CORRUPT;
5252           goto moveto_finish;
5253         }
5254         if( lwr>upr ) break;
5255         assert( lwr+upr>=0 );
5256         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
5257       }
5258     }
5259     assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
5260     assert( pPage->isInit );
5261     if( pPage->leaf ){
5262       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
5263       pCur->aiIdx[pCur->iPage] = (u16)idx;
5264       *pRes = c;
5265       rc = SQLITE_OK;
5266       goto moveto_finish;
5267     }
5268 moveto_next_layer:
5269     if( lwr>=pPage->nCell ){
5270       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5271     }else{
5272       chldPg = get4byte(findCell(pPage, lwr));
5273     }
5274     pCur->aiIdx[pCur->iPage] = (u16)lwr;
5275     rc = moveToChild(pCur, chldPg);
5276     if( rc ) break;
5277   }
5278 moveto_finish:
5279   pCur->info.nSize = 0;
5280   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5281   return rc;
5282 }
5283 
5284 
5285 /*
5286 ** Return TRUE if the cursor is not pointing at an entry of the table.
5287 **
5288 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
5289 ** past the last entry in the table or sqlite3BtreePrev() moves past
5290 ** the first entry.  TRUE is also returned if the table is empty.
5291 */
5292 int sqlite3BtreeEof(BtCursor *pCur){
5293   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
5294   ** have been deleted? This API will need to change to return an error code
5295   ** as well as the boolean result value.
5296   */
5297   return (CURSOR_VALID!=pCur->eState);
5298 }
5299 
5300 /*
5301 ** Advance the cursor to the next entry in the database.  If
5302 ** successful then set *pRes=0.  If the cursor
5303 ** was already pointing to the last entry in the database before
5304 ** this routine was called, then set *pRes=1.
5305 **
5306 ** The main entry point is sqlite3BtreeNext().  That routine is optimized
5307 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
5308 ** to the next cell on the current page.  The (slower) btreeNext() helper
5309 ** routine is called when it is necessary to move to a different page or
5310 ** to restore the cursor.
5311 **
5312 ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
5313 ** will be 1 if the cursor being stepped corresponds to an SQL index and
5314 ** if this routine could have been skipped if that SQL index had been
5315 ** a unique index.  Otherwise the caller will have set *pRes to zero.
5316 ** Zero is the common case. The btree implementation is free to use the
5317 ** initial *pRes value as a hint to improve performance, but the current
5318 ** SQLite btree implementation does not. (Note that the comdb2 btree
5319 ** implementation does use this hint, however.)
5320 */
5321 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur, int *pRes){
5322   int rc;
5323   int idx;
5324   MemPage *pPage;
5325 
5326   assert( cursorOwnsBtShared(pCur) );
5327   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5328   assert( *pRes==0 );
5329   if( pCur->eState!=CURSOR_VALID ){
5330     assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5331     rc = restoreCursorPosition(pCur);
5332     if( rc!=SQLITE_OK ){
5333       return rc;
5334     }
5335     if( CURSOR_INVALID==pCur->eState ){
5336       *pRes = 1;
5337       return SQLITE_OK;
5338     }
5339     if( pCur->skipNext ){
5340       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
5341       pCur->eState = CURSOR_VALID;
5342       if( pCur->skipNext>0 ){
5343         pCur->skipNext = 0;
5344         return SQLITE_OK;
5345       }
5346       pCur->skipNext = 0;
5347     }
5348   }
5349 
5350   pPage = pCur->apPage[pCur->iPage];
5351   idx = ++pCur->aiIdx[pCur->iPage];
5352   assert( pPage->isInit );
5353 
5354   /* If the database file is corrupt, it is possible for the value of idx
5355   ** to be invalid here. This can only occur if a second cursor modifies
5356   ** the page while cursor pCur is holding a reference to it. Which can
5357   ** only happen if the database is corrupt in such a way as to link the
5358   ** page into more than one b-tree structure. */
5359   testcase( idx>pPage->nCell );
5360 
5361   if( idx>=pPage->nCell ){
5362     if( !pPage->leaf ){
5363       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
5364       if( rc ) return rc;
5365       return moveToLeftmost(pCur);
5366     }
5367     do{
5368       if( pCur->iPage==0 ){
5369         *pRes = 1;
5370         pCur->eState = CURSOR_INVALID;
5371         return SQLITE_OK;
5372       }
5373       moveToParent(pCur);
5374       pPage = pCur->apPage[pCur->iPage];
5375     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
5376     if( pPage->intKey ){
5377       return sqlite3BtreeNext(pCur, pRes);
5378     }else{
5379       return SQLITE_OK;
5380     }
5381   }
5382   if( pPage->leaf ){
5383     return SQLITE_OK;
5384   }else{
5385     return moveToLeftmost(pCur);
5386   }
5387 }
5388 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
5389   MemPage *pPage;
5390   assert( cursorOwnsBtShared(pCur) );
5391   assert( pRes!=0 );
5392   assert( *pRes==0 || *pRes==1 );
5393   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5394   pCur->info.nSize = 0;
5395   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5396   *pRes = 0;
5397   if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);
5398   pPage = pCur->apPage[pCur->iPage];
5399   if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){
5400     pCur->aiIdx[pCur->iPage]--;
5401     return btreeNext(pCur, pRes);
5402   }
5403   if( pPage->leaf ){
5404     return SQLITE_OK;
5405   }else{
5406     return moveToLeftmost(pCur);
5407   }
5408 }
5409 
5410 /*
5411 ** Step the cursor to the back to the previous entry in the database.  If
5412 ** successful then set *pRes=0.  If the cursor
5413 ** was already pointing to the first entry in the database before
5414 ** this routine was called, then set *pRes=1.
5415 **
5416 ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
5417 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
5418 ** to the previous cell on the current page.  The (slower) btreePrevious()
5419 ** helper routine is called when it is necessary to move to a different page
5420 ** or to restore the cursor.
5421 **
5422 ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
5423 ** will be 1 if the cursor being stepped corresponds to an SQL index and
5424 ** if this routine could have been skipped if that SQL index had been
5425 ** a unique index.  Otherwise the caller will have set *pRes to zero.
5426 ** Zero is the common case. The btree implementation is free to use the
5427 ** initial *pRes value as a hint to improve performance, but the current
5428 ** SQLite btree implementation does not. (Note that the comdb2 btree
5429 ** implementation does use this hint, however.)
5430 */
5431 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur, int *pRes){
5432   int rc;
5433   MemPage *pPage;
5434 
5435   assert( cursorOwnsBtShared(pCur) );
5436   assert( pRes!=0 );
5437   assert( *pRes==0 );
5438   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5439   assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
5440   assert( pCur->info.nSize==0 );
5441   if( pCur->eState!=CURSOR_VALID ){
5442     rc = restoreCursorPosition(pCur);
5443     if( rc!=SQLITE_OK ){
5444       return rc;
5445     }
5446     if( CURSOR_INVALID==pCur->eState ){
5447       *pRes = 1;
5448       return SQLITE_OK;
5449     }
5450     if( pCur->skipNext ){
5451       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
5452       pCur->eState = CURSOR_VALID;
5453       if( pCur->skipNext<0 ){
5454         pCur->skipNext = 0;
5455         return SQLITE_OK;
5456       }
5457       pCur->skipNext = 0;
5458     }
5459   }
5460 
5461   pPage = pCur->apPage[pCur->iPage];
5462   assert( pPage->isInit );
5463   if( !pPage->leaf ){
5464     int idx = pCur->aiIdx[pCur->iPage];
5465     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
5466     if( rc ) return rc;
5467     rc = moveToRightmost(pCur);
5468   }else{
5469     while( pCur->aiIdx[pCur->iPage]==0 ){
5470       if( pCur->iPage==0 ){
5471         pCur->eState = CURSOR_INVALID;
5472         *pRes = 1;
5473         return SQLITE_OK;
5474       }
5475       moveToParent(pCur);
5476     }
5477     assert( pCur->info.nSize==0 );
5478     assert( (pCur->curFlags & (BTCF_ValidNKey|BTCF_ValidOvfl))==0 );
5479 
5480     pCur->aiIdx[pCur->iPage]--;
5481     pPage = pCur->apPage[pCur->iPage];
5482     if( pPage->intKey && !pPage->leaf ){
5483       rc = sqlite3BtreePrevious(pCur, pRes);
5484     }else{
5485       rc = SQLITE_OK;
5486     }
5487   }
5488   return rc;
5489 }
5490 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
5491   assert( cursorOwnsBtShared(pCur) );
5492   assert( pRes!=0 );
5493   assert( *pRes==0 || *pRes==1 );
5494   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
5495   *pRes = 0;
5496   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
5497   pCur->info.nSize = 0;
5498   if( pCur->eState!=CURSOR_VALID
5499    || pCur->aiIdx[pCur->iPage]==0
5500    || pCur->apPage[pCur->iPage]->leaf==0
5501   ){
5502     return btreePrevious(pCur, pRes);
5503   }
5504   pCur->aiIdx[pCur->iPage]--;
5505   return SQLITE_OK;
5506 }
5507 
5508 /*
5509 ** Allocate a new page from the database file.
5510 **
5511 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
5512 ** has already been called on the new page.)  The new page has also
5513 ** been referenced and the calling routine is responsible for calling
5514 ** sqlite3PagerUnref() on the new page when it is done.
5515 **
5516 ** SQLITE_OK is returned on success.  Any other return value indicates
5517 ** an error.  *ppPage is set to NULL in the event of an error.
5518 **
5519 ** If the "nearby" parameter is not 0, then an effort is made to
5520 ** locate a page close to the page number "nearby".  This can be used in an
5521 ** attempt to keep related pages close to each other in the database file,
5522 ** which in turn can make database access faster.
5523 **
5524 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
5525 ** anywhere on the free-list, then it is guaranteed to be returned.  If
5526 ** eMode is BTALLOC_LE then the page returned will be less than or equal
5527 ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
5528 ** are no restrictions on which page is returned.
5529 */
5530 static int allocateBtreePage(
5531   BtShared *pBt,         /* The btree */
5532   MemPage **ppPage,      /* Store pointer to the allocated page here */
5533   Pgno *pPgno,           /* Store the page number here */
5534   Pgno nearby,           /* Search for a page near this one */
5535   u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
5536 ){
5537   MemPage *pPage1;
5538   int rc;
5539   u32 n;     /* Number of pages on the freelist */
5540   u32 k;     /* Number of leaves on the trunk of the freelist */
5541   MemPage *pTrunk = 0;
5542   MemPage *pPrevTrunk = 0;
5543   Pgno mxPage;     /* Total size of the database file */
5544 
5545   assert( sqlite3_mutex_held(pBt->mutex) );
5546   assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
5547   pPage1 = pBt->pPage1;
5548   mxPage = btreePagecount(pBt);
5549   /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
5550   ** stores the total number of pages on the freelist. */
5551   n = get4byte(&pPage1->aData[36]);
5552   testcase( n==mxPage-1 );
5553   if( n>=mxPage ){
5554     return SQLITE_CORRUPT_BKPT;
5555   }
5556   if( n>0 ){
5557     /* There are pages on the freelist.  Reuse one of those pages. */
5558     Pgno iTrunk;
5559     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
5560     u32 nSearch = 0;   /* Count of the number of search attempts */
5561 
5562     /* If eMode==BTALLOC_EXACT and a query of the pointer-map
5563     ** shows that the page 'nearby' is somewhere on the free-list, then
5564     ** the entire-list will be searched for that page.
5565     */
5566 #ifndef SQLITE_OMIT_AUTOVACUUM
5567     if( eMode==BTALLOC_EXACT ){
5568       if( nearby<=mxPage ){
5569         u8 eType;
5570         assert( nearby>0 );
5571         assert( pBt->autoVacuum );
5572         rc = ptrmapGet(pBt, nearby, &eType, 0);
5573         if( rc ) return rc;
5574         if( eType==PTRMAP_FREEPAGE ){
5575           searchList = 1;
5576         }
5577       }
5578     }else if( eMode==BTALLOC_LE ){
5579       searchList = 1;
5580     }
5581 #endif
5582 
5583     /* Decrement the free-list count by 1. Set iTrunk to the index of the
5584     ** first free-list trunk page. iPrevTrunk is initially 1.
5585     */
5586     rc = sqlite3PagerWrite(pPage1->pDbPage);
5587     if( rc ) return rc;
5588     put4byte(&pPage1->aData[36], n-1);
5589 
5590     /* The code within this loop is run only once if the 'searchList' variable
5591     ** is not true. Otherwise, it runs once for each trunk-page on the
5592     ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
5593     ** or until a page no greater than 'nearby' is located (eMode==BTALLOC_LE)
5594     */
5595     do {
5596       pPrevTrunk = pTrunk;
5597       if( pPrevTrunk ){
5598         /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
5599         ** is the page number of the next freelist trunk page in the list or
5600         ** zero if this is the last freelist trunk page. */
5601         iTrunk = get4byte(&pPrevTrunk->aData[0]);
5602       }else{
5603         /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
5604         ** stores the page number of the first page of the freelist, or zero if
5605         ** the freelist is empty. */
5606         iTrunk = get4byte(&pPage1->aData[32]);
5607       }
5608       testcase( iTrunk==mxPage );
5609       if( iTrunk>mxPage || nSearch++ > n ){
5610         rc = SQLITE_CORRUPT_BKPT;
5611       }else{
5612         rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
5613       }
5614       if( rc ){
5615         pTrunk = 0;
5616         goto end_allocate_page;
5617       }
5618       assert( pTrunk!=0 );
5619       assert( pTrunk->aData!=0 );
5620       /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
5621       ** is the number of leaf page pointers to follow. */
5622       k = get4byte(&pTrunk->aData[4]);
5623       if( k==0 && !searchList ){
5624         /* The trunk has no leaves and the list is not being searched.
5625         ** So extract the trunk page itself and use it as the newly
5626         ** allocated page */
5627         assert( pPrevTrunk==0 );
5628         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5629         if( rc ){
5630           goto end_allocate_page;
5631         }
5632         *pPgno = iTrunk;
5633         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5634         *ppPage = pTrunk;
5635         pTrunk = 0;
5636         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5637       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
5638         /* Value of k is out of range.  Database corruption */
5639         rc = SQLITE_CORRUPT_BKPT;
5640         goto end_allocate_page;
5641 #ifndef SQLITE_OMIT_AUTOVACUUM
5642       }else if( searchList
5643             && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
5644       ){
5645         /* The list is being searched and this trunk page is the page
5646         ** to allocate, regardless of whether it has leaves.
5647         */
5648         *pPgno = iTrunk;
5649         *ppPage = pTrunk;
5650         searchList = 0;
5651         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5652         if( rc ){
5653           goto end_allocate_page;
5654         }
5655         if( k==0 ){
5656           if( !pPrevTrunk ){
5657             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5658           }else{
5659             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5660             if( rc!=SQLITE_OK ){
5661               goto end_allocate_page;
5662             }
5663             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
5664           }
5665         }else{
5666           /* The trunk page is required by the caller but it contains
5667           ** pointers to free-list leaves. The first leaf becomes a trunk
5668           ** page in this case.
5669           */
5670           MemPage *pNewTrunk;
5671           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
5672           if( iNewTrunk>mxPage ){
5673             rc = SQLITE_CORRUPT_BKPT;
5674             goto end_allocate_page;
5675           }
5676           testcase( iNewTrunk==mxPage );
5677           rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
5678           if( rc!=SQLITE_OK ){
5679             goto end_allocate_page;
5680           }
5681           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
5682           if( rc!=SQLITE_OK ){
5683             releasePage(pNewTrunk);
5684             goto end_allocate_page;
5685           }
5686           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
5687           put4byte(&pNewTrunk->aData[4], k-1);
5688           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
5689           releasePage(pNewTrunk);
5690           if( !pPrevTrunk ){
5691             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
5692             put4byte(&pPage1->aData[32], iNewTrunk);
5693           }else{
5694             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5695             if( rc ){
5696               goto end_allocate_page;
5697             }
5698             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
5699           }
5700         }
5701         pTrunk = 0;
5702         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5703 #endif
5704       }else if( k>0 ){
5705         /* Extract a leaf from the trunk */
5706         u32 closest;
5707         Pgno iPage;
5708         unsigned char *aData = pTrunk->aData;
5709         if( nearby>0 ){
5710           u32 i;
5711           closest = 0;
5712           if( eMode==BTALLOC_LE ){
5713             for(i=0; i<k; i++){
5714               iPage = get4byte(&aData[8+i*4]);
5715               if( iPage<=nearby ){
5716                 closest = i;
5717                 break;
5718               }
5719             }
5720           }else{
5721             int dist;
5722             dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
5723             for(i=1; i<k; i++){
5724               int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
5725               if( d2<dist ){
5726                 closest = i;
5727                 dist = d2;
5728               }
5729             }
5730           }
5731         }else{
5732           closest = 0;
5733         }
5734 
5735         iPage = get4byte(&aData[8+closest*4]);
5736         testcase( iPage==mxPage );
5737         if( iPage>mxPage ){
5738           rc = SQLITE_CORRUPT_BKPT;
5739           goto end_allocate_page;
5740         }
5741         testcase( iPage==mxPage );
5742         if( !searchList
5743          || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
5744         ){
5745           int noContent;
5746           *pPgno = iPage;
5747           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
5748                  ": %d more free pages\n",
5749                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
5750           rc = sqlite3PagerWrite(pTrunk->pDbPage);
5751           if( rc ) goto end_allocate_page;
5752           if( closest<k-1 ){
5753             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
5754           }
5755           put4byte(&aData[4], k-1);
5756           noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
5757           rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
5758           if( rc==SQLITE_OK ){
5759             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5760             if( rc!=SQLITE_OK ){
5761               releasePage(*ppPage);
5762               *ppPage = 0;
5763             }
5764           }
5765           searchList = 0;
5766         }
5767       }
5768       releasePage(pPrevTrunk);
5769       pPrevTrunk = 0;
5770     }while( searchList );
5771   }else{
5772     /* There are no pages on the freelist, so append a new page to the
5773     ** database image.
5774     **
5775     ** Normally, new pages allocated by this block can be requested from the
5776     ** pager layer with the 'no-content' flag set. This prevents the pager
5777     ** from trying to read the pages content from disk. However, if the
5778     ** current transaction has already run one or more incremental-vacuum
5779     ** steps, then the page we are about to allocate may contain content
5780     ** that is required in the event of a rollback. In this case, do
5781     ** not set the no-content flag. This causes the pager to load and journal
5782     ** the current page content before overwriting it.
5783     **
5784     ** Note that the pager will not actually attempt to load or journal
5785     ** content for any page that really does lie past the end of the database
5786     ** file on disk. So the effects of disabling the no-content optimization
5787     ** here are confined to those pages that lie between the end of the
5788     ** database image and the end of the database file.
5789     */
5790     int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
5791 
5792     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
5793     if( rc ) return rc;
5794     pBt->nPage++;
5795     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
5796 
5797 #ifndef SQLITE_OMIT_AUTOVACUUM
5798     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
5799       /* If *pPgno refers to a pointer-map page, allocate two new pages
5800       ** at the end of the file instead of one. The first allocated page
5801       ** becomes a new pointer-map page, the second is used by the caller.
5802       */
5803       MemPage *pPg = 0;
5804       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
5805       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
5806       rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
5807       if( rc==SQLITE_OK ){
5808         rc = sqlite3PagerWrite(pPg->pDbPage);
5809         releasePage(pPg);
5810       }
5811       if( rc ) return rc;
5812       pBt->nPage++;
5813       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
5814     }
5815 #endif
5816     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
5817     *pPgno = pBt->nPage;
5818 
5819     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5820     rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
5821     if( rc ) return rc;
5822     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5823     if( rc!=SQLITE_OK ){
5824       releasePage(*ppPage);
5825       *ppPage = 0;
5826     }
5827     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
5828   }
5829 
5830   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5831 
5832 end_allocate_page:
5833   releasePage(pTrunk);
5834   releasePage(pPrevTrunk);
5835   assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
5836   assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
5837   return rc;
5838 }
5839 
5840 /*
5841 ** This function is used to add page iPage to the database file free-list.
5842 ** It is assumed that the page is not already a part of the free-list.
5843 **
5844 ** The value passed as the second argument to this function is optional.
5845 ** If the caller happens to have a pointer to the MemPage object
5846 ** corresponding to page iPage handy, it may pass it as the second value.
5847 ** Otherwise, it may pass NULL.
5848 **
5849 ** If a pointer to a MemPage object is passed as the second argument,
5850 ** its reference count is not altered by this function.
5851 */
5852 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
5853   MemPage *pTrunk = 0;                /* Free-list trunk page */
5854   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
5855   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
5856   MemPage *pPage;                     /* Page being freed. May be NULL. */
5857   int rc;                             /* Return Code */
5858   int nFree;                          /* Initial number of pages on free-list */
5859 
5860   assert( sqlite3_mutex_held(pBt->mutex) );
5861   assert( CORRUPT_DB || iPage>1 );
5862   assert( !pMemPage || pMemPage->pgno==iPage );
5863 
5864   if( iPage<2 ) return SQLITE_CORRUPT_BKPT;
5865   if( pMemPage ){
5866     pPage = pMemPage;
5867     sqlite3PagerRef(pPage->pDbPage);
5868   }else{
5869     pPage = btreePageLookup(pBt, iPage);
5870   }
5871 
5872   /* Increment the free page count on pPage1 */
5873   rc = sqlite3PagerWrite(pPage1->pDbPage);
5874   if( rc ) goto freepage_out;
5875   nFree = get4byte(&pPage1->aData[36]);
5876   put4byte(&pPage1->aData[36], nFree+1);
5877 
5878   if( pBt->btsFlags & BTS_SECURE_DELETE ){
5879     /* If the secure_delete option is enabled, then
5880     ** always fully overwrite deleted information with zeros.
5881     */
5882     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
5883      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
5884     ){
5885       goto freepage_out;
5886     }
5887     memset(pPage->aData, 0, pPage->pBt->pageSize);
5888   }
5889 
5890   /* If the database supports auto-vacuum, write an entry in the pointer-map
5891   ** to indicate that the page is free.
5892   */
5893   if( ISAUTOVACUUM ){
5894     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
5895     if( rc ) goto freepage_out;
5896   }
5897 
5898   /* Now manipulate the actual database free-list structure. There are two
5899   ** possibilities. If the free-list is currently empty, or if the first
5900   ** trunk page in the free-list is full, then this page will become a
5901   ** new free-list trunk page. Otherwise, it will become a leaf of the
5902   ** first trunk page in the current free-list. This block tests if it
5903   ** is possible to add the page as a new free-list leaf.
5904   */
5905   if( nFree!=0 ){
5906     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
5907 
5908     iTrunk = get4byte(&pPage1->aData[32]);
5909     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
5910     if( rc!=SQLITE_OK ){
5911       goto freepage_out;
5912     }
5913 
5914     nLeaf = get4byte(&pTrunk->aData[4]);
5915     assert( pBt->usableSize>32 );
5916     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
5917       rc = SQLITE_CORRUPT_BKPT;
5918       goto freepage_out;
5919     }
5920     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
5921       /* In this case there is room on the trunk page to insert the page
5922       ** being freed as a new leaf.
5923       **
5924       ** Note that the trunk page is not really full until it contains
5925       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
5926       ** coded.  But due to a coding error in versions of SQLite prior to
5927       ** 3.6.0, databases with freelist trunk pages holding more than
5928       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
5929       ** to maintain backwards compatibility with older versions of SQLite,
5930       ** we will continue to restrict the number of entries to usableSize/4 - 8
5931       ** for now.  At some point in the future (once everyone has upgraded
5932       ** to 3.6.0 or later) we should consider fixing the conditional above
5933       ** to read "usableSize/4-2" instead of "usableSize/4-8".
5934       **
5935       ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
5936       ** avoid using the last six entries in the freelist trunk page array in
5937       ** order that database files created by newer versions of SQLite can be
5938       ** read by older versions of SQLite.
5939       */
5940       rc = sqlite3PagerWrite(pTrunk->pDbPage);
5941       if( rc==SQLITE_OK ){
5942         put4byte(&pTrunk->aData[4], nLeaf+1);
5943         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
5944         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
5945           sqlite3PagerDontWrite(pPage->pDbPage);
5946         }
5947         rc = btreeSetHasContent(pBt, iPage);
5948       }
5949       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
5950       goto freepage_out;
5951     }
5952   }
5953 
5954   /* If control flows to this point, then it was not possible to add the
5955   ** the page being freed as a leaf page of the first trunk in the free-list.
5956   ** Possibly because the free-list is empty, or possibly because the
5957   ** first trunk in the free-list is full. Either way, the page being freed
5958   ** will become the new first trunk page in the free-list.
5959   */
5960   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
5961     goto freepage_out;
5962   }
5963   rc = sqlite3PagerWrite(pPage->pDbPage);
5964   if( rc!=SQLITE_OK ){
5965     goto freepage_out;
5966   }
5967   put4byte(pPage->aData, iTrunk);
5968   put4byte(&pPage->aData[4], 0);
5969   put4byte(&pPage1->aData[32], iPage);
5970   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
5971 
5972 freepage_out:
5973   if( pPage ){
5974     pPage->isInit = 0;
5975   }
5976   releasePage(pPage);
5977   releasePage(pTrunk);
5978   return rc;
5979 }
5980 static void freePage(MemPage *pPage, int *pRC){
5981   if( (*pRC)==SQLITE_OK ){
5982     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
5983   }
5984 }
5985 
5986 /*
5987 ** Free any overflow pages associated with the given Cell.  Write the
5988 ** local Cell size (the number of bytes on the original page, omitting
5989 ** overflow) into *pnSize.
5990 */
5991 static int clearCell(
5992   MemPage *pPage,          /* The page that contains the Cell */
5993   unsigned char *pCell,    /* First byte of the Cell */
5994   u16 *pnSize              /* Write the size of the Cell here */
5995 ){
5996   BtShared *pBt = pPage->pBt;
5997   CellInfo info;
5998   Pgno ovflPgno;
5999   int rc;
6000   int nOvfl;
6001   u32 ovflPageSize;
6002 
6003   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6004   pPage->xParseCell(pPage, pCell, &info);
6005   *pnSize = info.nSize;
6006   if( info.nLocal==info.nPayload ){
6007     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
6008   }
6009   if( pCell+info.nSize-1 > pPage->aData+pPage->maskPage ){
6010     return SQLITE_CORRUPT_BKPT;  /* Cell extends past end of page */
6011   }
6012   ovflPgno = get4byte(pCell + info.nSize - 4);
6013   assert( pBt->usableSize > 4 );
6014   ovflPageSize = pBt->usableSize - 4;
6015   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
6016   assert( nOvfl>0 ||
6017     (CORRUPT_DB && (info.nPayload + ovflPageSize)<ovflPageSize)
6018   );
6019   while( nOvfl-- ){
6020     Pgno iNext = 0;
6021     MemPage *pOvfl = 0;
6022     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
6023       /* 0 is not a legal page number and page 1 cannot be an
6024       ** overflow page. Therefore if ovflPgno<2 or past the end of the
6025       ** file the database must be corrupt. */
6026       return SQLITE_CORRUPT_BKPT;
6027     }
6028     if( nOvfl ){
6029       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
6030       if( rc ) return rc;
6031     }
6032 
6033     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
6034      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
6035     ){
6036       /* There is no reason any cursor should have an outstanding reference
6037       ** to an overflow page belonging to a cell that is being deleted/updated.
6038       ** So if there exists more than one reference to this page, then it
6039       ** must not really be an overflow page and the database must be corrupt.
6040       ** It is helpful to detect this before calling freePage2(), as
6041       ** freePage2() may zero the page contents if secure-delete mode is
6042       ** enabled. If this 'overflow' page happens to be a page that the
6043       ** caller is iterating through or using in some other way, this
6044       ** can be problematic.
6045       */
6046       rc = SQLITE_CORRUPT_BKPT;
6047     }else{
6048       rc = freePage2(pBt, pOvfl, ovflPgno);
6049     }
6050 
6051     if( pOvfl ){
6052       sqlite3PagerUnref(pOvfl->pDbPage);
6053     }
6054     if( rc ) return rc;
6055     ovflPgno = iNext;
6056   }
6057   return SQLITE_OK;
6058 }
6059 
6060 /*
6061 ** Create the byte sequence used to represent a cell on page pPage
6062 ** and write that byte sequence into pCell[].  Overflow pages are
6063 ** allocated and filled in as necessary.  The calling procedure
6064 ** is responsible for making sure sufficient space has been allocated
6065 ** for pCell[].
6066 **
6067 ** Note that pCell does not necessary need to point to the pPage->aData
6068 ** area.  pCell might point to some temporary storage.  The cell will
6069 ** be constructed in this temporary area then copied into pPage->aData
6070 ** later.
6071 */
6072 static int fillInCell(
6073   MemPage *pPage,                /* The page that contains the cell */
6074   unsigned char *pCell,          /* Complete text of the cell */
6075   const void *pKey, i64 nKey,    /* The key */
6076   const void *pData,int nData,   /* The data */
6077   int nZero,                     /* Extra zero bytes to append to pData */
6078   int *pnSize                    /* Write cell size here */
6079 ){
6080   int nPayload;
6081   const u8 *pSrc;
6082   int nSrc, n, rc;
6083   int spaceLeft;
6084   MemPage *pOvfl = 0;
6085   MemPage *pToRelease = 0;
6086   unsigned char *pPrior;
6087   unsigned char *pPayload;
6088   BtShared *pBt = pPage->pBt;
6089   Pgno pgnoOvfl = 0;
6090   int nHeader;
6091 
6092   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6093 
6094   /* pPage is not necessarily writeable since pCell might be auxiliary
6095   ** buffer space that is separate from the pPage buffer area */
6096   assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
6097             || sqlite3PagerIswriteable(pPage->pDbPage) );
6098 
6099   /* Fill in the header. */
6100   nHeader = pPage->childPtrSize;
6101   nPayload = nData + nZero;
6102   if( pPage->intKeyLeaf ){
6103     nHeader += putVarint32(&pCell[nHeader], nPayload);
6104   }else{
6105     assert( nData==0 );
6106     assert( nZero==0 );
6107   }
6108   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
6109 
6110   /* Fill in the payload size */
6111   if( pPage->intKey ){
6112     pSrc = pData;
6113     nSrc = nData;
6114     nData = 0;
6115   }else{
6116     assert( nKey<=0x7fffffff && pKey!=0 );
6117     nPayload = (int)nKey;
6118     pSrc = pKey;
6119     nSrc = (int)nKey;
6120   }
6121   if( nPayload<=pPage->maxLocal ){
6122     n = nHeader + nPayload;
6123     testcase( n==3 );
6124     testcase( n==4 );
6125     if( n<4 ) n = 4;
6126     *pnSize = n;
6127     spaceLeft = nPayload;
6128     pPrior = pCell;
6129   }else{
6130     int mn = pPage->minLocal;
6131     n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
6132     testcase( n==pPage->maxLocal );
6133     testcase( n==pPage->maxLocal+1 );
6134     if( n > pPage->maxLocal ) n = mn;
6135     spaceLeft = n;
6136     *pnSize = n + nHeader + 4;
6137     pPrior = &pCell[nHeader+n];
6138   }
6139   pPayload = &pCell[nHeader];
6140 
6141   /* At this point variables should be set as follows:
6142   **
6143   **   nPayload           Total payload size in bytes
6144   **   pPayload           Begin writing payload here
6145   **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
6146   **                      that means content must spill into overflow pages.
6147   **   *pnSize            Size of the local cell (not counting overflow pages)
6148   **   pPrior             Where to write the pgno of the first overflow page
6149   **
6150   ** Use a call to btreeParseCellPtr() to verify that the values above
6151   ** were computed correctly.
6152   */
6153 #if SQLITE_DEBUG
6154   {
6155     CellInfo info;
6156     pPage->xParseCell(pPage, pCell, &info);
6157     assert( nHeader==(int)(info.pPayload - pCell) );
6158     assert( info.nKey==nKey );
6159     assert( *pnSize == info.nSize );
6160     assert( spaceLeft == info.nLocal );
6161   }
6162 #endif
6163 
6164   /* Write the payload into the local Cell and any extra into overflow pages */
6165   while( nPayload>0 ){
6166     if( spaceLeft==0 ){
6167 #ifndef SQLITE_OMIT_AUTOVACUUM
6168       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
6169       if( pBt->autoVacuum ){
6170         do{
6171           pgnoOvfl++;
6172         } while(
6173           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
6174         );
6175       }
6176 #endif
6177       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
6178 #ifndef SQLITE_OMIT_AUTOVACUUM
6179       /* If the database supports auto-vacuum, and the second or subsequent
6180       ** overflow page is being allocated, add an entry to the pointer-map
6181       ** for that page now.
6182       **
6183       ** If this is the first overflow page, then write a partial entry
6184       ** to the pointer-map. If we write nothing to this pointer-map slot,
6185       ** then the optimistic overflow chain processing in clearCell()
6186       ** may misinterpret the uninitialized values and delete the
6187       ** wrong pages from the database.
6188       */
6189       if( pBt->autoVacuum && rc==SQLITE_OK ){
6190         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
6191         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
6192         if( rc ){
6193           releasePage(pOvfl);
6194         }
6195       }
6196 #endif
6197       if( rc ){
6198         releasePage(pToRelease);
6199         return rc;
6200       }
6201 
6202       /* If pToRelease is not zero than pPrior points into the data area
6203       ** of pToRelease.  Make sure pToRelease is still writeable. */
6204       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6205 
6206       /* If pPrior is part of the data area of pPage, then make sure pPage
6207       ** is still writeable */
6208       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
6209             || sqlite3PagerIswriteable(pPage->pDbPage) );
6210 
6211       put4byte(pPrior, pgnoOvfl);
6212       releasePage(pToRelease);
6213       pToRelease = pOvfl;
6214       pPrior = pOvfl->aData;
6215       put4byte(pPrior, 0);
6216       pPayload = &pOvfl->aData[4];
6217       spaceLeft = pBt->usableSize - 4;
6218     }
6219     n = nPayload;
6220     if( n>spaceLeft ) n = spaceLeft;
6221 
6222     /* If pToRelease is not zero than pPayload points into the data area
6223     ** of pToRelease.  Make sure pToRelease is still writeable. */
6224     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
6225 
6226     /* If pPayload is part of the data area of pPage, then make sure pPage
6227     ** is still writeable */
6228     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
6229             || sqlite3PagerIswriteable(pPage->pDbPage) );
6230 
6231     if( nSrc>0 ){
6232       if( n>nSrc ) n = nSrc;
6233       assert( pSrc );
6234       memcpy(pPayload, pSrc, n);
6235     }else{
6236       memset(pPayload, 0, n);
6237     }
6238     nPayload -= n;
6239     pPayload += n;
6240     pSrc += n;
6241     nSrc -= n;
6242     spaceLeft -= n;
6243     if( nSrc==0 ){
6244       nSrc = nData;
6245       pSrc = pData;
6246     }
6247   }
6248   releasePage(pToRelease);
6249   return SQLITE_OK;
6250 }
6251 
6252 /*
6253 ** Remove the i-th cell from pPage.  This routine effects pPage only.
6254 ** The cell content is not freed or deallocated.  It is assumed that
6255 ** the cell content has been copied someplace else.  This routine just
6256 ** removes the reference to the cell from pPage.
6257 **
6258 ** "sz" must be the number of bytes in the cell.
6259 */
6260 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
6261   u32 pc;         /* Offset to cell content of cell being deleted */
6262   u8 *data;       /* pPage->aData */
6263   u8 *ptr;        /* Used to move bytes around within data[] */
6264   int rc;         /* The return code */
6265   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
6266 
6267   if( *pRC ) return;
6268 
6269   assert( idx>=0 && idx<pPage->nCell );
6270   assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
6271   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6272   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6273   data = pPage->aData;
6274   ptr = &pPage->aCellIdx[2*idx];
6275   pc = get2byte(ptr);
6276   hdr = pPage->hdrOffset;
6277   testcase( pc==get2byte(&data[hdr+5]) );
6278   testcase( pc+sz==pPage->pBt->usableSize );
6279   if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
6280     *pRC = SQLITE_CORRUPT_BKPT;
6281     return;
6282   }
6283   rc = freeSpace(pPage, pc, sz);
6284   if( rc ){
6285     *pRC = rc;
6286     return;
6287   }
6288   pPage->nCell--;
6289   if( pPage->nCell==0 ){
6290     memset(&data[hdr+1], 0, 4);
6291     data[hdr+7] = 0;
6292     put2byte(&data[hdr+5], pPage->pBt->usableSize);
6293     pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
6294                        - pPage->childPtrSize - 8;
6295   }else{
6296     memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
6297     put2byte(&data[hdr+3], pPage->nCell);
6298     pPage->nFree += 2;
6299   }
6300 }
6301 
6302 /*
6303 ** Insert a new cell on pPage at cell index "i".  pCell points to the
6304 ** content of the cell.
6305 **
6306 ** If the cell content will fit on the page, then put it there.  If it
6307 ** will not fit, then make a copy of the cell content into pTemp if
6308 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
6309 ** in pPage->apOvfl[] and make it point to the cell content (either
6310 ** in pTemp or the original pCell) and also record its index.
6311 ** Allocating a new entry in pPage->aCell[] implies that
6312 ** pPage->nOverflow is incremented.
6313 */
6314 static void insertCell(
6315   MemPage *pPage,   /* Page into which we are copying */
6316   int i,            /* New cell becomes the i-th cell of the page */
6317   u8 *pCell,        /* Content of the new cell */
6318   int sz,           /* Bytes of content in pCell */
6319   u8 *pTemp,        /* Temp storage space for pCell, if needed */
6320   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
6321   int *pRC          /* Read and write return code from here */
6322 ){
6323   int idx = 0;      /* Where to write new cell content in data[] */
6324   int j;            /* Loop counter */
6325   u8 *data;         /* The content of the whole page */
6326   u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
6327 
6328   if( *pRC ) return;
6329 
6330   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
6331   assert( MX_CELL(pPage->pBt)<=10921 );
6332   assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
6333   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
6334   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
6335   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6336   /* The cell should normally be sized correctly.  However, when moving a
6337   ** malformed cell from a leaf page to an interior page, if the cell size
6338   ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
6339   ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
6340   ** the term after the || in the following assert(). */
6341   assert( sz==pPage->xCellSize(pPage, pCell) || (sz==8 && iChild>0) );
6342   if( pPage->nOverflow || sz+2>pPage->nFree ){
6343     if( pTemp ){
6344       memcpy(pTemp, pCell, sz);
6345       pCell = pTemp;
6346     }
6347     if( iChild ){
6348       put4byte(pCell, iChild);
6349     }
6350     j = pPage->nOverflow++;
6351     assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) );
6352     pPage->apOvfl[j] = pCell;
6353     pPage->aiOvfl[j] = (u16)i;
6354 
6355     /* When multiple overflows occur, they are always sequential and in
6356     ** sorted order.  This invariants arise because multiple overflows can
6357     ** only occur when inserting divider cells into the parent page during
6358     ** balancing, and the dividers are adjacent and sorted.
6359     */
6360     assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
6361     assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
6362   }else{
6363     int rc = sqlite3PagerWrite(pPage->pDbPage);
6364     if( rc!=SQLITE_OK ){
6365       *pRC = rc;
6366       return;
6367     }
6368     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6369     data = pPage->aData;
6370     assert( &data[pPage->cellOffset]==pPage->aCellIdx );
6371     rc = allocateSpace(pPage, sz, &idx);
6372     if( rc ){ *pRC = rc; return; }
6373     /* The allocateSpace() routine guarantees the following properties
6374     ** if it returns successfully */
6375     assert( idx >= 0 );
6376     assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
6377     assert( idx+sz <= (int)pPage->pBt->usableSize );
6378     pPage->nFree -= (u16)(2 + sz);
6379     memcpy(&data[idx], pCell, sz);
6380     if( iChild ){
6381       put4byte(&data[idx], iChild);
6382     }
6383     pIns = pPage->aCellIdx + i*2;
6384     memmove(pIns+2, pIns, 2*(pPage->nCell - i));
6385     put2byte(pIns, idx);
6386     pPage->nCell++;
6387     /* increment the cell count */
6388     if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
6389     assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell );
6390 #ifndef SQLITE_OMIT_AUTOVACUUM
6391     if( pPage->pBt->autoVacuum ){
6392       /* The cell may contain a pointer to an overflow page. If so, write
6393       ** the entry for the overflow page into the pointer map.
6394       */
6395       ptrmapPutOvflPtr(pPage, pCell, pRC);
6396     }
6397 #endif
6398   }
6399 }
6400 
6401 /*
6402 ** A CellArray object contains a cache of pointers and sizes for a
6403 ** consecutive sequence of cells that might be held multiple pages.
6404 */
6405 typedef struct CellArray CellArray;
6406 struct CellArray {
6407   int nCell;              /* Number of cells in apCell[] */
6408   MemPage *pRef;          /* Reference page */
6409   u8 **apCell;            /* All cells begin balanced */
6410   u16 *szCell;            /* Local size of all cells in apCell[] */
6411 };
6412 
6413 /*
6414 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
6415 ** computed.
6416 */
6417 static void populateCellCache(CellArray *p, int idx, int N){
6418   assert( idx>=0 && idx+N<=p->nCell );
6419   while( N>0 ){
6420     assert( p->apCell[idx]!=0 );
6421     if( p->szCell[idx]==0 ){
6422       p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
6423     }else{
6424       assert( CORRUPT_DB ||
6425               p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
6426     }
6427     idx++;
6428     N--;
6429   }
6430 }
6431 
6432 /*
6433 ** Return the size of the Nth element of the cell array
6434 */
6435 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
6436   assert( N>=0 && N<p->nCell );
6437   assert( p->szCell[N]==0 );
6438   p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
6439   return p->szCell[N];
6440 }
6441 static u16 cachedCellSize(CellArray *p, int N){
6442   assert( N>=0 && N<p->nCell );
6443   if( p->szCell[N] ) return p->szCell[N];
6444   return computeCellSize(p, N);
6445 }
6446 
6447 /*
6448 ** Array apCell[] contains pointers to nCell b-tree page cells. The
6449 ** szCell[] array contains the size in bytes of each cell. This function
6450 ** replaces the current contents of page pPg with the contents of the cell
6451 ** array.
6452 **
6453 ** Some of the cells in apCell[] may currently be stored in pPg. This
6454 ** function works around problems caused by this by making a copy of any
6455 ** such cells before overwriting the page data.
6456 **
6457 ** The MemPage.nFree field is invalidated by this function. It is the
6458 ** responsibility of the caller to set it correctly.
6459 */
6460 static int rebuildPage(
6461   MemPage *pPg,                   /* Edit this page */
6462   int nCell,                      /* Final number of cells on page */
6463   u8 **apCell,                    /* Array of cells */
6464   u16 *szCell                     /* Array of cell sizes */
6465 ){
6466   const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
6467   u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
6468   const int usableSize = pPg->pBt->usableSize;
6469   u8 * const pEnd = &aData[usableSize];
6470   int i;
6471   u8 *pCellptr = pPg->aCellIdx;
6472   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6473   u8 *pData;
6474 
6475   i = get2byte(&aData[hdr+5]);
6476   memcpy(&pTmp[i], &aData[i], usableSize - i);
6477 
6478   pData = pEnd;
6479   for(i=0; i<nCell; i++){
6480     u8 *pCell = apCell[i];
6481     if( SQLITE_WITHIN(pCell,aData,pEnd) ){
6482       pCell = &pTmp[pCell - aData];
6483     }
6484     pData -= szCell[i];
6485     put2byte(pCellptr, (pData - aData));
6486     pCellptr += 2;
6487     if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
6488     memcpy(pData, pCell, szCell[i]);
6489     assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
6490     testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) );
6491   }
6492 
6493   /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
6494   pPg->nCell = nCell;
6495   pPg->nOverflow = 0;
6496 
6497   put2byte(&aData[hdr+1], 0);
6498   put2byte(&aData[hdr+3], pPg->nCell);
6499   put2byte(&aData[hdr+5], pData - aData);
6500   aData[hdr+7] = 0x00;
6501   return SQLITE_OK;
6502 }
6503 
6504 /*
6505 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
6506 ** contains the size in bytes of each such cell. This function attempts to
6507 ** add the cells stored in the array to page pPg. If it cannot (because
6508 ** the page needs to be defragmented before the cells will fit), non-zero
6509 ** is returned. Otherwise, if the cells are added successfully, zero is
6510 ** returned.
6511 **
6512 ** Argument pCellptr points to the first entry in the cell-pointer array
6513 ** (part of page pPg) to populate. After cell apCell[0] is written to the
6514 ** page body, a 16-bit offset is written to pCellptr. And so on, for each
6515 ** cell in the array. It is the responsibility of the caller to ensure
6516 ** that it is safe to overwrite this part of the cell-pointer array.
6517 **
6518 ** When this function is called, *ppData points to the start of the
6519 ** content area on page pPg. If the size of the content area is extended,
6520 ** *ppData is updated to point to the new start of the content area
6521 ** before returning.
6522 **
6523 ** Finally, argument pBegin points to the byte immediately following the
6524 ** end of the space required by this page for the cell-pointer area (for
6525 ** all cells - not just those inserted by the current call). If the content
6526 ** area must be extended to before this point in order to accomodate all
6527 ** cells in apCell[], then the cells do not fit and non-zero is returned.
6528 */
6529 static int pageInsertArray(
6530   MemPage *pPg,                   /* Page to add cells to */
6531   u8 *pBegin,                     /* End of cell-pointer array */
6532   u8 **ppData,                    /* IN/OUT: Page content -area pointer */
6533   u8 *pCellptr,                   /* Pointer to cell-pointer area */
6534   int iFirst,                     /* Index of first cell to add */
6535   int nCell,                      /* Number of cells to add to pPg */
6536   CellArray *pCArray              /* Array of cells */
6537 ){
6538   int i;
6539   u8 *aData = pPg->aData;
6540   u8 *pData = *ppData;
6541   int iEnd = iFirst + nCell;
6542   assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
6543   for(i=iFirst; i<iEnd; i++){
6544     int sz, rc;
6545     u8 *pSlot;
6546     sz = cachedCellSize(pCArray, i);
6547     if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
6548       if( (pData - pBegin)<sz ) return 1;
6549       pData -= sz;
6550       pSlot = pData;
6551     }
6552     /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
6553     ** database.  But they might for a corrupt database.  Hence use memmove()
6554     ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
6555     assert( (pSlot+sz)<=pCArray->apCell[i]
6556          || pSlot>=(pCArray->apCell[i]+sz)
6557          || CORRUPT_DB );
6558     memmove(pSlot, pCArray->apCell[i], sz);
6559     put2byte(pCellptr, (pSlot - aData));
6560     pCellptr += 2;
6561   }
6562   *ppData = pData;
6563   return 0;
6564 }
6565 
6566 /*
6567 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
6568 ** contains the size in bytes of each such cell. This function adds the
6569 ** space associated with each cell in the array that is currently stored
6570 ** within the body of pPg to the pPg free-list. The cell-pointers and other
6571 ** fields of the page are not updated.
6572 **
6573 ** This function returns the total number of cells added to the free-list.
6574 */
6575 static int pageFreeArray(
6576   MemPage *pPg,                   /* Page to edit */
6577   int iFirst,                     /* First cell to delete */
6578   int nCell,                      /* Cells to delete */
6579   CellArray *pCArray              /* Array of cells */
6580 ){
6581   u8 * const aData = pPg->aData;
6582   u8 * const pEnd = &aData[pPg->pBt->usableSize];
6583   u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
6584   int nRet = 0;
6585   int i;
6586   int iEnd = iFirst + nCell;
6587   u8 *pFree = 0;
6588   int szFree = 0;
6589 
6590   for(i=iFirst; i<iEnd; i++){
6591     u8 *pCell = pCArray->apCell[i];
6592     if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
6593       int sz;
6594       /* No need to use cachedCellSize() here.  The sizes of all cells that
6595       ** are to be freed have already been computing while deciding which
6596       ** cells need freeing */
6597       sz = pCArray->szCell[i];  assert( sz>0 );
6598       if( pFree!=(pCell + sz) ){
6599         if( pFree ){
6600           assert( pFree>aData && (pFree - aData)<65536 );
6601           freeSpace(pPg, (u16)(pFree - aData), szFree);
6602         }
6603         pFree = pCell;
6604         szFree = sz;
6605         if( pFree+sz>pEnd ) return 0;
6606       }else{
6607         pFree = pCell;
6608         szFree += sz;
6609       }
6610       nRet++;
6611     }
6612   }
6613   if( pFree ){
6614     assert( pFree>aData && (pFree - aData)<65536 );
6615     freeSpace(pPg, (u16)(pFree - aData), szFree);
6616   }
6617   return nRet;
6618 }
6619 
6620 /*
6621 ** apCell[] and szCell[] contains pointers to and sizes of all cells in the
6622 ** pages being balanced.  The current page, pPg, has pPg->nCell cells starting
6623 ** with apCell[iOld].  After balancing, this page should hold nNew cells
6624 ** starting at apCell[iNew].
6625 **
6626 ** This routine makes the necessary adjustments to pPg so that it contains
6627 ** the correct cells after being balanced.
6628 **
6629 ** The pPg->nFree field is invalid when this function returns. It is the
6630 ** responsibility of the caller to set it correctly.
6631 */
6632 static int editPage(
6633   MemPage *pPg,                   /* Edit this page */
6634   int iOld,                       /* Index of first cell currently on page */
6635   int iNew,                       /* Index of new first cell on page */
6636   int nNew,                       /* Final number of cells on page */
6637   CellArray *pCArray              /* Array of cells and sizes */
6638 ){
6639   u8 * const aData = pPg->aData;
6640   const int hdr = pPg->hdrOffset;
6641   u8 *pBegin = &pPg->aCellIdx[nNew * 2];
6642   int nCell = pPg->nCell;       /* Cells stored on pPg */
6643   u8 *pData;
6644   u8 *pCellptr;
6645   int i;
6646   int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
6647   int iNewEnd = iNew + nNew;
6648 
6649 #ifdef SQLITE_DEBUG
6650   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6651   memcpy(pTmp, aData, pPg->pBt->usableSize);
6652 #endif
6653 
6654   /* Remove cells from the start and end of the page */
6655   if( iOld<iNew ){
6656     int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
6657     memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
6658     nCell -= nShift;
6659   }
6660   if( iNewEnd < iOldEnd ){
6661     nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
6662   }
6663 
6664   pData = &aData[get2byteNotZero(&aData[hdr+5])];
6665   if( pData<pBegin ) goto editpage_fail;
6666 
6667   /* Add cells to the start of the page */
6668   if( iNew<iOld ){
6669     int nAdd = MIN(nNew,iOld-iNew);
6670     assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
6671     pCellptr = pPg->aCellIdx;
6672     memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
6673     if( pageInsertArray(
6674           pPg, pBegin, &pData, pCellptr,
6675           iNew, nAdd, pCArray
6676     ) ) goto editpage_fail;
6677     nCell += nAdd;
6678   }
6679 
6680   /* Add any overflow cells */
6681   for(i=0; i<pPg->nOverflow; i++){
6682     int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
6683     if( iCell>=0 && iCell<nNew ){
6684       pCellptr = &pPg->aCellIdx[iCell * 2];
6685       memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
6686       nCell++;
6687       if( pageInsertArray(
6688             pPg, pBegin, &pData, pCellptr,
6689             iCell+iNew, 1, pCArray
6690       ) ) goto editpage_fail;
6691     }
6692   }
6693 
6694   /* Append cells to the end of the page */
6695   pCellptr = &pPg->aCellIdx[nCell*2];
6696   if( pageInsertArray(
6697         pPg, pBegin, &pData, pCellptr,
6698         iNew+nCell, nNew-nCell, pCArray
6699   ) ) goto editpage_fail;
6700 
6701   pPg->nCell = nNew;
6702   pPg->nOverflow = 0;
6703 
6704   put2byte(&aData[hdr+3], pPg->nCell);
6705   put2byte(&aData[hdr+5], pData - aData);
6706 
6707 #ifdef SQLITE_DEBUG
6708   for(i=0; i<nNew && !CORRUPT_DB; i++){
6709     u8 *pCell = pCArray->apCell[i+iNew];
6710     int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
6711     if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
6712       pCell = &pTmp[pCell - aData];
6713     }
6714     assert( 0==memcmp(pCell, &aData[iOff],
6715             pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
6716   }
6717 #endif
6718 
6719   return SQLITE_OK;
6720  editpage_fail:
6721   /* Unable to edit this page. Rebuild it from scratch instead. */
6722   populateCellCache(pCArray, iNew, nNew);
6723   return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]);
6724 }
6725 
6726 /*
6727 ** The following parameters determine how many adjacent pages get involved
6728 ** in a balancing operation.  NN is the number of neighbors on either side
6729 ** of the page that participate in the balancing operation.  NB is the
6730 ** total number of pages that participate, including the target page and
6731 ** NN neighbors on either side.
6732 **
6733 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
6734 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
6735 ** in exchange for a larger degradation in INSERT and UPDATE performance.
6736 ** The value of NN appears to give the best results overall.
6737 */
6738 #define NN 1             /* Number of neighbors on either side of pPage */
6739 #define NB (NN*2+1)      /* Total pages involved in the balance */
6740 
6741 
6742 #ifndef SQLITE_OMIT_QUICKBALANCE
6743 /*
6744 ** This version of balance() handles the common special case where
6745 ** a new entry is being inserted on the extreme right-end of the
6746 ** tree, in other words, when the new entry will become the largest
6747 ** entry in the tree.
6748 **
6749 ** Instead of trying to balance the 3 right-most leaf pages, just add
6750 ** a new page to the right-hand side and put the one new entry in
6751 ** that page.  This leaves the right side of the tree somewhat
6752 ** unbalanced.  But odds are that we will be inserting new entries
6753 ** at the end soon afterwards so the nearly empty page will quickly
6754 ** fill up.  On average.
6755 **
6756 ** pPage is the leaf page which is the right-most page in the tree.
6757 ** pParent is its parent.  pPage must have a single overflow entry
6758 ** which is also the right-most entry on the page.
6759 **
6760 ** The pSpace buffer is used to store a temporary copy of the divider
6761 ** cell that will be inserted into pParent. Such a cell consists of a 4
6762 ** byte page number followed by a variable length integer. In other
6763 ** words, at most 13 bytes. Hence the pSpace buffer must be at
6764 ** least 13 bytes in size.
6765 */
6766 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
6767   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
6768   MemPage *pNew;                       /* Newly allocated page */
6769   int rc;                              /* Return Code */
6770   Pgno pgnoNew;                        /* Page number of pNew */
6771 
6772   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6773   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6774   assert( pPage->nOverflow==1 );
6775 
6776   /* This error condition is now caught prior to reaching this function */
6777   if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;
6778 
6779   /* Allocate a new page. This page will become the right-sibling of
6780   ** pPage. Make the parent page writable, so that the new divider cell
6781   ** may be inserted. If both these operations are successful, proceed.
6782   */
6783   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
6784 
6785   if( rc==SQLITE_OK ){
6786 
6787     u8 *pOut = &pSpace[4];
6788     u8 *pCell = pPage->apOvfl[0];
6789     u16 szCell = pPage->xCellSize(pPage, pCell);
6790     u8 *pStop;
6791 
6792     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
6793     assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
6794     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
6795     rc = rebuildPage(pNew, 1, &pCell, &szCell);
6796     if( NEVER(rc) ) return rc;
6797     pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
6798 
6799     /* If this is an auto-vacuum database, update the pointer map
6800     ** with entries for the new page, and any pointer from the
6801     ** cell on the page to an overflow page. If either of these
6802     ** operations fails, the return code is set, but the contents
6803     ** of the parent page are still manipulated by thh code below.
6804     ** That is Ok, at this point the parent page is guaranteed to
6805     ** be marked as dirty. Returning an error code will cause a
6806     ** rollback, undoing any changes made to the parent page.
6807     */
6808     if( ISAUTOVACUUM ){
6809       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
6810       if( szCell>pNew->minLocal ){
6811         ptrmapPutOvflPtr(pNew, pCell, &rc);
6812       }
6813     }
6814 
6815     /* Create a divider cell to insert into pParent. The divider cell
6816     ** consists of a 4-byte page number (the page number of pPage) and
6817     ** a variable length key value (which must be the same value as the
6818     ** largest key on pPage).
6819     **
6820     ** To find the largest key value on pPage, first find the right-most
6821     ** cell on pPage. The first two fields of this cell are the
6822     ** record-length (a variable length integer at most 32-bits in size)
6823     ** and the key value (a variable length integer, may have any value).
6824     ** The first of the while(...) loops below skips over the record-length
6825     ** field. The second while(...) loop copies the key value from the
6826     ** cell on pPage into the pSpace buffer.
6827     */
6828     pCell = findCell(pPage, pPage->nCell-1);
6829     pStop = &pCell[9];
6830     while( (*(pCell++)&0x80) && pCell<pStop );
6831     pStop = &pCell[9];
6832     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
6833 
6834     /* Insert the new divider cell into pParent. */
6835     insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
6836                0, pPage->pgno, &rc);
6837 
6838     /* Set the right-child pointer of pParent to point to the new page. */
6839     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
6840 
6841     /* Release the reference to the new page. */
6842     releasePage(pNew);
6843   }
6844 
6845   return rc;
6846 }
6847 #endif /* SQLITE_OMIT_QUICKBALANCE */
6848 
6849 #if 0
6850 /*
6851 ** This function does not contribute anything to the operation of SQLite.
6852 ** it is sometimes activated temporarily while debugging code responsible
6853 ** for setting pointer-map entries.
6854 */
6855 static int ptrmapCheckPages(MemPage **apPage, int nPage){
6856   int i, j;
6857   for(i=0; i<nPage; i++){
6858     Pgno n;
6859     u8 e;
6860     MemPage *pPage = apPage[i];
6861     BtShared *pBt = pPage->pBt;
6862     assert( pPage->isInit );
6863 
6864     for(j=0; j<pPage->nCell; j++){
6865       CellInfo info;
6866       u8 *z;
6867 
6868       z = findCell(pPage, j);
6869       pPage->xParseCell(pPage, z, &info);
6870       if( info.nLocal<info.nPayload ){
6871         Pgno ovfl = get4byte(&z[info.nSize-4]);
6872         ptrmapGet(pBt, ovfl, &e, &n);
6873         assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
6874       }
6875       if( !pPage->leaf ){
6876         Pgno child = get4byte(z);
6877         ptrmapGet(pBt, child, &e, &n);
6878         assert( n==pPage->pgno && e==PTRMAP_BTREE );
6879       }
6880     }
6881     if( !pPage->leaf ){
6882       Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
6883       ptrmapGet(pBt, child, &e, &n);
6884       assert( n==pPage->pgno && e==PTRMAP_BTREE );
6885     }
6886   }
6887   return 1;
6888 }
6889 #endif
6890 
6891 /*
6892 ** This function is used to copy the contents of the b-tree node stored
6893 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
6894 ** the pointer-map entries for each child page are updated so that the
6895 ** parent page stored in the pointer map is page pTo. If pFrom contained
6896 ** any cells with overflow page pointers, then the corresponding pointer
6897 ** map entries are also updated so that the parent page is page pTo.
6898 **
6899 ** If pFrom is currently carrying any overflow cells (entries in the
6900 ** MemPage.apOvfl[] array), they are not copied to pTo.
6901 **
6902 ** Before returning, page pTo is reinitialized using btreeInitPage().
6903 **
6904 ** The performance of this function is not critical. It is only used by
6905 ** the balance_shallower() and balance_deeper() procedures, neither of
6906 ** which are called often under normal circumstances.
6907 */
6908 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
6909   if( (*pRC)==SQLITE_OK ){
6910     BtShared * const pBt = pFrom->pBt;
6911     u8 * const aFrom = pFrom->aData;
6912     u8 * const aTo = pTo->aData;
6913     int const iFromHdr = pFrom->hdrOffset;
6914     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
6915     int rc;
6916     int iData;
6917 
6918 
6919     assert( pFrom->isInit );
6920     assert( pFrom->nFree>=iToHdr );
6921     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
6922 
6923     /* Copy the b-tree node content from page pFrom to page pTo. */
6924     iData = get2byte(&aFrom[iFromHdr+5]);
6925     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
6926     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
6927 
6928     /* Reinitialize page pTo so that the contents of the MemPage structure
6929     ** match the new data. The initialization of pTo can actually fail under
6930     ** fairly obscure circumstances, even though it is a copy of initialized
6931     ** page pFrom.
6932     */
6933     pTo->isInit = 0;
6934     rc = btreeInitPage(pTo);
6935     if( rc!=SQLITE_OK ){
6936       *pRC = rc;
6937       return;
6938     }
6939 
6940     /* If this is an auto-vacuum database, update the pointer-map entries
6941     ** for any b-tree or overflow pages that pTo now contains the pointers to.
6942     */
6943     if( ISAUTOVACUUM ){
6944       *pRC = setChildPtrmaps(pTo);
6945     }
6946   }
6947 }
6948 
6949 /*
6950 ** This routine redistributes cells on the iParentIdx'th child of pParent
6951 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
6952 ** same amount of free space. Usually a single sibling on either side of the
6953 ** page is used in the balancing, though both siblings might come from one
6954 ** side if the page is the first or last child of its parent. If the page
6955 ** has fewer than 2 siblings (something which can only happen if the page
6956 ** is a root page or a child of a root page) then all available siblings
6957 ** participate in the balancing.
6958 **
6959 ** The number of siblings of the page might be increased or decreased by
6960 ** one or two in an effort to keep pages nearly full but not over full.
6961 **
6962 ** Note that when this routine is called, some of the cells on the page
6963 ** might not actually be stored in MemPage.aData[]. This can happen
6964 ** if the page is overfull. This routine ensures that all cells allocated
6965 ** to the page and its siblings fit into MemPage.aData[] before returning.
6966 **
6967 ** In the course of balancing the page and its siblings, cells may be
6968 ** inserted into or removed from the parent page (pParent). Doing so
6969 ** may cause the parent page to become overfull or underfull. If this
6970 ** happens, it is the responsibility of the caller to invoke the correct
6971 ** balancing routine to fix this problem (see the balance() routine).
6972 **
6973 ** If this routine fails for any reason, it might leave the database
6974 ** in a corrupted state. So if this routine fails, the database should
6975 ** be rolled back.
6976 **
6977 ** The third argument to this function, aOvflSpace, is a pointer to a
6978 ** buffer big enough to hold one page. If while inserting cells into the parent
6979 ** page (pParent) the parent page becomes overfull, this buffer is
6980 ** used to store the parent's overflow cells. Because this function inserts
6981 ** a maximum of four divider cells into the parent page, and the maximum
6982 ** size of a cell stored within an internal node is always less than 1/4
6983 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
6984 ** enough for all overflow cells.
6985 **
6986 ** If aOvflSpace is set to a null pointer, this function returns
6987 ** SQLITE_NOMEM.
6988 */
6989 static int balance_nonroot(
6990   MemPage *pParent,               /* Parent page of siblings being balanced */
6991   int iParentIdx,                 /* Index of "the page" in pParent */
6992   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
6993   int isRoot,                     /* True if pParent is a root-page */
6994   int bBulk                       /* True if this call is part of a bulk load */
6995 ){
6996   BtShared *pBt;               /* The whole database */
6997   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
6998   int nNew = 0;                /* Number of pages in apNew[] */
6999   int nOld;                    /* Number of pages in apOld[] */
7000   int i, j, k;                 /* Loop counters */
7001   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
7002   int rc = SQLITE_OK;          /* The return code */
7003   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
7004   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
7005   int usableSpace;             /* Bytes in pPage beyond the header */
7006   int pageFlags;               /* Value of pPage->aData[0] */
7007   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
7008   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
7009   int szScratch;               /* Size of scratch memory requested */
7010   MemPage *apOld[NB];          /* pPage and up to two siblings */
7011   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
7012   u8 *pRight;                  /* Location in parent of right-sibling pointer */
7013   u8 *apDiv[NB-1];             /* Divider cells in pParent */
7014   int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
7015   int cntOld[NB+2];            /* Old index in b.apCell[] */
7016   int szNew[NB+2];             /* Combined size of cells placed on i-th page */
7017   u8 *aSpace1;                 /* Space for copies of dividers cells */
7018   Pgno pgno;                   /* Temp var to store a page number in */
7019   u8 abDone[NB+2];             /* True after i'th new page is populated */
7020   Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
7021   Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
7022   u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
7023   CellArray b;                  /* Parsed information on cells being balanced */
7024 
7025   memset(abDone, 0, sizeof(abDone));
7026   b.nCell = 0;
7027   b.apCell = 0;
7028   pBt = pParent->pBt;
7029   assert( sqlite3_mutex_held(pBt->mutex) );
7030   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7031 
7032 #if 0
7033   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
7034 #endif
7035 
7036   /* At this point pParent may have at most one overflow cell. And if
7037   ** this overflow cell is present, it must be the cell with
7038   ** index iParentIdx. This scenario comes about when this function
7039   ** is called (indirectly) from sqlite3BtreeDelete().
7040   */
7041   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
7042   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
7043 
7044   if( !aOvflSpace ){
7045     return SQLITE_NOMEM_BKPT;
7046   }
7047 
7048   /* Find the sibling pages to balance. Also locate the cells in pParent
7049   ** that divide the siblings. An attempt is made to find NN siblings on
7050   ** either side of pPage. More siblings are taken from one side, however,
7051   ** if there are fewer than NN siblings on the other side. If pParent
7052   ** has NB or fewer children then all children of pParent are taken.
7053   **
7054   ** This loop also drops the divider cells from the parent page. This
7055   ** way, the remainder of the function does not have to deal with any
7056   ** overflow cells in the parent page, since if any existed they will
7057   ** have already been removed.
7058   */
7059   i = pParent->nOverflow + pParent->nCell;
7060   if( i<2 ){
7061     nxDiv = 0;
7062   }else{
7063     assert( bBulk==0 || bBulk==1 );
7064     if( iParentIdx==0 ){
7065       nxDiv = 0;
7066     }else if( iParentIdx==i ){
7067       nxDiv = i-2+bBulk;
7068     }else{
7069       nxDiv = iParentIdx-1;
7070     }
7071     i = 2-bBulk;
7072   }
7073   nOld = i+1;
7074   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
7075     pRight = &pParent->aData[pParent->hdrOffset+8];
7076   }else{
7077     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
7078   }
7079   pgno = get4byte(pRight);
7080   while( 1 ){
7081     rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
7082     if( rc ){
7083       memset(apOld, 0, (i+1)*sizeof(MemPage*));
7084       goto balance_cleanup;
7085     }
7086     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
7087     if( (i--)==0 ) break;
7088 
7089     if( i+nxDiv==pParent->aiOvfl[0] && pParent->nOverflow ){
7090       apDiv[i] = pParent->apOvfl[0];
7091       pgno = get4byte(apDiv[i]);
7092       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7093       pParent->nOverflow = 0;
7094     }else{
7095       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
7096       pgno = get4byte(apDiv[i]);
7097       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7098 
7099       /* Drop the cell from the parent page. apDiv[i] still points to
7100       ** the cell within the parent, even though it has been dropped.
7101       ** This is safe because dropping a cell only overwrites the first
7102       ** four bytes of it, and this function does not need the first
7103       ** four bytes of the divider cell. So the pointer is safe to use
7104       ** later on.
7105       **
7106       ** But not if we are in secure-delete mode. In secure-delete mode,
7107       ** the dropCell() routine will overwrite the entire cell with zeroes.
7108       ** In this case, temporarily copy the cell into the aOvflSpace[]
7109       ** buffer. It will be copied out again as soon as the aSpace[] buffer
7110       ** is allocated.  */
7111       if( pBt->btsFlags & BTS_SECURE_DELETE ){
7112         int iOff;
7113 
7114         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
7115         if( (iOff+szNew[i])>(int)pBt->usableSize ){
7116           rc = SQLITE_CORRUPT_BKPT;
7117           memset(apOld, 0, (i+1)*sizeof(MemPage*));
7118           goto balance_cleanup;
7119         }else{
7120           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
7121           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
7122         }
7123       }
7124       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
7125     }
7126   }
7127 
7128   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
7129   ** alignment */
7130   nMaxCells = (nMaxCells + 3)&~3;
7131 
7132   /*
7133   ** Allocate space for memory structures
7134   */
7135   szScratch =
7136        nMaxCells*sizeof(u8*)                       /* b.apCell */
7137      + nMaxCells*sizeof(u16)                       /* b.szCell */
7138      + pBt->pageSize;                              /* aSpace1 */
7139 
7140   /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer
7141   ** that is more than 6 times the database page size. */
7142   assert( szScratch<=6*(int)pBt->pageSize );
7143   b.apCell = sqlite3ScratchMalloc( szScratch );
7144   if( b.apCell==0 ){
7145     rc = SQLITE_NOMEM_BKPT;
7146     goto balance_cleanup;
7147   }
7148   b.szCell = (u16*)&b.apCell[nMaxCells];
7149   aSpace1 = (u8*)&b.szCell[nMaxCells];
7150   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
7151 
7152   /*
7153   ** Load pointers to all cells on sibling pages and the divider cells
7154   ** into the local b.apCell[] array.  Make copies of the divider cells
7155   ** into space obtained from aSpace1[]. The divider cells have already
7156   ** been removed from pParent.
7157   **
7158   ** If the siblings are on leaf pages, then the child pointers of the
7159   ** divider cells are stripped from the cells before they are copied
7160   ** into aSpace1[].  In this way, all cells in b.apCell[] are without
7161   ** child pointers.  If siblings are not leaves, then all cell in
7162   ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
7163   ** are alike.
7164   **
7165   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
7166   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
7167   */
7168   b.pRef = apOld[0];
7169   leafCorrection = b.pRef->leaf*4;
7170   leafData = b.pRef->intKeyLeaf;
7171   for(i=0; i<nOld; i++){
7172     MemPage *pOld = apOld[i];
7173     int limit = pOld->nCell;
7174     u8 *aData = pOld->aData;
7175     u16 maskPage = pOld->maskPage;
7176     u8 *piCell = aData + pOld->cellOffset;
7177     u8 *piEnd;
7178 
7179     /* Verify that all sibling pages are of the same "type" (table-leaf,
7180     ** table-interior, index-leaf, or index-interior).
7181     */
7182     if( pOld->aData[0]!=apOld[0]->aData[0] ){
7183       rc = SQLITE_CORRUPT_BKPT;
7184       goto balance_cleanup;
7185     }
7186 
7187     /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
7188     ** constains overflow cells, include them in the b.apCell[] array
7189     ** in the correct spot.
7190     **
7191     ** Note that when there are multiple overflow cells, it is always the
7192     ** case that they are sequential and adjacent.  This invariant arises
7193     ** because multiple overflows can only occurs when inserting divider
7194     ** cells into a parent on a prior balance, and divider cells are always
7195     ** adjacent and are inserted in order.  There is an assert() tagged
7196     ** with "NOTE 1" in the overflow cell insertion loop to prove this
7197     ** invariant.
7198     **
7199     ** This must be done in advance.  Once the balance starts, the cell
7200     ** offset section of the btree page will be overwritten and we will no
7201     ** long be able to find the cells if a pointer to each cell is not saved
7202     ** first.
7203     */
7204     memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
7205     if( pOld->nOverflow>0 ){
7206       limit = pOld->aiOvfl[0];
7207       for(j=0; j<limit; j++){
7208         b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7209         piCell += 2;
7210         b.nCell++;
7211       }
7212       for(k=0; k<pOld->nOverflow; k++){
7213         assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
7214         b.apCell[b.nCell] = pOld->apOvfl[k];
7215         b.nCell++;
7216       }
7217     }
7218     piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
7219     while( piCell<piEnd ){
7220       assert( b.nCell<nMaxCells );
7221       b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7222       piCell += 2;
7223       b.nCell++;
7224     }
7225 
7226     cntOld[i] = b.nCell;
7227     if( i<nOld-1 && !leafData){
7228       u16 sz = (u16)szNew[i];
7229       u8 *pTemp;
7230       assert( b.nCell<nMaxCells );
7231       b.szCell[b.nCell] = sz;
7232       pTemp = &aSpace1[iSpace1];
7233       iSpace1 += sz;
7234       assert( sz<=pBt->maxLocal+23 );
7235       assert( iSpace1 <= (int)pBt->pageSize );
7236       memcpy(pTemp, apDiv[i], sz);
7237       b.apCell[b.nCell] = pTemp+leafCorrection;
7238       assert( leafCorrection==0 || leafCorrection==4 );
7239       b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
7240       if( !pOld->leaf ){
7241         assert( leafCorrection==0 );
7242         assert( pOld->hdrOffset==0 );
7243         /* The right pointer of the child page pOld becomes the left
7244         ** pointer of the divider cell */
7245         memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
7246       }else{
7247         assert( leafCorrection==4 );
7248         while( b.szCell[b.nCell]<4 ){
7249           /* Do not allow any cells smaller than 4 bytes. If a smaller cell
7250           ** does exist, pad it with 0x00 bytes. */
7251           assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
7252           assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
7253           aSpace1[iSpace1++] = 0x00;
7254           b.szCell[b.nCell]++;
7255         }
7256       }
7257       b.nCell++;
7258     }
7259   }
7260 
7261   /*
7262   ** Figure out the number of pages needed to hold all b.nCell cells.
7263   ** Store this number in "k".  Also compute szNew[] which is the total
7264   ** size of all cells on the i-th page and cntNew[] which is the index
7265   ** in b.apCell[] of the cell that divides page i from page i+1.
7266   ** cntNew[k] should equal b.nCell.
7267   **
7268   ** Values computed by this block:
7269   **
7270   **           k: The total number of sibling pages
7271   **    szNew[i]: Spaced used on the i-th sibling page.
7272   **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
7273   **              the right of the i-th sibling page.
7274   ** usableSpace: Number of bytes of space available on each sibling.
7275   **
7276   */
7277   usableSpace = pBt->usableSize - 12 + leafCorrection;
7278   for(i=0; i<nOld; i++){
7279     MemPage *p = apOld[i];
7280     szNew[i] = usableSpace - p->nFree;
7281     if( szNew[i]<0 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
7282     for(j=0; j<p->nOverflow; j++){
7283       szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
7284     }
7285     cntNew[i] = cntOld[i];
7286   }
7287   k = nOld;
7288   for(i=0; i<k; i++){
7289     int sz;
7290     while( szNew[i]>usableSpace ){
7291       if( i+1>=k ){
7292         k = i+2;
7293         if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
7294         szNew[k-1] = 0;
7295         cntNew[k-1] = b.nCell;
7296       }
7297       sz = 2 + cachedCellSize(&b, cntNew[i]-1);
7298       szNew[i] -= sz;
7299       if( !leafData ){
7300         if( cntNew[i]<b.nCell ){
7301           sz = 2 + cachedCellSize(&b, cntNew[i]);
7302         }else{
7303           sz = 0;
7304         }
7305       }
7306       szNew[i+1] += sz;
7307       cntNew[i]--;
7308     }
7309     while( cntNew[i]<b.nCell ){
7310       sz = 2 + cachedCellSize(&b, cntNew[i]);
7311       if( szNew[i]+sz>usableSpace ) break;
7312       szNew[i] += sz;
7313       cntNew[i]++;
7314       if( !leafData ){
7315         if( cntNew[i]<b.nCell ){
7316           sz = 2 + cachedCellSize(&b, cntNew[i]);
7317         }else{
7318           sz = 0;
7319         }
7320       }
7321       szNew[i+1] -= sz;
7322     }
7323     if( cntNew[i]>=b.nCell ){
7324       k = i+1;
7325     }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
7326       rc = SQLITE_CORRUPT_BKPT;
7327       goto balance_cleanup;
7328     }
7329   }
7330 
7331   /*
7332   ** The packing computed by the previous block is biased toward the siblings
7333   ** on the left side (siblings with smaller keys). The left siblings are
7334   ** always nearly full, while the right-most sibling might be nearly empty.
7335   ** The next block of code attempts to adjust the packing of siblings to
7336   ** get a better balance.
7337   **
7338   ** This adjustment is more than an optimization.  The packing above might
7339   ** be so out of balance as to be illegal.  For example, the right-most
7340   ** sibling might be completely empty.  This adjustment is not optional.
7341   */
7342   for(i=k-1; i>0; i--){
7343     int szRight = szNew[i];  /* Size of sibling on the right */
7344     int szLeft = szNew[i-1]; /* Size of sibling on the left */
7345     int r;              /* Index of right-most cell in left sibling */
7346     int d;              /* Index of first cell to the left of right sibling */
7347 
7348     r = cntNew[i-1] - 1;
7349     d = r + 1 - leafData;
7350     (void)cachedCellSize(&b, d);
7351     do{
7352       assert( d<nMaxCells );
7353       assert( r<nMaxCells );
7354       (void)cachedCellSize(&b, r);
7355       if( szRight!=0
7356        && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+2)) ){
7357         break;
7358       }
7359       szRight += b.szCell[d] + 2;
7360       szLeft -= b.szCell[r] + 2;
7361       cntNew[i-1] = r;
7362       r--;
7363       d--;
7364     }while( r>=0 );
7365     szNew[i] = szRight;
7366     szNew[i-1] = szLeft;
7367     if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
7368       rc = SQLITE_CORRUPT_BKPT;
7369       goto balance_cleanup;
7370     }
7371   }
7372 
7373   /* Sanity check:  For a non-corrupt database file one of the follwing
7374   ** must be true:
7375   **    (1) We found one or more cells (cntNew[0])>0), or
7376   **    (2) pPage is a virtual root page.  A virtual root page is when
7377   **        the real root page is page 1 and we are the only child of
7378   **        that page.
7379   */
7380   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
7381   TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
7382     apOld[0]->pgno, apOld[0]->nCell,
7383     nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
7384     nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
7385   ));
7386 
7387   /*
7388   ** Allocate k new pages.  Reuse old pages where possible.
7389   */
7390   pageFlags = apOld[0]->aData[0];
7391   for(i=0; i<k; i++){
7392     MemPage *pNew;
7393     if( i<nOld ){
7394       pNew = apNew[i] = apOld[i];
7395       apOld[i] = 0;
7396       rc = sqlite3PagerWrite(pNew->pDbPage);
7397       nNew++;
7398       if( rc ) goto balance_cleanup;
7399     }else{
7400       assert( i>0 );
7401       rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
7402       if( rc ) goto balance_cleanup;
7403       zeroPage(pNew, pageFlags);
7404       apNew[i] = pNew;
7405       nNew++;
7406       cntOld[i] = b.nCell;
7407 
7408       /* Set the pointer-map entry for the new sibling page. */
7409       if( ISAUTOVACUUM ){
7410         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
7411         if( rc!=SQLITE_OK ){
7412           goto balance_cleanup;
7413         }
7414       }
7415     }
7416   }
7417 
7418   /*
7419   ** Reassign page numbers so that the new pages are in ascending order.
7420   ** This helps to keep entries in the disk file in order so that a scan
7421   ** of the table is closer to a linear scan through the file. That in turn
7422   ** helps the operating system to deliver pages from the disk more rapidly.
7423   **
7424   ** An O(n^2) insertion sort algorithm is used, but since n is never more
7425   ** than (NB+2) (a small constant), that should not be a problem.
7426   **
7427   ** When NB==3, this one optimization makes the database about 25% faster
7428   ** for large insertions and deletions.
7429   */
7430   for(i=0; i<nNew; i++){
7431     aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
7432     aPgFlags[i] = apNew[i]->pDbPage->flags;
7433     for(j=0; j<i; j++){
7434       if( aPgno[j]==aPgno[i] ){
7435         /* This branch is taken if the set of sibling pages somehow contains
7436         ** duplicate entries. This can happen if the database is corrupt.
7437         ** It would be simpler to detect this as part of the loop below, but
7438         ** we do the detection here in order to avoid populating the pager
7439         ** cache with two separate objects associated with the same
7440         ** page number.  */
7441         assert( CORRUPT_DB );
7442         rc = SQLITE_CORRUPT_BKPT;
7443         goto balance_cleanup;
7444       }
7445     }
7446   }
7447   for(i=0; i<nNew; i++){
7448     int iBest = 0;                /* aPgno[] index of page number to use */
7449     for(j=1; j<nNew; j++){
7450       if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
7451     }
7452     pgno = aPgOrder[iBest];
7453     aPgOrder[iBest] = 0xffffffff;
7454     if( iBest!=i ){
7455       if( iBest>i ){
7456         sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
7457       }
7458       sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
7459       apNew[i]->pgno = pgno;
7460     }
7461   }
7462 
7463   TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
7464          "%d(%d nc=%d) %d(%d nc=%d)\n",
7465     apNew[0]->pgno, szNew[0], cntNew[0],
7466     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
7467     nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
7468     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
7469     nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
7470     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
7471     nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
7472     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
7473     nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
7474   ));
7475 
7476   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7477   put4byte(pRight, apNew[nNew-1]->pgno);
7478 
7479   /* If the sibling pages are not leaves, ensure that the right-child pointer
7480   ** of the right-most new sibling page is set to the value that was
7481   ** originally in the same field of the right-most old sibling page. */
7482   if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
7483     MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
7484     memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
7485   }
7486 
7487   /* Make any required updates to pointer map entries associated with
7488   ** cells stored on sibling pages following the balance operation. Pointer
7489   ** map entries associated with divider cells are set by the insertCell()
7490   ** routine. The associated pointer map entries are:
7491   **
7492   **   a) if the cell contains a reference to an overflow chain, the
7493   **      entry associated with the first page in the overflow chain, and
7494   **
7495   **   b) if the sibling pages are not leaves, the child page associated
7496   **      with the cell.
7497   **
7498   ** If the sibling pages are not leaves, then the pointer map entry
7499   ** associated with the right-child of each sibling may also need to be
7500   ** updated. This happens below, after the sibling pages have been
7501   ** populated, not here.
7502   */
7503   if( ISAUTOVACUUM ){
7504     MemPage *pNew = apNew[0];
7505     u8 *aOld = pNew->aData;
7506     int cntOldNext = pNew->nCell + pNew->nOverflow;
7507     int usableSize = pBt->usableSize;
7508     int iNew = 0;
7509     int iOld = 0;
7510 
7511     for(i=0; i<b.nCell; i++){
7512       u8 *pCell = b.apCell[i];
7513       if( i==cntOldNext ){
7514         MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
7515         cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
7516         aOld = pOld->aData;
7517       }
7518       if( i==cntNew[iNew] ){
7519         pNew = apNew[++iNew];
7520         if( !leafData ) continue;
7521       }
7522 
7523       /* Cell pCell is destined for new sibling page pNew. Originally, it
7524       ** was either part of sibling page iOld (possibly an overflow cell),
7525       ** or else the divider cell to the left of sibling page iOld. So,
7526       ** if sibling page iOld had the same page number as pNew, and if
7527       ** pCell really was a part of sibling page iOld (not a divider or
7528       ** overflow cell), we can skip updating the pointer map entries.  */
7529       if( iOld>=nNew
7530        || pNew->pgno!=aPgno[iOld]
7531        || !SQLITE_WITHIN(pCell,aOld,&aOld[usableSize])
7532       ){
7533         if( !leafCorrection ){
7534           ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
7535         }
7536         if( cachedCellSize(&b,i)>pNew->minLocal ){
7537           ptrmapPutOvflPtr(pNew, pCell, &rc);
7538         }
7539         if( rc ) goto balance_cleanup;
7540       }
7541     }
7542   }
7543 
7544   /* Insert new divider cells into pParent. */
7545   for(i=0; i<nNew-1; i++){
7546     u8 *pCell;
7547     u8 *pTemp;
7548     int sz;
7549     MemPage *pNew = apNew[i];
7550     j = cntNew[i];
7551 
7552     assert( j<nMaxCells );
7553     assert( b.apCell[j]!=0 );
7554     pCell = b.apCell[j];
7555     sz = b.szCell[j] + leafCorrection;
7556     pTemp = &aOvflSpace[iOvflSpace];
7557     if( !pNew->leaf ){
7558       memcpy(&pNew->aData[8], pCell, 4);
7559     }else if( leafData ){
7560       /* If the tree is a leaf-data tree, and the siblings are leaves,
7561       ** then there is no divider cell in b.apCell[]. Instead, the divider
7562       ** cell consists of the integer key for the right-most cell of
7563       ** the sibling-page assembled above only.
7564       */
7565       CellInfo info;
7566       j--;
7567       pNew->xParseCell(pNew, b.apCell[j], &info);
7568       pCell = pTemp;
7569       sz = 4 + putVarint(&pCell[4], info.nKey);
7570       pTemp = 0;
7571     }else{
7572       pCell -= 4;
7573       /* Obscure case for non-leaf-data trees: If the cell at pCell was
7574       ** previously stored on a leaf node, and its reported size was 4
7575       ** bytes, then it may actually be smaller than this
7576       ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
7577       ** any cell). But it is important to pass the correct size to
7578       ** insertCell(), so reparse the cell now.
7579       **
7580       ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
7581       ** and WITHOUT ROWID tables with exactly one column which is the
7582       ** primary key.
7583       */
7584       if( b.szCell[j]==4 ){
7585         assert(leafCorrection==4);
7586         sz = pParent->xCellSize(pParent, pCell);
7587       }
7588     }
7589     iOvflSpace += sz;
7590     assert( sz<=pBt->maxLocal+23 );
7591     assert( iOvflSpace <= (int)pBt->pageSize );
7592     insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
7593     if( rc!=SQLITE_OK ) goto balance_cleanup;
7594     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7595   }
7596 
7597   /* Now update the actual sibling pages. The order in which they are updated
7598   ** is important, as this code needs to avoid disrupting any page from which
7599   ** cells may still to be read. In practice, this means:
7600   **
7601   **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
7602   **      then it is not safe to update page apNew[iPg] until after
7603   **      the left-hand sibling apNew[iPg-1] has been updated.
7604   **
7605   **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
7606   **      then it is not safe to update page apNew[iPg] until after
7607   **      the right-hand sibling apNew[iPg+1] has been updated.
7608   **
7609   ** If neither of the above apply, the page is safe to update.
7610   **
7611   ** The iPg value in the following loop starts at nNew-1 goes down
7612   ** to 0, then back up to nNew-1 again, thus making two passes over
7613   ** the pages.  On the initial downward pass, only condition (1) above
7614   ** needs to be tested because (2) will always be true from the previous
7615   ** step.  On the upward pass, both conditions are always true, so the
7616   ** upwards pass simply processes pages that were missed on the downward
7617   ** pass.
7618   */
7619   for(i=1-nNew; i<nNew; i++){
7620     int iPg = i<0 ? -i : i;
7621     assert( iPg>=0 && iPg<nNew );
7622     if( abDone[iPg] ) continue;         /* Skip pages already processed */
7623     if( i>=0                            /* On the upwards pass, or... */
7624      || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
7625     ){
7626       int iNew;
7627       int iOld;
7628       int nNewCell;
7629 
7630       /* Verify condition (1):  If cells are moving left, update iPg
7631       ** only after iPg-1 has already been updated. */
7632       assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
7633 
7634       /* Verify condition (2):  If cells are moving right, update iPg
7635       ** only after iPg+1 has already been updated. */
7636       assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
7637 
7638       if( iPg==0 ){
7639         iNew = iOld = 0;
7640         nNewCell = cntNew[0];
7641       }else{
7642         iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
7643         iNew = cntNew[iPg-1] + !leafData;
7644         nNewCell = cntNew[iPg] - iNew;
7645       }
7646 
7647       rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
7648       if( rc ) goto balance_cleanup;
7649       abDone[iPg]++;
7650       apNew[iPg]->nFree = usableSpace-szNew[iPg];
7651       assert( apNew[iPg]->nOverflow==0 );
7652       assert( apNew[iPg]->nCell==nNewCell );
7653     }
7654   }
7655 
7656   /* All pages have been processed exactly once */
7657   assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
7658 
7659   assert( nOld>0 );
7660   assert( nNew>0 );
7661 
7662   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
7663     /* The root page of the b-tree now contains no cells. The only sibling
7664     ** page is the right-child of the parent. Copy the contents of the
7665     ** child page into the parent, decreasing the overall height of the
7666     ** b-tree structure by one. This is described as the "balance-shallower"
7667     ** sub-algorithm in some documentation.
7668     **
7669     ** If this is an auto-vacuum database, the call to copyNodeContent()
7670     ** sets all pointer-map entries corresponding to database image pages
7671     ** for which the pointer is stored within the content being copied.
7672     **
7673     ** It is critical that the child page be defragmented before being
7674     ** copied into the parent, because if the parent is page 1 then it will
7675     ** by smaller than the child due to the database header, and so all the
7676     ** free space needs to be up front.
7677     */
7678     assert( nNew==1 || CORRUPT_DB );
7679     rc = defragmentPage(apNew[0]);
7680     testcase( rc!=SQLITE_OK );
7681     assert( apNew[0]->nFree ==
7682         (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
7683       || rc!=SQLITE_OK
7684     );
7685     copyNodeContent(apNew[0], pParent, &rc);
7686     freePage(apNew[0], &rc);
7687   }else if( ISAUTOVACUUM && !leafCorrection ){
7688     /* Fix the pointer map entries associated with the right-child of each
7689     ** sibling page. All other pointer map entries have already been taken
7690     ** care of.  */
7691     for(i=0; i<nNew; i++){
7692       u32 key = get4byte(&apNew[i]->aData[8]);
7693       ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
7694     }
7695   }
7696 
7697   assert( pParent->isInit );
7698   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
7699           nOld, nNew, b.nCell));
7700 
7701   /* Free any old pages that were not reused as new pages.
7702   */
7703   for(i=nNew; i<nOld; i++){
7704     freePage(apOld[i], &rc);
7705   }
7706 
7707 #if 0
7708   if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
7709     /* The ptrmapCheckPages() contains assert() statements that verify that
7710     ** all pointer map pages are set correctly. This is helpful while
7711     ** debugging. This is usually disabled because a corrupt database may
7712     ** cause an assert() statement to fail.  */
7713     ptrmapCheckPages(apNew, nNew);
7714     ptrmapCheckPages(&pParent, 1);
7715   }
7716 #endif
7717 
7718   /*
7719   ** Cleanup before returning.
7720   */
7721 balance_cleanup:
7722   sqlite3ScratchFree(b.apCell);
7723   for(i=0; i<nOld; i++){
7724     releasePage(apOld[i]);
7725   }
7726   for(i=0; i<nNew; i++){
7727     releasePage(apNew[i]);
7728   }
7729 
7730   return rc;
7731 }
7732 
7733 
7734 /*
7735 ** This function is called when the root page of a b-tree structure is
7736 ** overfull (has one or more overflow pages).
7737 **
7738 ** A new child page is allocated and the contents of the current root
7739 ** page, including overflow cells, are copied into the child. The root
7740 ** page is then overwritten to make it an empty page with the right-child
7741 ** pointer pointing to the new page.
7742 **
7743 ** Before returning, all pointer-map entries corresponding to pages
7744 ** that the new child-page now contains pointers to are updated. The
7745 ** entry corresponding to the new right-child pointer of the root
7746 ** page is also updated.
7747 **
7748 ** If successful, *ppChild is set to contain a reference to the child
7749 ** page and SQLITE_OK is returned. In this case the caller is required
7750 ** to call releasePage() on *ppChild exactly once. If an error occurs,
7751 ** an error code is returned and *ppChild is set to 0.
7752 */
7753 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
7754   int rc;                        /* Return value from subprocedures */
7755   MemPage *pChild = 0;           /* Pointer to a new child page */
7756   Pgno pgnoChild = 0;            /* Page number of the new child page */
7757   BtShared *pBt = pRoot->pBt;    /* The BTree */
7758 
7759   assert( pRoot->nOverflow>0 );
7760   assert( sqlite3_mutex_held(pBt->mutex) );
7761 
7762   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
7763   ** page that will become the new right-child of pPage. Copy the contents
7764   ** of the node stored on pRoot into the new child page.
7765   */
7766   rc = sqlite3PagerWrite(pRoot->pDbPage);
7767   if( rc==SQLITE_OK ){
7768     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
7769     copyNodeContent(pRoot, pChild, &rc);
7770     if( ISAUTOVACUUM ){
7771       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
7772     }
7773   }
7774   if( rc ){
7775     *ppChild = 0;
7776     releasePage(pChild);
7777     return rc;
7778   }
7779   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
7780   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
7781   assert( pChild->nCell==pRoot->nCell );
7782 
7783   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
7784 
7785   /* Copy the overflow cells from pRoot to pChild */
7786   memcpy(pChild->aiOvfl, pRoot->aiOvfl,
7787          pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
7788   memcpy(pChild->apOvfl, pRoot->apOvfl,
7789          pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
7790   pChild->nOverflow = pRoot->nOverflow;
7791 
7792   /* Zero the contents of pRoot. Then install pChild as the right-child. */
7793   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
7794   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
7795 
7796   *ppChild = pChild;
7797   return SQLITE_OK;
7798 }
7799 
7800 /*
7801 ** The page that pCur currently points to has just been modified in
7802 ** some way. This function figures out if this modification means the
7803 ** tree needs to be balanced, and if so calls the appropriate balancing
7804 ** routine. Balancing routines are:
7805 **
7806 **   balance_quick()
7807 **   balance_deeper()
7808 **   balance_nonroot()
7809 */
7810 static int balance(BtCursor *pCur){
7811   int rc = SQLITE_OK;
7812   const int nMin = pCur->pBt->usableSize * 2 / 3;
7813   u8 aBalanceQuickSpace[13];
7814   u8 *pFree = 0;
7815 
7816   VVA_ONLY( int balance_quick_called = 0 );
7817   VVA_ONLY( int balance_deeper_called = 0 );
7818 
7819   do {
7820     int iPage = pCur->iPage;
7821     MemPage *pPage = pCur->apPage[iPage];
7822 
7823     if( iPage==0 ){
7824       if( pPage->nOverflow ){
7825         /* The root page of the b-tree is overfull. In this case call the
7826         ** balance_deeper() function to create a new child for the root-page
7827         ** and copy the current contents of the root-page to it. The
7828         ** next iteration of the do-loop will balance the child page.
7829         */
7830         assert( balance_deeper_called==0 );
7831         VVA_ONLY( balance_deeper_called++ );
7832         rc = balance_deeper(pPage, &pCur->apPage[1]);
7833         if( rc==SQLITE_OK ){
7834           pCur->iPage = 1;
7835           pCur->aiIdx[0] = 0;
7836           pCur->aiIdx[1] = 0;
7837           assert( pCur->apPage[1]->nOverflow );
7838         }
7839       }else{
7840         break;
7841       }
7842     }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
7843       break;
7844     }else{
7845       MemPage * const pParent = pCur->apPage[iPage-1];
7846       int const iIdx = pCur->aiIdx[iPage-1];
7847 
7848       rc = sqlite3PagerWrite(pParent->pDbPage);
7849       if( rc==SQLITE_OK ){
7850 #ifndef SQLITE_OMIT_QUICKBALANCE
7851         if( pPage->intKeyLeaf
7852          && pPage->nOverflow==1
7853          && pPage->aiOvfl[0]==pPage->nCell
7854          && pParent->pgno!=1
7855          && pParent->nCell==iIdx
7856         ){
7857           /* Call balance_quick() to create a new sibling of pPage on which
7858           ** to store the overflow cell. balance_quick() inserts a new cell
7859           ** into pParent, which may cause pParent overflow. If this
7860           ** happens, the next iteration of the do-loop will balance pParent
7861           ** use either balance_nonroot() or balance_deeper(). Until this
7862           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
7863           ** buffer.
7864           **
7865           ** The purpose of the following assert() is to check that only a
7866           ** single call to balance_quick() is made for each call to this
7867           ** function. If this were not verified, a subtle bug involving reuse
7868           ** of the aBalanceQuickSpace[] might sneak in.
7869           */
7870           assert( balance_quick_called==0 );
7871           VVA_ONLY( balance_quick_called++ );
7872           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
7873         }else
7874 #endif
7875         {
7876           /* In this case, call balance_nonroot() to redistribute cells
7877           ** between pPage and up to 2 of its sibling pages. This involves
7878           ** modifying the contents of pParent, which may cause pParent to
7879           ** become overfull or underfull. The next iteration of the do-loop
7880           ** will balance the parent page to correct this.
7881           **
7882           ** If the parent page becomes overfull, the overflow cell or cells
7883           ** are stored in the pSpace buffer allocated immediately below.
7884           ** A subsequent iteration of the do-loop will deal with this by
7885           ** calling balance_nonroot() (balance_deeper() may be called first,
7886           ** but it doesn't deal with overflow cells - just moves them to a
7887           ** different page). Once this subsequent call to balance_nonroot()
7888           ** has completed, it is safe to release the pSpace buffer used by
7889           ** the previous call, as the overflow cell data will have been
7890           ** copied either into the body of a database page or into the new
7891           ** pSpace buffer passed to the latter call to balance_nonroot().
7892           */
7893           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
7894           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
7895                                pCur->hints&BTREE_BULKLOAD);
7896           if( pFree ){
7897             /* If pFree is not NULL, it points to the pSpace buffer used
7898             ** by a previous call to balance_nonroot(). Its contents are
7899             ** now stored either on real database pages or within the
7900             ** new pSpace buffer, so it may be safely freed here. */
7901             sqlite3PageFree(pFree);
7902           }
7903 
7904           /* The pSpace buffer will be freed after the next call to
7905           ** balance_nonroot(), or just before this function returns, whichever
7906           ** comes first. */
7907           pFree = pSpace;
7908         }
7909       }
7910 
7911       pPage->nOverflow = 0;
7912 
7913       /* The next iteration of the do-loop balances the parent page. */
7914       releasePage(pPage);
7915       pCur->iPage--;
7916       assert( pCur->iPage>=0 );
7917     }
7918   }while( rc==SQLITE_OK );
7919 
7920   if( pFree ){
7921     sqlite3PageFree(pFree);
7922   }
7923   return rc;
7924 }
7925 
7926 
/*
** Insert a new record into the BTree.  The key is given by (pKey,nKey)
** and the data is given by (pData,nData).  The cursor is used only to
** define what table the record should be inserted into.  The cursor
** is left pointing at an arbitrary location.
**
** For an INTKEY table, only the nKey value of the key is used.  pKey is
** ignored (and is asserted to be NULL).  For a ZERODATA table, the pData
** and nData are both ignored.
**
** nZero is forwarded to fillInCell() as the number of extra zero bytes
** to append to the data.  appendBias is forwarded to the seek routines
** as a hint that this insert is likely an append.
**
** If the seekResult parameter is non-zero, then a successful call to
** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
** been performed. seekResult is the search result returned (a negative
** number if pCur points at an entry that is smaller than (pKey, nKey), or
** a positive value if pCur points at an entry that is larger than
** (pKey, nKey)).  In that case this function does not move the cursor
** again and the record is inserted as a new entry adjacent to the
** current cursor position.
**
** If the seekResult parameter is 0, then cursor pCur may point to any
** entry or to no entry at all and so this function has to seek the
** cursor before the new key can be inserted.  If that seek reports an
** exact match (loc==0) the existing entry is overwritten; otherwise a
** new entry is inserted.
**
** Returns SQLITE_OK on success.  Otherwise the error code of the first
** step that failed is returned (or, if the cursor is already in the
** CURSOR_FAULT state, the error code stored in pCur->skipNext).
*/
int sqlite3BtreeInsert(
  BtCursor *pCur,                /* Insert data into the table of this cursor */
  const void *pKey, i64 nKey,    /* The key of the new record */
  const void *pData, int nData,  /* The data of the new record */
  int nZero,                     /* Number of extra 0 bytes to append to data */
  int appendBias,                /* True if this is likely an append */
  int seekResult                 /* Result of prior MovetoUnpacked() call */
){
  int rc;
  int loc = seekResult;          /* -1: before desired location  +1: after */
  int szNew = 0;
  int idx;
  MemPage *pPage;
  Btree *p = pCur->pBtree;
  BtShared *pBt = p->pBt;
  unsigned char *oldCell;
  unsigned char *newCell = 0;

  /* A cursor in the fault state carries its error code in skipNext
  ** (asserted to be something other than SQLITE_OK).  Report that error
  ** without touching the tree. */
  if( pCur->eState==CURSOR_FAULT ){
    assert( pCur->skipNext!=SQLITE_OK );
    return pCur->skipNext;
  }

  assert( cursorOwnsBtShared(pCur) );
  assert( (pCur->curFlags & BTCF_WriteFlag)!=0
              && pBt->inTransaction==TRANS_WRITE
              && (pBt->btsFlags & BTS_READ_ONLY)==0 );
  assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

  /* Assert that the caller has been consistent. If this cursor was opened
  ** expecting an index b-tree, then the caller should be inserting blob
  ** keys with no associated data. If the cursor was opened expecting an
  ** intkey table, the caller should be inserting integer keys with a
  ** blob of associated data.  */
  assert( (pKey==0)==(pCur->pKeyInfo==0) );

  /* Save the positions of any other cursors open on this table.
  **
  ** In some cases, the call to btreeMoveto() below is a no-op. For
  ** example, when inserting data into a table with auto-generated integer
  ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
  ** integer key to use. It then calls this function to actually insert the
  ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
  ** that the cursor is already where it needs to be and returns without
  ** doing any work. To avoid thwarting these optimizations, it is important
  ** not to clear the cursor here.
  */
  if( pCur->curFlags & BTCF_Multiple ){
    rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
    if( rc ) return rc;
  }

  if( pCur->pKeyInfo==0 ){
    assert( pKey==0 );
    /* If this is an insert into a table b-tree, invalidate any incrblob
    ** cursors open on the row being replaced */
    invalidateIncrblobCursors(p, nKey, 0);

    /* If the cursor is currently on the last row and we are appending a
    ** new row onto the end, set the "loc" to avoid an unnecessary
    ** btreeMoveto() call.  Note that this overrides any seekResult value
    ** supplied by the caller. */
    if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0
      && pCur->info.nKey==nKey-1 ){
       loc = -1;
    }else if( loc==0 ){
      rc = sqlite3BtreeMovetoUnpacked(pCur, 0, nKey, appendBias, &loc);
      if( rc ) return rc;
    }
  }else if( loc==0 ){
    /* Index b-tree and the caller has not seeked the cursor: do it now. */
    rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);
    if( rc ) return rc;
  }
  assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );

  pPage = pCur->apPage[pCur->iPage];
  assert( pPage->intKey || nKey>=0 );
  assert( pPage->leaf || !pPage->intKey );

  TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
          pCur->pgnoRoot, nKey, nData, pPage->pgno,
          loc==0 ? "overwrite" : "new entry"));
  assert( pPage->isInit );

  /* Build the new cell in the shared scratch buffer pBt->pTmpSpace. */
  newCell = pBt->pTmpSpace;
  assert( newCell!=0 );
  rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
  if( rc ) goto end_insert;
  assert( szNew==pPage->xCellSize(pPage, newCell) );
  assert( szNew <= MX_CELL_SIZE(pBt) );
  idx = pCur->aiIdx[pCur->iPage];
  if( loc==0 ){
    /* Exact match: the existing cell at idx is replaced by the new one. */
    u16 szOld;
    assert( idx<pPage->nCell );
    rc = sqlite3PagerWrite(pPage->pDbPage);
    if( rc ){
      goto end_insert;
    }
    oldCell = findCell(pPage, idx);
    if( !pPage->leaf ){
      /* On a non-leaf page the first 4 bytes of the old cell are carried
      ** over into the new cell (presumably the child-page pointer - the
      ** same 4 bytes that clearDatabasePage() reads with get4byte()). */
      memcpy(newCell, oldCell, 4);
    }
    /* dropCell() is handed &rc, so a failure from either clearCell() or
    ** dropCell() is caught by the single check that follows. */
    rc = clearCell(pPage, oldCell, &szOld);
    dropCell(pPage, idx, szOld, &rc);
    if( rc ) goto end_insert;
  }else if( loc<0 && pPage->nCell>0 ){
    /* The cursor is on an entry smaller than the new key: advance the
    ** cursor index so the new cell goes immediately after that entry. */
    assert( pPage->leaf );
    idx = ++pCur->aiIdx[pCur->iPage];
  }else{
    /* Either the cursor entry is larger than the new key or the page is
    ** empty: insert at the current cursor index. */
    assert( pPage->leaf );
  }
  insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
  assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );

  /* If no error has occurred and pPage has an overflow cell, call balance()
  ** to redistribute the cells within the tree. Since balance() may move
  ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
  ** variables.
  **
  ** Previous versions of SQLite called moveToRoot() to move the cursor
  ** back to the root page as balance() used to invalidate the contents
  ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
  ** set the cursor state to "invalid". This makes common insert operations
  ** slightly faster.
  **
  ** There is a subtle but important optimization here too. When inserting
  ** multiple records into an intkey b-tree using a single cursor (as can
  ** happen while processing an "INSERT INTO ... SELECT" statement), it
  ** is advantageous to leave the cursor pointing to the last entry in
  ** the b-tree if possible. If the cursor is left pointing to the last
  ** entry in the table, and the next row inserted has an integer key
  ** larger than the largest existing key, it is possible to insert the
  ** row without seeking the cursor. This can be a big performance boost.
  */
  pCur->info.nSize = 0;
  if( rc==SQLITE_OK && pPage->nOverflow ){
    pCur->curFlags &= ~(BTCF_ValidNKey);
    rc = balance(pCur);

    /* Must make sure nOverflow is reset to zero even if the balance()
    ** fails. Internal data structure corruption will result otherwise.
    ** Also, set the cursor state to invalid. This stops saveCursorPosition()
    ** from trying to save the current position of the cursor.  */
    pCur->apPage[pCur->iPage]->nOverflow = 0;
    pCur->eState = CURSOR_INVALID;
  }
  assert( pCur->apPage[pCur->iPage]->nOverflow==0 );

end_insert:
  return rc;
}
8098 
/*
** Delete the entry that the cursor is pointing to.
**
** The cursor must be a write cursor in the CURSOR_VALID state and a
** write transaction must be open (both are asserted).
**
** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
** the cursor is left pointing at an arbitrary location after the delete.
** But if that bit is set, then the cursor is left in a state such that
** the next call to BtreeNext() or BtreePrev() moves it to the same row
** as it would have been on if the call to BtreeDelete() had been omitted.
**
** The BTREE_AUXDELETE bit of flags indicates that this is one of several
** deletes associated with a single table entry and its indexes.  Only one
** of those deletes is considered the "primary" delete.  The primary delete
** occurs on a cursor that is not a BTREE_FORDELETE cursor.  All but one
** delete operation on non-FORDELETE cursors is tagged with the AUXDELETE
** flag.  The BTREE_AUXDELETE bit is a hint that is not used by this
** implementation, but which might be used by alternative storage engines.
**
** Returns SQLITE_OK on success, or the error code of the first step
** that failed.
*/
int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
  Btree *p = pCur->pBtree;
  BtShared *pBt = p->pBt;
  int rc;                              /* Return code */
  MemPage *pPage;                      /* Page to delete cell from */
  unsigned char *pCell;                /* Pointer to cell to delete */
  int iCellIdx;                        /* Index of cell to delete */
  int iCellDepth;                      /* Depth of node containing pCell */
  u16 szCell;                          /* Size of the cell being deleted */
  int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
  u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */

  assert( cursorOwnsBtShared(pCur) );
  assert( pBt->inTransaction==TRANS_WRITE );
  assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  assert( pCur->curFlags & BTCF_WriteFlag );
  assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
  assert( !hasReadConflicts(p, pCur->pgnoRoot) );
  assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  assert( pCur->eState==CURSOR_VALID );
  assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );

  /* Remember where the doomed cell lives.  These values keep referring to
  ** the original page even after the cursor is moved further down. */
  iCellDepth = pCur->iPage;
  iCellIdx = pCur->aiIdx[iCellDepth];
  pPage = pCur->apPage[iCellDepth];
  pCell = findCell(pPage, iCellIdx);

  /* If the bPreserve flag is set to true, then the cursor position must
  ** be preserved following this delete operation. If the current delete
  ** will cause a b-tree rebalance, then this is done by saving the cursor
  ** key and leaving the cursor in CURSOR_REQUIRESEEK state before
  ** returning.
  **
  ** Or, if the current delete will not cause a rebalance, then the cursor
  ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
  ** before or after the deleted entry. In this case set bSkipnext to true.
  **
  ** A rebalance is anticipated when the cell is not on a leaf, or when the
  ** free space on the page after removing the cell (plus 2 more bytes)
  ** would exceed two thirds of the usable page size.  */
  if( bPreserve ){
    if( !pPage->leaf
     || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
    ){
      /* A b-tree rebalance will be required after deleting this entry.
      ** Save the cursor key.  */
      rc = saveCursorKey(pCur);
      if( rc ) return rc;
    }else{
      bSkipnext = 1;
    }
  }

  /* If the page containing the entry to delete is not a leaf page, move
  ** the cursor to the largest entry in the tree that is smaller than
  ** the entry being deleted. This cell will replace the cell being deleted
  ** from the internal node. The 'previous' entry is used for this instead
  ** of the 'next' entry, as the previous entry is always a part of the
  ** sub-tree headed by the child page of the cell being deleted. This makes
  ** balancing the tree following the delete operation easier.  */
  if( !pPage->leaf ){
    int notUsed = 0;
    rc = sqlite3BtreePrevious(pCur, &notUsed);
    if( rc ) return rc;
  }

  /* Save the positions of any other cursors open on this table before
  ** making any modifications.  */
  if( pCur->curFlags & BTCF_Multiple ){
    rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
    if( rc ) return rc;
  }

  /* If this is a delete operation to remove a row from a table b-tree,
  ** invalidate any incrblob cursors open on the row being deleted.  */
  if( pCur->pKeyInfo==0 ){
    invalidateIncrblobCursors(p, pCur->info.nKey, 0);
  }

  /* Make the page containing the entry to be deleted writable. Then free any
  ** overflow pages associated with the entry and finally remove the cell
  ** itself from within the page.  dropCell() is handed &rc, so a failure
  ** from either clearCell() or dropCell() is caught by the single check
  ** that follows.  */
  rc = sqlite3PagerWrite(pPage->pDbPage);
  if( rc ) return rc;
  rc = clearCell(pPage, pCell, &szCell);
  dropCell(pPage, iCellIdx, szCell, &rc);
  if( rc ) return rc;

  /* If the cell deleted was not located on a leaf page, then the cursor
  ** is currently pointing to the largest entry in the sub-tree headed
  ** by the child-page of the cell that was just deleted from an internal
  ** node. The cell from the leaf node needs to be moved to the internal
  ** node to replace the deleted cell.  */
  if( !pPage->leaf ){
    MemPage *pLeaf = pCur->apPage[pCur->iPage];
    int nCell;
    Pgno n = pCur->apPage[iCellDepth+1]->pgno;
    unsigned char *pTmp;

    pCell = findCell(pLeaf, pLeaf->nCell-1);
    /* The replacement cell is handed to insertCell() starting 4 bytes
    ** before pCell, so pCell must lie at least 4 bytes into the page
    ** image; anything else is treated as corruption.  */
    if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
    nCell = pLeaf->xCellSize(pLeaf, pCell);
    assert( MX_CELL_SIZE(pBt) >= nCell );
    pTmp = pBt->pTmpSpace;
    assert( pTmp!=0 );
    /* insertCell() and dropCell() both receive &rc, so an error from
    ** sqlite3PagerWrite() or from either of them surfaces in the check
    ** below.  The 4 extra leading bytes passed to insertCell() are
    ** presumably room for the child pointer n - confirm against
    ** insertCell().  */
    rc = sqlite3PagerWrite(pLeaf->pDbPage);
    insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
    dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
    if( rc ) return rc;
  }

  /* Balance the tree. If the entry deleted was located on a leaf page,
  ** then the cursor still points to that page. In this case the first
  ** call to balance() repairs the tree, and the if(...) condition is
  ** never true.
  **
  ** Otherwise, if the entry deleted was on an internal node page, then
  ** pCur is pointing to the leaf page from which a cell was removed to
  ** replace the cell deleted from the internal node. This is slightly
  ** tricky as the leaf node may be underfull, and the internal node may
  ** be either under or overfull. In this case run the balancing algorithm
  ** on the leaf node first. If the balance proceeds far enough up the
  ** tree that we can be sure that any problem in the internal node has
  ** been corrected, so be it. Otherwise, after balancing the leaf node,
  ** walk the cursor up the tree to the internal node and balance it as
  ** well.  */
  rc = balance(pCur);
  if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
    while( pCur->iPage>iCellDepth ){
      releasePage(pCur->apPage[pCur->iPage--]);
    }
    rc = balance(pCur);
  }

  if( rc==SQLITE_OK ){
    if( bSkipnext ){
      /* No rebalance was expected: leave the cursor on the same leaf in
      ** CURSOR_SKIPNEXT state.  If the deleted cell was the last one on
      ** the page, step back onto the new last cell and record
      ** skipNext=-1; otherwise the cursor index now addresses the entry
      ** that followed the deleted one and skipNext=1.  */
      assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
      assert( pPage==pCur->apPage[pCur->iPage] || CORRUPT_DB );
      assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
      pCur->eState = CURSOR_SKIPNEXT;
      if( iCellIdx>=pPage->nCell ){
        pCur->skipNext = -1;
        pCur->aiIdx[iCellDepth] = pPage->nCell-1;
      }else{
        pCur->skipNext = 1;
      }
    }else{
      /* Reset the cursor to the root.  When the position must be
      ** preserved, mark it CURSOR_REQUIRESEEK so that the key saved
      ** earlier by saveCursorKey() can be used to restore it.  */
      rc = moveToRoot(pCur);
      if( bPreserve ){
        pCur->eState = CURSOR_REQUIRESEEK;
      }
    }
  }
  return rc;
}
8267 
8268 /*
8269 ** Create a new BTree table.  Write into *piTable the page
8270 ** number for the root page of the new table.
8271 **
8272 ** The type of type is determined by the flags parameter.  Only the
8273 ** following values of flags are currently in use.  Other values for
8274 ** flags might not work:
8275 **
8276 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
8277 **     BTREE_ZERODATA                  Used for SQL indices
8278 */
8279 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
8280   BtShared *pBt = p->pBt;
8281   MemPage *pRoot;
8282   Pgno pgnoRoot;
8283   int rc;
8284   int ptfFlags;          /* Page-type flage for the root page of new table */
8285 
8286   assert( sqlite3BtreeHoldsMutex(p) );
8287   assert( pBt->inTransaction==TRANS_WRITE );
8288   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
8289 
8290 #ifdef SQLITE_OMIT_AUTOVACUUM
8291   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
8292   if( rc ){
8293     return rc;
8294   }
8295 #else
8296   if( pBt->autoVacuum ){
8297     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
8298     MemPage *pPageMove; /* The page to move to. */
8299 
8300     /* Creating a new table may probably require moving an existing database
8301     ** to make room for the new tables root page. In case this page turns
8302     ** out to be an overflow page, delete all overflow page-map caches
8303     ** held by open cursors.
8304     */
8305     invalidateAllOverflowCache(pBt);
8306 
8307     /* Read the value of meta[3] from the database to determine where the
8308     ** root page of the new table should go. meta[3] is the largest root-page
8309     ** created so far, so the new root-page is (meta[3]+1).
8310     */
8311     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
8312     pgnoRoot++;
8313 
8314     /* The new root-page may not be allocated on a pointer-map page, or the
8315     ** PENDING_BYTE page.
8316     */
8317     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
8318         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
8319       pgnoRoot++;
8320     }
8321     assert( pgnoRoot>=3 || CORRUPT_DB );
8322     testcase( pgnoRoot<3 );
8323 
8324     /* Allocate a page. The page that currently resides at pgnoRoot will
8325     ** be moved to the allocated page (unless the allocated page happens
8326     ** to reside at pgnoRoot).
8327     */
8328     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
8329     if( rc!=SQLITE_OK ){
8330       return rc;
8331     }
8332 
8333     if( pgnoMove!=pgnoRoot ){
8334       /* pgnoRoot is the page that will be used for the root-page of
8335       ** the new table (assuming an error did not occur). But we were
8336       ** allocated pgnoMove. If required (i.e. if it was not allocated
8337       ** by extending the file), the current page at position pgnoMove
8338       ** is already journaled.
8339       */
8340       u8 eType = 0;
8341       Pgno iPtrPage = 0;
8342 
8343       /* Save the positions of any open cursors. This is required in
8344       ** case they are holding a reference to an xFetch reference
8345       ** corresponding to page pgnoRoot.  */
8346       rc = saveAllCursors(pBt, 0, 0);
8347       releasePage(pPageMove);
8348       if( rc!=SQLITE_OK ){
8349         return rc;
8350       }
8351 
8352       /* Move the page currently at pgnoRoot to pgnoMove. */
8353       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
8354       if( rc!=SQLITE_OK ){
8355         return rc;
8356       }
8357       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
8358       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
8359         rc = SQLITE_CORRUPT_BKPT;
8360       }
8361       if( rc!=SQLITE_OK ){
8362         releasePage(pRoot);
8363         return rc;
8364       }
8365       assert( eType!=PTRMAP_ROOTPAGE );
8366       assert( eType!=PTRMAP_FREEPAGE );
8367       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
8368       releasePage(pRoot);
8369 
8370       /* Obtain the page at pgnoRoot */
8371       if( rc!=SQLITE_OK ){
8372         return rc;
8373       }
8374       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
8375       if( rc!=SQLITE_OK ){
8376         return rc;
8377       }
8378       rc = sqlite3PagerWrite(pRoot->pDbPage);
8379       if( rc!=SQLITE_OK ){
8380         releasePage(pRoot);
8381         return rc;
8382       }
8383     }else{
8384       pRoot = pPageMove;
8385     }
8386 
8387     /* Update the pointer-map and meta-data with the new root-page number. */
8388     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
8389     if( rc ){
8390       releasePage(pRoot);
8391       return rc;
8392     }
8393 
8394     /* When the new root page was allocated, page 1 was made writable in
8395     ** order either to increase the database filesize, or to decrement the
8396     ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
8397     */
8398     assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
8399     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
8400     if( NEVER(rc) ){
8401       releasePage(pRoot);
8402       return rc;
8403     }
8404 
8405   }else{
8406     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
8407     if( rc ) return rc;
8408   }
8409 #endif
8410   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
8411   if( createTabFlags & BTREE_INTKEY ){
8412     ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
8413   }else{
8414     ptfFlags = PTF_ZERODATA | PTF_LEAF;
8415   }
8416   zeroPage(pRoot, ptfFlags);
8417   sqlite3PagerUnref(pRoot->pDbPage);
8418   assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
8419   *piTable = (int)pgnoRoot;
8420   return SQLITE_OK;
8421 }
8422 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
8423   int rc;
8424   sqlite3BtreeEnter(p);
8425   rc = btreeCreateTable(p, piTable, flags);
8426   sqlite3BtreeLeave(p);
8427   return rc;
8428 }
8429 
8430 /*
8431 ** Erase the given database page and all its children.  Return
8432 ** the page to the freelist.
8433 */
8434 static int clearDatabasePage(
8435   BtShared *pBt,           /* The BTree that contains the table */
8436   Pgno pgno,               /* Page number to clear */
8437   int freePageFlag,        /* Deallocate page if true */
8438   int *pnChange            /* Add number of Cells freed to this counter */
8439 ){
8440   MemPage *pPage;
8441   int rc;
8442   unsigned char *pCell;
8443   int i;
8444   int hdr;
8445   u16 szCell;
8446 
8447   assert( sqlite3_mutex_held(pBt->mutex) );
8448   if( pgno>btreePagecount(pBt) ){
8449     return SQLITE_CORRUPT_BKPT;
8450   }
8451   rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
8452   if( rc ) return rc;
8453   if( pPage->bBusy ){
8454     rc = SQLITE_CORRUPT_BKPT;
8455     goto cleardatabasepage_out;
8456   }
8457   pPage->bBusy = 1;
8458   hdr = pPage->hdrOffset;
8459   for(i=0; i<pPage->nCell; i++){
8460     pCell = findCell(pPage, i);
8461     if( !pPage->leaf ){
8462       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
8463       if( rc ) goto cleardatabasepage_out;
8464     }
8465     rc = clearCell(pPage, pCell, &szCell);
8466     if( rc ) goto cleardatabasepage_out;
8467   }
8468   if( !pPage->leaf ){
8469     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
8470     if( rc ) goto cleardatabasepage_out;
8471   }else if( pnChange ){
8472     assert( pPage->intKey || CORRUPT_DB );
8473     testcase( !pPage->intKey );
8474     *pnChange += pPage->nCell;
8475   }
8476   if( freePageFlag ){
8477     freePage(pPage, &rc);
8478   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
8479     zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
8480   }
8481 
8482 cleardatabasepage_out:
8483   pPage->bBusy = 0;
8484   releasePage(pPage);
8485   return rc;
8486 }
8487 
8488 /*
8489 ** Delete all information from a single table in the database.  iTable is
8490 ** the page number of the root of the table.  After this routine returns,
8491 ** the root page is empty, but still exists.
8492 **
8493 ** This routine will fail with SQLITE_LOCKED if there are any open
8494 ** read cursors on the table.  Open write cursors are moved to the
8495 ** root of the table.
8496 **
8497 ** If pnChange is not NULL, then table iTable must be an intkey table. The
8498 ** integer value pointed to by pnChange is incremented by the number of
8499 ** entries in the table.
8500 */
8501 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
8502   int rc;
8503   BtShared *pBt = p->pBt;
8504   sqlite3BtreeEnter(p);
8505   assert( p->inTrans==TRANS_WRITE );
8506 
8507   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
8508 
8509   if( SQLITE_OK==rc ){
8510     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
8511     ** is the root of a table b-tree - if it is not, the following call is
8512     ** a no-op).  */
8513     invalidateIncrblobCursors(p, 0, 1);
8514     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
8515   }
8516   sqlite3BtreeLeave(p);
8517   return rc;
8518 }
8519 
8520 /*
8521 ** Delete all information from the single table that pCur is open on.
8522 **
8523 ** This routine only work for pCur on an ephemeral table.
8524 */
8525 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
8526   return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
8527 }
8528 
8529 /*
8530 ** Erase all information in a table and add the root of the table to
8531 ** the freelist.  Except, the root of the principle table (the one on
8532 ** page 1) is never added to the freelist.
8533 **
8534 ** This routine will fail with SQLITE_LOCKED if there are any open
8535 ** cursors on the table.
8536 **
8537 ** If AUTOVACUUM is enabled and the page at iTable is not the last
8538 ** root page in the database file, then the last root page
8539 ** in the database file is moved into the slot formerly occupied by
8540 ** iTable and that last slot formerly occupied by the last root page
8541 ** is added to the freelist instead of iTable.  In this say, all
8542 ** root pages are kept at the beginning of the database file, which
8543 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
8544 ** page number that used to be the last root page in the file before
8545 ** the move.  If no page gets moved, *piMoved is set to 0.
8546 ** The last root page is recorded in meta[3] and the value of
8547 ** meta[3] is updated by this procedure.
8548 */
8549 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
8550   int rc;
8551   MemPage *pPage = 0;
8552   BtShared *pBt = p->pBt;
8553 
8554   assert( sqlite3BtreeHoldsMutex(p) );
8555   assert( p->inTrans==TRANS_WRITE );
8556 
8557   /* It is illegal to drop a table if any cursors are open on the
8558   ** database. This is because in auto-vacuum mode the backend may
8559   ** need to move another root-page to fill a gap left by the deleted
8560   ** root page. If an open cursor was using this page a problem would
8561   ** occur.
8562   **
8563   ** This error is caught long before control reaches this point.
8564   */
8565   if( NEVER(pBt->pCursor) ){
8566     sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
8567     return SQLITE_LOCKED_SHAREDCACHE;
8568   }
8569 
8570   /*
8571   ** It is illegal to drop the sqlite_master table on page 1.  But again,
8572   ** this error is caught long before reaching this point.
8573   */
8574   if( NEVER(iTable<2) ){
8575     return SQLITE_CORRUPT_BKPT;
8576   }
8577 
8578   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
8579   if( rc ) return rc;
8580   rc = sqlite3BtreeClearTable(p, iTable, 0);
8581   if( rc ){
8582     releasePage(pPage);
8583     return rc;
8584   }
8585 
8586   *piMoved = 0;
8587 
8588 #ifdef SQLITE_OMIT_AUTOVACUUM
8589   freePage(pPage, &rc);
8590   releasePage(pPage);
8591 #else
8592   if( pBt->autoVacuum ){
8593     Pgno maxRootPgno;
8594     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
8595 
8596     if( iTable==maxRootPgno ){
8597       /* If the table being dropped is the table with the largest root-page
8598       ** number in the database, put the root page on the free list.
8599       */
8600       freePage(pPage, &rc);
8601       releasePage(pPage);
8602       if( rc!=SQLITE_OK ){
8603         return rc;
8604       }
8605     }else{
8606       /* The table being dropped does not have the largest root-page
8607       ** number in the database. So move the page that does into the
8608       ** gap left by the deleted root-page.
8609       */
8610       MemPage *pMove;
8611       releasePage(pPage);
8612       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
8613       if( rc!=SQLITE_OK ){
8614         return rc;
8615       }
8616       rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
8617       releasePage(pMove);
8618       if( rc!=SQLITE_OK ){
8619         return rc;
8620       }
8621       pMove = 0;
8622       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
8623       freePage(pMove, &rc);
8624       releasePage(pMove);
8625       if( rc!=SQLITE_OK ){
8626         return rc;
8627       }
8628       *piMoved = maxRootPgno;
8629     }
8630 
8631     /* Set the new 'max-root-page' value in the database header. This
8632     ** is the old value less one, less one more if that happens to
8633     ** be a root-page number, less one again if that is the
8634     ** PENDING_BYTE_PAGE.
8635     */
8636     maxRootPgno--;
8637     while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
8638            || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
8639       maxRootPgno--;
8640     }
8641     assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
8642 
8643     rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
8644   }else{
8645     freePage(pPage, &rc);
8646     releasePage(pPage);
8647   }
8648 #endif
8649   return rc;
8650 }
8651 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
8652   int rc;
8653   sqlite3BtreeEnter(p);
8654   rc = btreeDropTable(p, iTable, piMoved);
8655   sqlite3BtreeLeave(p);
8656   return rc;
8657 }
8658 
8659 
8660 /*
8661 ** This function may only be called if the b-tree connection already
8662 ** has a read or write transaction open on the database.
8663 **
8664 ** Read the meta-information out of a database file.  Meta[0]
8665 ** is the number of free pages currently in the database.  Meta[1]
8666 ** through meta[15] are available for use by higher layers.  Meta[0]
8667 ** is read-only, the others are read/write.
8668 **
8669 ** The schema layer numbers meta values differently.  At the schema
8670 ** layer (and the SetCookie and ReadCookie opcodes) the number of
8671 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
8672 **
8673 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
8674 ** of reading the value out of the header, it instead loads the "DataVersion"
8675 ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
8676 ** database file.  It is a number computed by the pager.  But its access
8677 ** pattern is the same as header meta values, and so it is convenient to
8678 ** read it from this routine.
8679 */
8680 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
8681   BtShared *pBt = p->pBt;
8682 
8683   sqlite3BtreeEnter(p);
8684   assert( p->inTrans>TRANS_NONE );
8685   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
8686   assert( pBt->pPage1 );
8687   assert( idx>=0 && idx<=15 );
8688 
8689   if( idx==BTREE_DATA_VERSION ){
8690     *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
8691   }else{
8692     *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
8693   }
8694 
8695   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
8696   ** database, mark the database as read-only.  */
8697 #ifdef SQLITE_OMIT_AUTOVACUUM
8698   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
8699     pBt->btsFlags |= BTS_READ_ONLY;
8700   }
8701 #endif
8702 
8703   sqlite3BtreeLeave(p);
8704 }
8705 
8706 /*
8707 ** Write meta-information back into the database.  Meta[0] is
8708 ** read-only and may not be written.
8709 */
8710 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
8711   BtShared *pBt = p->pBt;
8712   unsigned char *pP1;
8713   int rc;
8714   assert( idx>=1 && idx<=15 );
8715   sqlite3BtreeEnter(p);
8716   assert( p->inTrans==TRANS_WRITE );
8717   assert( pBt->pPage1!=0 );
8718   pP1 = pBt->pPage1->aData;
8719   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
8720   if( rc==SQLITE_OK ){
8721     put4byte(&pP1[36 + idx*4], iMeta);
8722 #ifndef SQLITE_OMIT_AUTOVACUUM
8723     if( idx==BTREE_INCR_VACUUM ){
8724       assert( pBt->autoVacuum || iMeta==0 );
8725       assert( iMeta==0 || iMeta==1 );
8726       pBt->incrVacuum = (u8)iMeta;
8727     }
8728 #endif
8729   }
8730   sqlite3BtreeLeave(p);
8731   return rc;
8732 }
8733 
8734 #ifndef SQLITE_OMIT_BTREECOUNT
8735 /*
8736 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
8737 ** number of entries in the b-tree and write the result to *pnEntry.
8738 **
8739 ** SQLITE_OK is returned if the operation is successfully executed.
8740 ** Otherwise, if an error is encountered (i.e. an IO error or database
8741 ** corruption) an SQLite error code is returned.
8742 */
8743 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
8744   i64 nEntry = 0;                      /* Value to return in *pnEntry */
8745   int rc;                              /* Return code */
8746 
8747   if( pCur->pgnoRoot==0 ){
8748     *pnEntry = 0;
8749     return SQLITE_OK;
8750   }
8751   rc = moveToRoot(pCur);
8752 
8753   /* Unless an error occurs, the following loop runs one iteration for each
8754   ** page in the B-Tree structure (not including overflow pages).
8755   */
8756   while( rc==SQLITE_OK ){
8757     int iIdx;                          /* Index of child node in parent */
8758     MemPage *pPage;                    /* Current page of the b-tree */
8759 
8760     /* If this is a leaf page or the tree is not an int-key tree, then
8761     ** this page contains countable entries. Increment the entry counter
8762     ** accordingly.
8763     */
8764     pPage = pCur->apPage[pCur->iPage];
8765     if( pPage->leaf || !pPage->intKey ){
8766       nEntry += pPage->nCell;
8767     }
8768 
8769     /* pPage is a leaf node. This loop navigates the cursor so that it
8770     ** points to the first interior cell that it points to the parent of
8771     ** the next page in the tree that has not yet been visited. The
8772     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
8773     ** of the page, or to the number of cells in the page if the next page
8774     ** to visit is the right-child of its parent.
8775     **
8776     ** If all pages in the tree have been visited, return SQLITE_OK to the
8777     ** caller.
8778     */
8779     if( pPage->leaf ){
8780       do {
8781         if( pCur->iPage==0 ){
8782           /* All pages of the b-tree have been visited. Return successfully. */
8783           *pnEntry = nEntry;
8784           return moveToRoot(pCur);
8785         }
8786         moveToParent(pCur);
8787       }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
8788 
8789       pCur->aiIdx[pCur->iPage]++;
8790       pPage = pCur->apPage[pCur->iPage];
8791     }
8792 
8793     /* Descend to the child node of the cell that the cursor currently
8794     ** points at. This is the right-child if (iIdx==pPage->nCell).
8795     */
8796     iIdx = pCur->aiIdx[pCur->iPage];
8797     if( iIdx==pPage->nCell ){
8798       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
8799     }else{
8800       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
8801     }
8802   }
8803 
8804   /* An error has occurred. Return an error code. */
8805   return rc;
8806 }
8807 #endif
8808 
8809 /*
8810 ** Return the pager associated with a BTree.  This routine is used for
8811 ** testing and debugging only.
8812 */
8813 Pager *sqlite3BtreePager(Btree *p){
8814   return p->pBt->pPager;
8815 }
8816 
8817 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
8818 /*
8819 ** Append a message to the error message string.
8820 */
8821 static void checkAppendMsg(
8822   IntegrityCk *pCheck,
8823   const char *zFormat,
8824   ...
8825 ){
8826   va_list ap;
8827   if( !pCheck->mxErr ) return;
8828   pCheck->mxErr--;
8829   pCheck->nErr++;
8830   va_start(ap, zFormat);
8831   if( pCheck->errMsg.nChar ){
8832     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
8833   }
8834   if( pCheck->zPfx ){
8835     sqlite3XPrintf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
8836   }
8837   sqlite3VXPrintf(&pCheck->errMsg, zFormat, ap);
8838   va_end(ap);
8839   if( pCheck->errMsg.accError==STRACCUM_NOMEM ){
8840     pCheck->mallocFailed = 1;
8841   }
8842 }
8843 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
8844 
8845 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
8846 
8847 /*
8848 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
8849 ** corresponds to page iPg is already set.
8850 */
8851 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
8852   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
8853   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
8854 }
8855 
8856 /*
8857 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
8858 */
8859 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
8860   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
8861   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
8862 }
8863 
8864 
8865 /*
8866 ** Add 1 to the reference count for page iPage.  If this is the second
8867 ** reference to the page, add an error message to pCheck->zErrMsg.
8868 ** Return 1 if there are 2 or more references to the page and 0 if
8869 ** if this is the first reference to the page.
8870 **
8871 ** Also check that the page number is in bounds.
8872 */
8873 static int checkRef(IntegrityCk *pCheck, Pgno iPage){
8874   if( iPage==0 ) return 1;
8875   if( iPage>pCheck->nPage ){
8876     checkAppendMsg(pCheck, "invalid page number %d", iPage);
8877     return 1;
8878   }
8879   if( getPageReferenced(pCheck, iPage) ){
8880     checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
8881     return 1;
8882   }
8883   setPageReferenced(pCheck, iPage);
8884   return 0;
8885 }
8886 
8887 #ifndef SQLITE_OMIT_AUTOVACUUM
8888 /*
8889 ** Check that the entry in the pointer-map for page iChild maps to
8890 ** page iParent, pointer type ptrType. If not, append an error message
8891 ** to pCheck.
8892 */
8893 static void checkPtrmap(
8894   IntegrityCk *pCheck,   /* Integrity check context */
8895   Pgno iChild,           /* Child page number */
8896   u8 eType,              /* Expected pointer map type */
8897   Pgno iParent           /* Expected pointer map parent page number */
8898 ){
8899   int rc;
8900   u8 ePtrmapType;
8901   Pgno iPtrmapParent;
8902 
8903   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
8904   if( rc!=SQLITE_OK ){
8905     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
8906     checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
8907     return;
8908   }
8909 
8910   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
8911     checkAppendMsg(pCheck,
8912       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
8913       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
8914   }
8915 }
8916 #endif
8917 
8918 /*
8919 ** Check the integrity of the freelist or of an overflow page list.
8920 ** Verify that the number of pages on the list is N.
8921 */
8922 static void checkList(
8923   IntegrityCk *pCheck,  /* Integrity checking context */
8924   int isFreeList,       /* True for a freelist.  False for overflow page list */
8925   int iPage,            /* Page number for first page in the list */
8926   int N                 /* Expected number of pages in the list */
8927 ){
8928   int i;
8929   int expected = N;
8930   int iFirst = iPage;
8931   while( N-- > 0 && pCheck->mxErr ){
8932     DbPage *pOvflPage;
8933     unsigned char *pOvflData;
8934     if( iPage<1 ){
8935       checkAppendMsg(pCheck,
8936          "%d of %d pages missing from overflow list starting at %d",
8937           N+1, expected, iFirst);
8938       break;
8939     }
8940     if( checkRef(pCheck, iPage) ) break;
8941     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
8942       checkAppendMsg(pCheck, "failed to get page %d", iPage);
8943       break;
8944     }
8945     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
8946     if( isFreeList ){
8947       int n = get4byte(&pOvflData[4]);
8948 #ifndef SQLITE_OMIT_AUTOVACUUM
8949       if( pCheck->pBt->autoVacuum ){
8950         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
8951       }
8952 #endif
8953       if( n>(int)pCheck->pBt->usableSize/4-2 ){
8954         checkAppendMsg(pCheck,
8955            "freelist leaf count too big on page %d", iPage);
8956         N--;
8957       }else{
8958         for(i=0; i<n; i++){
8959           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
8960 #ifndef SQLITE_OMIT_AUTOVACUUM
8961           if( pCheck->pBt->autoVacuum ){
8962             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
8963           }
8964 #endif
8965           checkRef(pCheck, iFreePage);
8966         }
8967         N -= n;
8968       }
8969     }
8970 #ifndef SQLITE_OMIT_AUTOVACUUM
8971     else{
8972       /* If this database supports auto-vacuum and iPage is not the last
8973       ** page in this overflow list, check that the pointer-map entry for
8974       ** the following page matches iPage.
8975       */
8976       if( pCheck->pBt->autoVacuum && N>0 ){
8977         i = get4byte(pOvflData);
8978         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
8979       }
8980     }
8981 #endif
8982     iPage = get4byte(pOvflData);
8983     sqlite3PagerUnref(pOvflPage);
8984 
8985     if( isFreeList && N<(iPage!=0) ){
8986       checkAppendMsg(pCheck, "free-page count in header is too small");
8987     }
8988   }
8989 }
8990 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
8991 
/*
** A min-heap of unsigned 32-bit values stored in a flat array.
**
** Layout:  aHeap[0] holds the number of values currently on the heap and
** aHeap[1] is the root.  The two children of aHeap[N] are aHeap[N*2] and
** aHeap[N*2+1].
**
** Invariant:  no node is greater than either of its children, hence the
** root aHeap[1] is always the smallest value on the heap.
**
** btreeHeapInsert() adds a value while keeping the invariant.
** btreeHeapPull() removes and returns the root (the minimum) and then
** rearranges the remaining nodes so that the invariant holds again.
**
** The heap is used for cell overlap and coverage testing.  Each entry
** encodes the byte range occupied by one cell or freeblock of a btree
** page: the upper 16 bits are the offset of the first byte and the lower
** 16 bits are the offset of the last byte.
*/
static void btreeHeapInsert(u32 *aHeap, u32 x){
  u32 iNode = ++aHeap[0];            /* Slot that currently holds x */
  aHeap[iNode] = x;
  /* Sift the new value up while its parent is strictly larger */
  while( iNode>1 ){
    u32 iUp = iNode/2;               /* Parent slot */
    u32 tmp = aHeap[iUp];
    if( tmp<=aHeap[iNode] ) break;
    aHeap[iUp] = aHeap[iNode];
    aHeap[iNode] = tmp;
    iNode = iUp;
  }
}
/*
** Remove the smallest value from the min-heap aHeap[] and store it in
** *pOut.  Returns 1 if a value was removed, or 0 (leaving *pOut alone)
** if the heap was empty.
**
** The last element is moved to the root and sifted down.  The slot it
** vacated is overwritten with 0xffffffff so that the sift-down loop may
** read one slot past the end of the live heap when a node has only a
** left child.
*/
static int btreeHeapPull(u32 *aHeap, u32 *pOut){
  u32 nElem = aHeap[0];              /* Heap size before the pull */
  u32 iParent;                       /* Node being sifted down */
  u32 iChild;                        /* Smaller child of iParent */

  if( nElem==0 ) return 0;
  *pOut = aHeap[1];
  aHeap[1] = aHeap[nElem];
  aHeap[nElem] = 0xffffffff;
  aHeap[0]--;
  for(iParent=1; (iChild = iParent*2)<=aHeap[0]; iParent=iChild){
    u32 tmp;
    if( aHeap[iChild]>aHeap[iChild+1] ) iChild++;
    if( aHeap[iParent]<aHeap[iChild] ) break;
    tmp = aHeap[iParent];
    aHeap[iParent] = aHeap[iChild];
    aHeap[iChild] = tmp;
  }
  return 1;
}
9042 
9043 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9044 /*
9045 ** Do various sanity checks on a single page of a tree.  Return
9046 ** the tree depth.  Root pages return 0.  Parents of root pages
9047 ** return 1, and so forth.
9048 **
9049 ** These checks are done:
9050 **
9051 **      1.  Make sure that cells and freeblocks do not overlap
9052 **          but combine to completely cover the page.
9053 **      2.  Make sure integer cell keys are in order.
9054 **      3.  Check the integrity of overflow pages.
9055 **      4.  Recursively call checkTreePage on all children.
9056 **      5.  Verify that the depth of all children is the same.
9057 */
9058 static int checkTreePage(
9059   IntegrityCk *pCheck,  /* Context for the sanity check */
9060   int iPage,            /* Page number of the page to check */
9061   i64 *piMinKey,        /* Write minimum integer primary key here */
9062   i64 maxKey            /* Error if integer primary key greater than this */
9063 ){
9064   MemPage *pPage = 0;      /* The page being analyzed */
9065   int i;                   /* Loop counter */
9066   int rc;                  /* Result code from subroutine call */
9067   int depth = -1, d2;      /* Depth of a subtree */
9068   int pgno;                /* Page number */
9069   int nFrag;               /* Number of fragmented bytes on the page */
9070   int hdr;                 /* Offset to the page header */
9071   int cellStart;           /* Offset to the start of the cell pointer array */
9072   int nCell;               /* Number of cells */
9073   int doCoverageCheck = 1; /* True if cell coverage checking should be done */
9074   int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
9075                            ** False if IPK must be strictly less than maxKey */
9076   u8 *data;                /* Page content */
9077   u8 *pCell;               /* Cell content */
9078   u8 *pCellIdx;            /* Next element of the cell pointer array */
9079   BtShared *pBt;           /* The BtShared object that owns pPage */
9080   u32 pc;                  /* Address of a cell */
9081   u32 usableSize;          /* Usable size of the page */
9082   u32 contentOffset;       /* Offset to the start of the cell content area */
9083   u32 *heap = 0;           /* Min-heap used for checking cell coverage */
9084   u32 x, prev = 0;         /* Next and previous entry on the min-heap */
9085   const char *saved_zPfx = pCheck->zPfx;
9086   int saved_v1 = pCheck->v1;
9087   int saved_v2 = pCheck->v2;
9088   u8 savedIsInit = 0;
9089 
9090   /* Check that the page exists
9091   */
9092   pBt = pCheck->pBt;
9093   usableSize = pBt->usableSize;
9094   if( iPage==0 ) return 0;
9095   if( checkRef(pCheck, iPage) ) return 0;
9096   pCheck->zPfx = "Page %d: ";
9097   pCheck->v1 = iPage;
9098   if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
9099     checkAppendMsg(pCheck,
9100        "unable to get the page. error code=%d", rc);
9101     goto end_of_check;
9102   }
9103 
9104   /* Clear MemPage.isInit to make sure the corruption detection code in
9105   ** btreeInitPage() is executed.  */
9106   savedIsInit = pPage->isInit;
9107   pPage->isInit = 0;
9108   if( (rc = btreeInitPage(pPage))!=0 ){
9109     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
9110     checkAppendMsg(pCheck,
9111                    "btreeInitPage() returns error code %d", rc);
9112     goto end_of_check;
9113   }
9114   data = pPage->aData;
9115   hdr = pPage->hdrOffset;
9116 
9117   /* Set up for cell analysis */
9118   pCheck->zPfx = "On tree page %d cell %d: ";
9119   contentOffset = get2byteNotZero(&data[hdr+5]);
9120   assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
9121 
9122   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
9123   ** number of cells on the page. */
9124   nCell = get2byte(&data[hdr+3]);
9125   assert( pPage->nCell==nCell );
9126 
9127   /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
9128   ** immediately follows the b-tree page header. */
9129   cellStart = hdr + 12 - 4*pPage->leaf;
9130   assert( pPage->aCellIdx==&data[cellStart] );
9131   pCellIdx = &data[cellStart + 2*(nCell-1)];
9132 
9133   if( !pPage->leaf ){
9134     /* Analyze the right-child page of internal pages */
9135     pgno = get4byte(&data[hdr+8]);
9136 #ifndef SQLITE_OMIT_AUTOVACUUM
9137     if( pBt->autoVacuum ){
9138       pCheck->zPfx = "On page %d at right child: ";
9139       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
9140     }
9141 #endif
9142     depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
9143     keyCanBeEqual = 0;
9144   }else{
9145     /* For leaf pages, the coverage check will occur in the same loop
9146     ** as the other cell checks, so initialize the heap.  */
9147     heap = pCheck->heap;
9148     heap[0] = 0;
9149   }
9150 
9151   /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
9152   ** integer offsets to the cell contents. */
9153   for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
9154     CellInfo info;
9155 
9156     /* Check cell size */
9157     pCheck->v2 = i;
9158     assert( pCellIdx==&data[cellStart + i*2] );
9159     pc = get2byteAligned(pCellIdx);
9160     pCellIdx -= 2;
9161     if( pc<contentOffset || pc>usableSize-4 ){
9162       checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
9163                              pc, contentOffset, usableSize-4);
9164       doCoverageCheck = 0;
9165       continue;
9166     }
9167     pCell = &data[pc];
9168     pPage->xParseCell(pPage, pCell, &info);
9169     if( pc+info.nSize>usableSize ){
9170       checkAppendMsg(pCheck, "Extends off end of page");
9171       doCoverageCheck = 0;
9172       continue;
9173     }
9174 
9175     /* Check for integer primary key out of range */
9176     if( pPage->intKey ){
9177       if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
9178         checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
9179       }
9180       maxKey = info.nKey;
9181     }
9182 
9183     /* Check the content overflow list */
9184     if( info.nPayload>info.nLocal ){
9185       int nPage;       /* Number of pages on the overflow chain */
9186       Pgno pgnoOvfl;   /* First page of the overflow chain */
9187       assert( pc + info.nSize - 4 <= usableSize );
9188       nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
9189       pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
9190 #ifndef SQLITE_OMIT_AUTOVACUUM
9191       if( pBt->autoVacuum ){
9192         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
9193       }
9194 #endif
9195       checkList(pCheck, 0, pgnoOvfl, nPage);
9196     }
9197 
9198     if( !pPage->leaf ){
9199       /* Check sanity of left child page for internal pages */
9200       pgno = get4byte(pCell);
9201 #ifndef SQLITE_OMIT_AUTOVACUUM
9202       if( pBt->autoVacuum ){
9203         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
9204       }
9205 #endif
9206       d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
9207       keyCanBeEqual = 0;
9208       if( d2!=depth ){
9209         checkAppendMsg(pCheck, "Child page depth differs");
9210         depth = d2;
9211       }
9212     }else{
9213       /* Populate the coverage-checking heap for leaf pages */
9214       btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
9215     }
9216   }
9217   *piMinKey = maxKey;
9218 
9219   /* Check for complete coverage of the page
9220   */
9221   pCheck->zPfx = 0;
9222   if( doCoverageCheck && pCheck->mxErr>0 ){
9223     /* For leaf pages, the min-heap has already been initialized and the
9224     ** cells have already been inserted.  But for internal pages, that has
9225     ** not yet been done, so do it now */
9226     if( !pPage->leaf ){
9227       heap = pCheck->heap;
9228       heap[0] = 0;
9229       for(i=nCell-1; i>=0; i--){
9230         u32 size;
9231         pc = get2byteAligned(&data[cellStart+i*2]);
9232         size = pPage->xCellSize(pPage, &data[pc]);
9233         btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
9234       }
9235     }
9236     /* Add the freeblocks to the min-heap
9237     **
9238     ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
9239     ** is the offset of the first freeblock, or zero if there are no
9240     ** freeblocks on the page.
9241     */
9242     i = get2byte(&data[hdr+1]);
9243     while( i>0 ){
9244       int size, j;
9245       assert( (u32)i<=usableSize-4 );     /* Enforced by btreeInitPage() */
9246       size = get2byte(&data[i+2]);
9247       assert( (u32)(i+size)<=usableSize );  /* Enforced by btreeInitPage() */
9248       btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
9249       /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
9250       ** big-endian integer which is the offset in the b-tree page of the next
9251       ** freeblock in the chain, or zero if the freeblock is the last on the
9252       ** chain. */
9253       j = get2byte(&data[i]);
9254       /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
9255       ** increasing offset. */
9256       assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
9257       assert( (u32)j<=usableSize-4 );   /* Enforced by btreeInitPage() */
9258       i = j;
9259     }
9260     /* Analyze the min-heap looking for overlap between cells and/or
9261     ** freeblocks, and counting the number of untracked bytes in nFrag.
9262     **
9263     ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
9264     ** There is an implied first entry the covers the page header, the cell
9265     ** pointer index, and the gap between the cell pointer index and the start
9266     ** of cell content.
9267     **
9268     ** The loop below pulls entries from the min-heap in order and compares
9269     ** the start_address against the previous end_address.  If there is an
9270     ** overlap, that means bytes are used multiple times.  If there is a gap,
9271     ** that gap is added to the fragmentation count.
9272     */
9273     nFrag = 0;
9274     prev = contentOffset - 1;   /* Implied first min-heap entry */
9275     while( btreeHeapPull(heap,&x) ){
9276       if( (prev&0xffff)>=(x>>16) ){
9277         checkAppendMsg(pCheck,
9278           "Multiple uses for byte %u of page %d", x>>16, iPage);
9279         break;
9280       }else{
9281         nFrag += (x>>16) - (prev&0xffff) - 1;
9282         prev = x;
9283       }
9284     }
9285     nFrag += usableSize - (prev&0xffff) - 1;
9286     /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
9287     ** is stored in the fifth field of the b-tree page header.
9288     ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
9289     ** number of fragmented free bytes within the cell content area.
9290     */
9291     if( heap[0]==0 && nFrag!=data[hdr+7] ){
9292       checkAppendMsg(pCheck,
9293           "Fragmentation of %d bytes reported as %d on page %d",
9294           nFrag, data[hdr+7], iPage);
9295     }
9296   }
9297 
9298 end_of_check:
9299   if( !doCoverageCheck ) pPage->isInit = savedIsInit;
9300   releasePage(pPage);
9301   pCheck->zPfx = saved_zPfx;
9302   pCheck->v1 = saved_v1;
9303   pCheck->v2 = saved_v2;
9304   return depth+1;
9305 }
9306 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9307 
9308 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9309 /*
9310 ** This routine does a complete check of the given BTree file.  aRoot[] is
9311 ** an array of pages numbers were each page number is the root page of
9312 ** a table.  nRoot is the number of entries in aRoot.
9313 **
9314 ** A read-only or read-write transaction must be opened before calling
9315 ** this function.
9316 **
9317 ** Write the number of error seen in *pnErr.  Except for some memory
9318 ** allocation errors,  an error message held in memory obtained from
9319 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
9320 ** returned.  If a memory allocation error occurs, NULL is returned.
9321 */
9322 char *sqlite3BtreeIntegrityCheck(
9323   Btree *p,     /* The btree to be checked */
9324   int *aRoot,   /* An array of root pages numbers for individual trees */
9325   int nRoot,    /* Number of entries in aRoot[] */
9326   int mxErr,    /* Stop reporting errors after this many */
9327   int *pnErr    /* Write number of errors seen to this variable */
9328 ){
9329   Pgno i;
9330   IntegrityCk sCheck;
9331   BtShared *pBt = p->pBt;
9332   int savedDbFlags = pBt->db->flags;
9333   char zErr[100];
9334   VVA_ONLY( int nRef );
9335 
9336   sqlite3BtreeEnter(p);
9337   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
9338   VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
9339   assert( nRef>=0 );
9340   sCheck.pBt = pBt;
9341   sCheck.pPager = pBt->pPager;
9342   sCheck.nPage = btreePagecount(sCheck.pBt);
9343   sCheck.mxErr = mxErr;
9344   sCheck.nErr = 0;
9345   sCheck.mallocFailed = 0;
9346   sCheck.zPfx = 0;
9347   sCheck.v1 = 0;
9348   sCheck.v2 = 0;
9349   sCheck.aPgRef = 0;
9350   sCheck.heap = 0;
9351   sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
9352   sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
9353   if( sCheck.nPage==0 ){
9354     goto integrity_ck_cleanup;
9355   }
9356 
9357   sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
9358   if( !sCheck.aPgRef ){
9359     sCheck.mallocFailed = 1;
9360     goto integrity_ck_cleanup;
9361   }
9362   sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
9363   if( sCheck.heap==0 ){
9364     sCheck.mallocFailed = 1;
9365     goto integrity_ck_cleanup;
9366   }
9367 
9368   i = PENDING_BYTE_PAGE(pBt);
9369   if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
9370 
9371   /* Check the integrity of the freelist
9372   */
9373   sCheck.zPfx = "Main freelist: ";
9374   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
9375             get4byte(&pBt->pPage1->aData[36]));
9376   sCheck.zPfx = 0;
9377 
9378   /* Check all the tables.
9379   */
9380   testcase( pBt->db->flags & SQLITE_CellSizeCk );
9381   pBt->db->flags &= ~SQLITE_CellSizeCk;
9382   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
9383     i64 notUsed;
9384     if( aRoot[i]==0 ) continue;
9385 #ifndef SQLITE_OMIT_AUTOVACUUM
9386     if( pBt->autoVacuum && aRoot[i]>1 ){
9387       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
9388     }
9389 #endif
9390     checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
9391   }
9392   pBt->db->flags = savedDbFlags;
9393 
9394   /* Make sure every page in the file is referenced
9395   */
9396   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
9397 #ifdef SQLITE_OMIT_AUTOVACUUM
9398     if( getPageReferenced(&sCheck, i)==0 ){
9399       checkAppendMsg(&sCheck, "Page %d is never used", i);
9400     }
9401 #else
9402     /* If the database supports auto-vacuum, make sure no tables contain
9403     ** references to pointer-map pages.
9404     */
9405     if( getPageReferenced(&sCheck, i)==0 &&
9406        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
9407       checkAppendMsg(&sCheck, "Page %d is never used", i);
9408     }
9409     if( getPageReferenced(&sCheck, i)!=0 &&
9410        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
9411       checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
9412     }
9413 #endif
9414   }
9415 
9416   /* Clean  up and report errors.
9417   */
9418 integrity_ck_cleanup:
9419   sqlite3PageFree(sCheck.heap);
9420   sqlite3_free(sCheck.aPgRef);
9421   if( sCheck.mallocFailed ){
9422     sqlite3StrAccumReset(&sCheck.errMsg);
9423     sCheck.nErr++;
9424   }
9425   *pnErr = sCheck.nErr;
9426   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
9427   /* Make sure this analysis did not leave any unref() pages. */
9428   assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
9429   sqlite3BtreeLeave(p);
9430   return sqlite3StrAccumFinish(&sCheck.errMsg);
9431 }
9432 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9433 
9434 /*
9435 ** Return the full pathname of the underlying database file.  Return
9436 ** an empty string if the database is in-memory or a TEMP database.
9437 **
9438 ** The pager filename is invariant as long as the pager is
9439 ** open so it is safe to access without the BtShared mutex.
9440 */
9441 const char *sqlite3BtreeGetFilename(Btree *p){
9442   assert( p->pBt->pPager!=0 );
9443   return sqlite3PagerFilename(p->pBt->pPager, 1);
9444 }
9445 
9446 /*
9447 ** Return the pathname of the journal file for this database. The return
9448 ** value of this routine is the same regardless of whether the journal file
9449 ** has been created or not.
9450 **
9451 ** The pager journal filename is invariant as long as the pager is
9452 ** open so it is safe to access without the BtShared mutex.
9453 */
9454 const char *sqlite3BtreeGetJournalname(Btree *p){
9455   assert( p->pBt->pPager!=0 );
9456   return sqlite3PagerJournalname(p->pBt->pPager);
9457 }
9458 
9459 /*
9460 ** Return non-zero if a transaction is active.
9461 */
9462 int sqlite3BtreeIsInTrans(Btree *p){
9463   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
9464   return (p && (p->inTrans==TRANS_WRITE));
9465 }
9466 
9467 #ifndef SQLITE_OMIT_WAL
9468 /*
9469 ** Run a checkpoint on the Btree passed as the first argument.
9470 **
9471 ** Return SQLITE_LOCKED if this or any other connection has an open
9472 ** transaction on the shared-cache the argument Btree is connected to.
9473 **
9474 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
9475 */
9476 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
9477   int rc = SQLITE_OK;
9478   if( p ){
9479     BtShared *pBt = p->pBt;
9480     sqlite3BtreeEnter(p);
9481     if( pBt->inTransaction!=TRANS_NONE ){
9482       rc = SQLITE_LOCKED;
9483     }else{
9484       rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt);
9485     }
9486     sqlite3BtreeLeave(p);
9487   }
9488   return rc;
9489 }
9490 #endif
9491 
9492 /*
9493 ** Return non-zero if a read (or write) transaction is active.
9494 */
9495 int sqlite3BtreeIsInReadTrans(Btree *p){
9496   assert( p );
9497   assert( sqlite3_mutex_held(p->db->mutex) );
9498   return p->inTrans!=TRANS_NONE;
9499 }
9500 
9501 int sqlite3BtreeIsInBackup(Btree *p){
9502   assert( p );
9503   assert( sqlite3_mutex_held(p->db->mutex) );
9504   return p->nBackup!=0;
9505 }
9506 
9507 /*
9508 ** This function returns a pointer to a blob of memory associated with
9509 ** a single shared-btree. The memory is used by client code for its own
9510 ** purposes (for example, to store a high-level schema associated with
9511 ** the shared-btree). The btree layer manages reference counting issues.
9512 **
9513 ** The first time this is called on a shared-btree, nBytes bytes of memory
9514 ** are allocated, zeroed, and returned to the caller. For each subsequent
9515 ** call the nBytes parameter is ignored and a pointer to the same blob
9516 ** of memory returned.
9517 **
9518 ** If the nBytes parameter is 0 and the blob of memory has not yet been
9519 ** allocated, a null pointer is returned. If the blob has already been
9520 ** allocated, it is returned as normal.
9521 **
9522 ** Just before the shared-btree is closed, the function passed as the
9523 ** xFree argument when the memory allocation was made is invoked on the
9524 ** blob of allocated memory. The xFree function should not call sqlite3_free()
9525 ** on the memory, the btree layer does that.
9526 */
9527 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
9528   BtShared *pBt = p->pBt;
9529   sqlite3BtreeEnter(p);
9530   if( !pBt->pSchema && nBytes ){
9531     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
9532     pBt->xFreeSchema = xFree;
9533   }
9534   sqlite3BtreeLeave(p);
9535   return pBt->pSchema;
9536 }
9537 
9538 /*
9539 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
9540 ** btree as the argument handle holds an exclusive lock on the
9541 ** sqlite_master table. Otherwise SQLITE_OK.
9542 */
9543 int sqlite3BtreeSchemaLocked(Btree *p){
9544   int rc;
9545   assert( sqlite3_mutex_held(p->db->mutex) );
9546   sqlite3BtreeEnter(p);
9547   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
9548   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
9549   sqlite3BtreeLeave(p);
9550   return rc;
9551 }
9552 
9553 
9554 #ifndef SQLITE_OMIT_SHARED_CACHE
9555 /*
9556 ** Obtain a lock on the table whose root page is iTab.  The
9557 ** lock is a write lock if isWritelock is true or a read lock
9558 ** if it is false.
9559 */
9560 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
9561   int rc = SQLITE_OK;
9562   assert( p->inTrans!=TRANS_NONE );
9563   if( p->sharable ){
9564     u8 lockType = READ_LOCK + isWriteLock;
9565     assert( READ_LOCK+1==WRITE_LOCK );
9566     assert( isWriteLock==0 || isWriteLock==1 );
9567 
9568     sqlite3BtreeEnter(p);
9569     rc = querySharedCacheTableLock(p, iTab, lockType);
9570     if( rc==SQLITE_OK ){
9571       rc = setSharedCacheTableLock(p, iTab, lockType);
9572     }
9573     sqlite3BtreeLeave(p);
9574   }
9575   return rc;
9576 }
9577 #endif
9578 
9579 #ifndef SQLITE_OMIT_INCRBLOB
9580 /*
9581 ** Argument pCsr must be a cursor opened for writing on an
9582 ** INTKEY table currently pointing at a valid table entry.
9583 ** This function modifies the data stored as part of that entry.
9584 **
9585 ** Only the data content may only be modified, it is not possible to
9586 ** change the length of the data stored. If this function is called with
9587 ** parameters that attempt to write past the end of the existing data,
9588 ** no modifications are made and SQLITE_CORRUPT is returned.
9589 */
9590 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
9591   int rc;
9592   assert( cursorOwnsBtShared(pCsr) );
9593   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
9594   assert( pCsr->curFlags & BTCF_Incrblob );
9595 
9596   rc = restoreCursorPosition(pCsr);
9597   if( rc!=SQLITE_OK ){
9598     return rc;
9599   }
9600   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
9601   if( pCsr->eState!=CURSOR_VALID ){
9602     return SQLITE_ABORT;
9603   }
9604 
9605   /* Save the positions of all other cursors open on this table. This is
9606   ** required in case any of them are holding references to an xFetch
9607   ** version of the b-tree page modified by the accessPayload call below.
9608   **
9609   ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
9610   ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
9611   ** saveAllCursors can only return SQLITE_OK.
9612   */
9613   VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
9614   assert( rc==SQLITE_OK );
9615 
9616   /* Check some assumptions:
9617   **   (a) the cursor is open for writing,
9618   **   (b) there is a read/write transaction open,
9619   **   (c) the connection holds a write-lock on the table (if required),
9620   **   (d) there are no conflicting read-locks, and
9621   **   (e) the cursor points at a valid row of an intKey table.
9622   */
9623   if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
9624     return SQLITE_READONLY;
9625   }
9626   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
9627               && pCsr->pBt->inTransaction==TRANS_WRITE );
9628   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
9629   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
9630   assert( pCsr->apPage[pCsr->iPage]->intKey );
9631 
9632   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
9633 }
9634 
9635 /*
9636 ** Mark this cursor as an incremental blob cursor.
9637 */
9638 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
9639   pCur->curFlags |= BTCF_Incrblob;
9640   pCur->pBtree->hasIncrblobCur = 1;
9641 }
9642 #endif
9643 
9644 /*
9645 ** Set both the "read version" (single byte at byte offset 18) and
9646 ** "write version" (single byte at byte offset 19) fields in the database
9647 ** header to iVersion.
9648 */
9649 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
9650   BtShared *pBt = pBtree->pBt;
9651   int rc;                         /* Return code */
9652 
9653   assert( iVersion==1 || iVersion==2 );
9654 
9655   /* If setting the version fields to 1, do not automatically open the
9656   ** WAL connection, even if the version fields are currently set to 2.
9657   */
9658   pBt->btsFlags &= ~BTS_NO_WAL;
9659   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
9660 
9661   rc = sqlite3BtreeBeginTrans(pBtree, 0);
9662   if( rc==SQLITE_OK ){
9663     u8 *aData = pBt->pPage1->aData;
9664     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
9665       rc = sqlite3BtreeBeginTrans(pBtree, 2);
9666       if( rc==SQLITE_OK ){
9667         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
9668         if( rc==SQLITE_OK ){
9669           aData[18] = (u8)iVersion;
9670           aData[19] = (u8)iVersion;
9671         }
9672       }
9673     }
9674   }
9675 
9676   pBt->btsFlags &= ~BTS_NO_WAL;
9677   return rc;
9678 }
9679 
9680 /*
9681 ** Return true if the cursor has a hint specified.  This routine is
9682 ** only used from within assert() statements
9683 */
9684 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
9685   return (pCsr->hints & mask)!=0;
9686 }
9687 
9688 /*
9689 ** Return true if the given Btree is read-only.
9690 */
9691 int sqlite3BtreeIsReadonly(Btree *p){
9692   return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
9693 }
9694 
9695 /*
9696 ** Return the size of the header added to each page by this module.
9697 */
9698 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
9699 
9700 #if !defined(SQLITE_OMIT_SHARED_CACHE)
9701 /*
9702 ** Return true if the Btree passed as the only argument is sharable.
9703 */
9704 int sqlite3BtreeSharable(Btree *p){
9705   return p->sharable;
9706 }
9707 #endif
9708