xref: /sqlite-3.40.0/src/btree.c (revision eb4ac06f)
/*
** 2004 April 6
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btree.c,v 1.602 2009/04/30 13:30:33 drh Exp $
**
** This file implements an external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information,
** including a description of the file format and an overview of operation.
*/
18 #include "btreeInt.h"
19 
/*
** The header string that appears at the beginning of every
** SQLite database.  The text itself is supplied by the
** SQLITE_FILE_HEADER macro, which is defined outside this file.
*/
static const char zMagicHeader[] = SQLITE_FILE_HEADER;
25 
/*
** Optional tracing support.
**
** Change the "#if 0" below to "#if 1" to compile in the global variable
** sqlite3BtreeTrace and a TRACE() macro that prints whenever that
** variable is non-zero.  Otherwise TRACE() expands to nothing.
**
** The argument X must be a complete, parenthesized printf() argument
** list, for example:  TRACE(("page %d\n", pgno));
**
** The enabled form is wrapped in do{...}while(0) so that TRACE(...);
** is a single statement and can be used safely as the body of an
** unbraced if/else.
*/
#if 0
int sqlite3BtreeTrace=0;  /* True to enable tracing */
# define TRACE(X)  do{ if(sqlite3BtreeTrace){ printf X; fflush(stdout); } }while(0)
#else
# define TRACE(X)
#endif
36 
37 
38 
39 #ifndef SQLITE_OMIT_SHARED_CACHE
/*
** A list of BtShared objects that are eligible for participation
** in shared cache.  This variable has file scope during normal builds,
** but the test harness needs to access it so we make it global for
** test builds (builds in which SQLITE_TEST is defined).
**
** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
*/
#ifdef SQLITE_TEST
/* Test build: external linkage so the test harness can see the list. */
BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#else
/* Normal build: private to this file. */
static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#endif
53 #endif /* SQLITE_OMIT_SHARED_CACHE */
54 
55 #ifndef SQLITE_OMIT_SHARED_CACHE
56 /*
57 ** Enable or disable the shared pager and schema features.
58 **
59 ** This routine has no effect on existing database connections.
60 ** The shared cache setting effects only future calls to
61 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
62 */
63 int sqlite3_enable_shared_cache(int enable){
64   sqlite3GlobalConfig.sharedCacheEnabled = enable;
65   return SQLITE_OK;
66 }
67 #endif
68 
69 
70 /*
71 ** Forward declaration
72 */
73 static int checkForReadConflicts(Btree*, Pgno, BtCursor*, i64);
74 
75 
76 #ifdef SQLITE_OMIT_SHARED_CACHE
77   /*
78   ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
79   ** and clearAllSharedCacheTableLocks()
80   ** manipulate entries in the BtShared.pLock linked list used to store
81   ** shared-cache table level locks. If the library is compiled with the
82   ** shared-cache feature disabled, then there is only ever one user
83   ** of each BtShared structure and so this locking is not necessary.
84   ** So define the lock related functions as no-ops.
85   */
86   #define querySharedCacheTableLock(a,b,c) SQLITE_OK
87   #define setSharedCacheTableLock(a,b,c) SQLITE_OK
88   #define clearAllSharedCacheTableLocks(a)
89 #endif
90 
91 #ifndef SQLITE_OMIT_SHARED_CACHE
92 /*
93 ** Query to see if btree handle p may obtain a lock of type eLock
94 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
95 ** SQLITE_OK if the lock may be obtained (by calling
96 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
97 */
98 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
99   BtShared *pBt = p->pBt;
100   BtLock *pIter;
101 
102   assert( sqlite3BtreeHoldsMutex(p) );
103   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
104   assert( p->db!=0 );
105 
106   /* If requesting a write-lock, then the Btree must have an open write
107   ** transaction on this file. And, obviously, for this to be so there
108   ** must be an open write transaction on the file itself.
109   */
110   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
111   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
112 
113   /* This is a no-op if the shared-cache is not enabled */
114   if( !p->sharable ){
115     return SQLITE_OK;
116   }
117 
118   /* If some other connection is holding an exclusive lock, the
119   ** requested lock may not be obtained.
120   */
121   if( pBt->pWriter!=p && pBt->isExclusive ){
122     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
123     return SQLITE_LOCKED_SHAREDCACHE;
124   }
125 
126   /* This (along with setSharedCacheTableLock()) is where
127   ** the ReadUncommitted flag is dealt with.
128   ** If the caller is querying for a read-lock on any table
129   ** other than the sqlite_master table (table 1) and if the ReadUncommitted
130   ** flag is set, then the lock granted even if there are write-locks
131   ** on the table. If a write-lock is requested, the ReadUncommitted flag
132   ** is not considered.
133   **
134   ** In function setSharedCacheTableLock(), if a read-lock is demanded and the
135   ** ReadUncommitted flag is set, no entry is added to the locks list
136   ** (BtShared.pLock).
137   **
138   ** To summarize: If the ReadUncommitted flag is set, then read cursors
139   ** on non-schema tables do not create or respect table locks. The locking
140   ** procedure for a write-cursor does not change.
141   */
142   if(
143     0==(p->db->flags&SQLITE_ReadUncommitted) ||
144     eLock==WRITE_LOCK ||
145     iTab==MASTER_ROOT
146   ){
147     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
148       /* The condition (pIter->eLock!=eLock) in the following if(...)
149       ** statement is a simplification of:
150       **
151       **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
152       **
153       ** since we know that if eLock==WRITE_LOCK, then no other connection
154       ** may hold a WRITE_LOCK on any table in this file (since there can
155       ** only be a single writer).
156       */
157       assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
158       assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
159       if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
160         sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
161         if( eLock==WRITE_LOCK ){
162           assert( p==pBt->pWriter );
163           pBt->isPending = 1;
164         }
165         return SQLITE_LOCKED_SHAREDCACHE;
166       }
167     }
168   }
169   return SQLITE_OK;
170 }
171 #endif /* !SQLITE_OMIT_SHARED_CACHE */
172 
173 #ifndef SQLITE_OMIT_SHARED_CACHE
174 /*
175 ** Add a lock on the table with root-page iTable to the shared-btree used
176 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
177 ** WRITE_LOCK.
178 **
179 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
180 ** SQLITE_NOMEM may also be returned.
181 */
182 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
183   BtShared *pBt = p->pBt;
184   BtLock *pLock = 0;
185   BtLock *pIter;
186 
187   assert( sqlite3BtreeHoldsMutex(p) );
188   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
189   assert( p->db!=0 );
190 
191   /* This is a no-op if the shared-cache is not enabled */
192   if( !p->sharable ){
193     return SQLITE_OK;
194   }
195 
196   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
197 
198   /* If the read-uncommitted flag is set and a read-lock is requested on
199   ** a non-schema table, then the lock is always granted.  Return early
200   ** without adding an entry to the BtShared.pLock list. See
201   ** comment in function querySharedCacheTableLock() for more info
202   ** on handling the ReadUncommitted flag.
203   */
204   if(
205     (p->db->flags&SQLITE_ReadUncommitted) &&
206     (eLock==READ_LOCK) &&
207     iTable!=MASTER_ROOT
208   ){
209     return SQLITE_OK;
210   }
211 
212   /* First search the list for an existing lock on this table. */
213   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
214     if( pIter->iTable==iTable && pIter->pBtree==p ){
215       pLock = pIter;
216       break;
217     }
218   }
219 
220   /* If the above search did not find a BtLock struct associating Btree p
221   ** with table iTable, allocate one and link it into the list.
222   */
223   if( !pLock ){
224     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
225     if( !pLock ){
226       return SQLITE_NOMEM;
227     }
228     pLock->iTable = iTable;
229     pLock->pBtree = p;
230     pLock->pNext = pBt->pLock;
231     pBt->pLock = pLock;
232   }
233 
234   /* Set the BtLock.eLock variable to the maximum of the current lock
235   ** and the requested lock. This means if a write-lock was already held
236   ** and a read-lock requested, we don't incorrectly downgrade the lock.
237   */
238   assert( WRITE_LOCK>READ_LOCK );
239   if( eLock>pLock->eLock ){
240     pLock->eLock = eLock;
241   }
242 
243   return SQLITE_OK;
244 }
245 #endif /* !SQLITE_OMIT_SHARED_CACHE */
246 
247 #ifndef SQLITE_OMIT_SHARED_CACHE
248 /*
249 ** Release all the table locks (locks obtained via calls to
250 ** the setSharedCacheTableLock() procedure) held by Btree handle p.
251 **
252 ** This function assumes that handle p has an open read or write
253 ** transaction. If it does not, then the BtShared.isPending variable
254 ** may be incorrectly cleared.
255 */
256 static void clearAllSharedCacheTableLocks(Btree *p){
257   BtShared *pBt = p->pBt;
258   BtLock **ppIter = &pBt->pLock;
259 
260   assert( sqlite3BtreeHoldsMutex(p) );
261   assert( p->sharable || 0==*ppIter );
262   assert( p->inTrans>0 );
263 
264   while( *ppIter ){
265     BtLock *pLock = *ppIter;
266     assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree );
267     assert( pLock->pBtree->inTrans>=pLock->eLock );
268     if( pLock->pBtree==p ){
269       *ppIter = pLock->pNext;
270       sqlite3_free(pLock);
271     }else{
272       ppIter = &pLock->pNext;
273     }
274   }
275 
276   assert( pBt->isPending==0 || pBt->pWriter );
277   if( pBt->pWriter==p ){
278     pBt->pWriter = 0;
279     pBt->isExclusive = 0;
280     pBt->isPending = 0;
281   }else if( pBt->nTransaction==2 ){
282     /* This function is called when connection p is concluding its
283     ** transaction. If there currently exists a writer, and p is not
284     ** that writer, then the number of locks held by connections other
285     ** than the writer must be about to drop to zero. In this case
286     ** set the isPending flag to 0.
287     **
288     ** If there is not currently a writer, then BtShared.isPending must
289     ** be zero already. So this next line is harmless in that case.
290     */
291     pBt->isPending = 0;
292   }
293 }
294 #endif /* SQLITE_OMIT_SHARED_CACHE */
295 
296 static void releasePage(MemPage *pPage);  /* Forward reference */
297 
298 /*
299 ** Verify that the cursor holds a mutex on the BtShared
300 */
301 #ifndef NDEBUG
302 static int cursorHoldsMutex(BtCursor *p){
303   return sqlite3_mutex_held(p->pBt->mutex);
304 }
305 #endif
306 
307 
308 #ifndef SQLITE_OMIT_INCRBLOB
309 /*
310 ** Invalidate the overflow page-list cache for cursor pCur, if any.
311 */
312 static void invalidateOverflowCache(BtCursor *pCur){
313   assert( cursorHoldsMutex(pCur) );
314   sqlite3_free(pCur->aOverflow);
315   pCur->aOverflow = 0;
316 }
317 
318 /*
319 ** Invalidate the overflow page-list cache for all cursors opened
320 ** on the shared btree structure pBt.
321 */
322 static void invalidateAllOverflowCache(BtShared *pBt){
323   BtCursor *p;
324   assert( sqlite3_mutex_held(pBt->mutex) );
325   for(p=pBt->pCursor; p; p=p->pNext){
326     invalidateOverflowCache(p);
327   }
328 }
329 #else
330   #define invalidateOverflowCache(x)
331   #define invalidateAllOverflowCache(x)
332 #endif
333 
334 /*
335 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
336 ** when a page that previously contained data becomes a free-list leaf
337 ** page.
338 **
339 ** The BtShared.pHasContent bitvec exists to work around an obscure
340 ** bug caused by the interaction of two useful IO optimizations surrounding
341 ** free-list leaf pages:
342 **
343 **   1) When all data is deleted from a page and the page becomes
344 **      a free-list leaf page, the page is not written to the database
345 **      (as free-list leaf pages contain no meaningful data). Sometimes
346 **      such a page is not even journalled (as it will not be modified,
347 **      why bother journalling it?).
348 **
349 **   2) When a free-list leaf page is reused, its content is not read
350 **      from the database or written to the journal file (why should it
351 **      be, if it is not at all meaningful?).
352 **
353 ** By themselves, these optimizations work fine and provide a handy
354 ** performance boost to bulk delete or insert operations. However, if
355 ** a page is moved to the free-list and then reused within the same
356 ** transaction, a problem comes up. If the page is not journalled when
357 ** it is moved to the free-list and it is also not journalled when it
358 ** is extracted from the free-list and reused, then the original data
359 ** may be lost. In the event of a rollback, it may not be possible
360 ** to restore the database to its original configuration.
361 **
362 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
363 ** moved to become a free-list leaf page, the corresponding bit is
364 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
365 ** optimization 2 above is ommitted if the corresponding bit is already
366 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
367 ** at the end of every transaction.
368 */
369 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
370   int rc = SQLITE_OK;
371   if( !pBt->pHasContent ){
372     int nPage;
373     rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
374     if( rc==SQLITE_OK ){
375       pBt->pHasContent = sqlite3BitvecCreate((u32)nPage);
376       if( !pBt->pHasContent ){
377         rc = SQLITE_NOMEM;
378       }
379     }
380   }
381   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
382     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
383   }
384   return rc;
385 }
386 
387 /*
388 ** Query the BtShared.pHasContent vector.
389 **
390 ** This function is called when a free-list leaf page is removed from the
391 ** free-list for reuse. It returns false if it is safe to retrieve the
392 ** page from the pager layer with the 'no-content' flag set. True otherwise.
393 */
394 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
395   Bitvec *p = pBt->pHasContent;
396   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
397 }
398 
399 /*
400 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
401 ** invoked at the conclusion of each write-transaction.
402 */
403 static void btreeClearHasContent(BtShared *pBt){
404   sqlite3BitvecDestroy(pBt->pHasContent);
405   pBt->pHasContent = 0;
406 }
407 
408 /*
409 ** Save the current cursor position in the variables BtCursor.nKey
410 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
411 */
412 static int saveCursorPosition(BtCursor *pCur){
413   int rc;
414 
415   assert( CURSOR_VALID==pCur->eState );
416   assert( 0==pCur->pKey );
417   assert( cursorHoldsMutex(pCur) );
418 
419   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
420 
421   /* If this is an intKey table, then the above call to BtreeKeySize()
422   ** stores the integer key in pCur->nKey. In this case this value is
423   ** all that is required. Otherwise, if pCur is not open on an intKey
424   ** table, then malloc space for and store the pCur->nKey bytes of key
425   ** data.
426   */
427   if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
428     void *pKey = sqlite3Malloc( (int)pCur->nKey );
429     if( pKey ){
430       rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
431       if( rc==SQLITE_OK ){
432         pCur->pKey = pKey;
433       }else{
434         sqlite3_free(pKey);
435       }
436     }else{
437       rc = SQLITE_NOMEM;
438     }
439   }
440   assert( !pCur->apPage[0]->intKey || !pCur->pKey );
441 
442   if( rc==SQLITE_OK ){
443     int i;
444     for(i=0; i<=pCur->iPage; i++){
445       releasePage(pCur->apPage[i]);
446       pCur->apPage[i] = 0;
447     }
448     pCur->iPage = -1;
449     pCur->eState = CURSOR_REQUIRESEEK;
450   }
451 
452   invalidateOverflowCache(pCur);
453   return rc;
454 }
455 
456 /*
457 ** Save the positions of all cursors except pExcept open on the table
458 ** with root-page iRoot. Usually, this is called just before cursor
459 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
460 */
461 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
462   BtCursor *p;
463   assert( sqlite3_mutex_held(pBt->mutex) );
464   assert( pExcept==0 || pExcept->pBt==pBt );
465   for(p=pBt->pCursor; p; p=p->pNext){
466     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
467         p->eState==CURSOR_VALID ){
468       int rc = saveCursorPosition(p);
469       if( SQLITE_OK!=rc ){
470         return rc;
471       }
472     }
473   }
474   return SQLITE_OK;
475 }
476 
477 /*
478 ** Clear the current cursor position.
479 */
480 void sqlite3BtreeClearCursor(BtCursor *pCur){
481   assert( cursorHoldsMutex(pCur) );
482   sqlite3_free(pCur->pKey);
483   pCur->pKey = 0;
484   pCur->eState = CURSOR_INVALID;
485 }
486 
487 /*
488 ** Restore the cursor to the position it was in (or as close to as possible)
489 ** when saveCursorPosition() was called. Note that this call deletes the
490 ** saved position info stored by saveCursorPosition(), so there can be
491 ** at most one effective restoreCursorPosition() call after each
492 ** saveCursorPosition().
493 */
494 int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
495   int rc;
496   assert( cursorHoldsMutex(pCur) );
497   assert( pCur->eState>=CURSOR_REQUIRESEEK );
498   if( pCur->eState==CURSOR_FAULT ){
499     return pCur->skip;
500   }
501   pCur->eState = CURSOR_INVALID;
502   rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
503   if( rc==SQLITE_OK ){
504     sqlite3_free(pCur->pKey);
505     pCur->pKey = 0;
506     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
507   }
508   return rc;
509 }
510 
/*
** Restore the position of cursor p, but only if its state is
** CURSOR_REQUIRESEEK or greater.  In that case the macro evaluates to
** the result of sqlite3BtreeRestoreCursorPosition(p).  Otherwise it
** evaluates to SQLITE_OK without calling anything.
**
** The argument is parenthesized so that any pointer expression can be
** passed.  Note that p may be evaluated twice.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         sqlite3BtreeRestoreCursorPosition(p) : \
         SQLITE_OK)
515 
516 /*
517 ** Determine whether or not a cursor has moved from the position it
518 ** was last placed at.  Cursors can move when the row they are pointing
519 ** at is deleted out from under them.
520 **
521 ** This routine returns an error code if something goes wrong.  The
522 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
523 */
524 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
525   int rc;
526 
527   rc = restoreCursorPosition(pCur);
528   if( rc ){
529     *pHasMoved = 1;
530     return rc;
531   }
532   if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){
533     *pHasMoved = 1;
534   }else{
535     *pHasMoved = 0;
536   }
537   return SQLITE_OK;
538 }
539 
540 #ifndef SQLITE_OMIT_AUTOVACUUM
541 /*
542 ** Given a page number of a regular database page, return the page
543 ** number for the pointer-map page that contains the entry for the
544 ** input page number.
545 */
546 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
547   int nPagesPerMapPage;
548   Pgno iPtrMap, ret;
549   assert( sqlite3_mutex_held(pBt->mutex) );
550   nPagesPerMapPage = (pBt->usableSize/5)+1;
551   iPtrMap = (pgno-2)/nPagesPerMapPage;
552   ret = (iPtrMap*nPagesPerMapPage) + 2;
553   if( ret==PENDING_BYTE_PAGE(pBt) ){
554     ret++;
555   }
556   return ret;
557 }
558 
559 /*
560 ** Write an entry into the pointer map.
561 **
562 ** This routine updates the pointer map entry for page number 'key'
563 ** so that it maps to type 'eType' and parent page number 'pgno'.
564 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
565 */
566 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
567   DbPage *pDbPage;  /* The pointer map page */
568   u8 *pPtrmap;      /* The pointer map data */
569   Pgno iPtrmap;     /* The pointer map page number */
570   int offset;       /* Offset in pointer map page */
571   int rc;
572 
573   assert( sqlite3_mutex_held(pBt->mutex) );
574   /* The master-journal page number must never be used as a pointer map page */
575   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
576 
577   assert( pBt->autoVacuum );
578   if( key==0 ){
579     return SQLITE_CORRUPT_BKPT;
580   }
581   iPtrmap = PTRMAP_PAGENO(pBt, key);
582   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
583   if( rc!=SQLITE_OK ){
584     return rc;
585   }
586   offset = PTRMAP_PTROFFSET(iPtrmap, key);
587   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
588 
589   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
590     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
591     rc = sqlite3PagerWrite(pDbPage);
592     if( rc==SQLITE_OK ){
593       pPtrmap[offset] = eType;
594       put4byte(&pPtrmap[offset+1], parent);
595     }
596   }
597 
598   sqlite3PagerUnref(pDbPage);
599   return rc;
600 }
601 
602 /*
603 ** Read an entry from the pointer map.
604 **
605 ** This routine retrieves the pointer map entry for page 'key', writing
606 ** the type and parent page number to *pEType and *pPgno respectively.
607 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
608 */
609 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
610   DbPage *pDbPage;   /* The pointer map page */
611   int iPtrmap;       /* Pointer map page index */
612   u8 *pPtrmap;       /* Pointer map page data */
613   int offset;        /* Offset of entry in pointer map */
614   int rc;
615 
616   assert( sqlite3_mutex_held(pBt->mutex) );
617 
618   iPtrmap = PTRMAP_PAGENO(pBt, key);
619   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
620   if( rc!=0 ){
621     return rc;
622   }
623   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
624 
625   offset = PTRMAP_PTROFFSET(iPtrmap, key);
626   assert( pEType!=0 );
627   *pEType = pPtrmap[offset];
628   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
629 
630   sqlite3PagerUnref(pDbPage);
631   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
632   return SQLITE_OK;
633 }
634 
635 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
636   #define ptrmapPut(w,x,y,z) SQLITE_OK
637   #define ptrmapGet(w,x,y,z) SQLITE_OK
638   #define ptrmapPutOvfl(y,z) SQLITE_OK
639 #endif
640 
/*
** Given a btree page PG and a cell index IDX (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** The 2-byte entry at aData[cellOffset+2*IDX] is read with get2byte(),
** masked with maskPage, and used as an offset from the start of aData.
**
** This routine works only for pages that do not contain overflow cells.
*/
#define findCell(PG,IDX) \
  ((PG)->aData + \
    ((PG)->maskPage & get2byte((PG)->aData + (PG)->cellOffset + 2*(IDX))))
650 
651 /*
652 ** This a more complex version of findCell() that works for
653 ** pages that do contain overflow cells.  See insert
654 */
655 static u8 *findOverflowCell(MemPage *pPage, int iCell){
656   int i;
657   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
658   for(i=pPage->nOverflow-1; i>=0; i--){
659     int k;
660     struct _OvflCell *pOvfl;
661     pOvfl = &pPage->aOvfl[i];
662     k = pOvfl->idx;
663     if( k<=iCell ){
664       if( k==iCell ){
665         return pOvfl->pCell;
666       }
667       iCell--;
668     }
669   }
670   return findCell(pPage, iCell);
671 }
672 
673 /*
674 ** Parse a cell content block and fill in the CellInfo structure.  There
675 ** are two versions of this function.  sqlite3BtreeParseCell() takes a
676 ** cell index as the second argument and sqlite3BtreeParseCellPtr()
677 ** takes a pointer to the body of the cell as its second argument.
678 **
679 ** Within this file, the parseCell() macro can be called instead of
680 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
681 */
682 void sqlite3BtreeParseCellPtr(
683   MemPage *pPage,         /* Page containing the cell */
684   u8 *pCell,              /* Pointer to the cell text. */
685   CellInfo *pInfo         /* Fill in this structure */
686 ){
687   u16 n;                  /* Number bytes in cell content header */
688   u32 nPayload;           /* Number of bytes of cell payload */
689 
690   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
691 
692   pInfo->pCell = pCell;
693   assert( pPage->leaf==0 || pPage->leaf==1 );
694   n = pPage->childPtrSize;
695   assert( n==4-4*pPage->leaf );
696   if( pPage->intKey ){
697     if( pPage->hasData ){
698       n += getVarint32(&pCell[n], nPayload);
699     }else{
700       nPayload = 0;
701     }
702     n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
703     pInfo->nData = nPayload;
704   }else{
705     pInfo->nData = 0;
706     n += getVarint32(&pCell[n], nPayload);
707     pInfo->nKey = nPayload;
708   }
709   pInfo->nPayload = nPayload;
710   pInfo->nHeader = n;
711   if( likely(nPayload<=pPage->maxLocal) ){
712     /* This is the (easy) common case where the entire payload fits
713     ** on the local page.  No overflow is required.
714     */
715     int nSize;          /* Total size of cell content in bytes */
716     nSize = nPayload + n;
717     pInfo->nLocal = (u16)nPayload;
718     pInfo->iOverflow = 0;
719     if( (nSize & ~3)==0 ){
720       nSize = 4;        /* Minimum cell size is 4 */
721     }
722     pInfo->nSize = (u16)nSize;
723   }else{
724     /* If the payload will not fit completely on the local page, we have
725     ** to decide how much to store locally and how much to spill onto
726     ** overflow pages.  The strategy is to minimize the amount of unused
727     ** space on overflow pages while keeping the amount of local storage
728     ** in between minLocal and maxLocal.
729     **
730     ** Warning:  changing the way overflow payload is distributed in any
731     ** way will result in an incompatible file format.
732     */
733     int minLocal;  /* Minimum amount of payload held locally */
734     int maxLocal;  /* Maximum amount of payload held locally */
735     int surplus;   /* Overflow payload available for local storage */
736 
737     minLocal = pPage->minLocal;
738     maxLocal = pPage->maxLocal;
739     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
740     if( surplus <= maxLocal ){
741       pInfo->nLocal = (u16)surplus;
742     }else{
743       pInfo->nLocal = (u16)minLocal;
744     }
745     pInfo->iOverflow = (u16)(pInfo->nLocal + n);
746     pInfo->nSize = pInfo->iOverflow + 4;
747   }
748 }
/* Parse the cell at index IDX of page PG into *OUT by locating it with
** findCell() and handing it to sqlite3BtreeParseCellPtr(). */
#define parseCell(PG, IDX, OUT) \
  sqlite3BtreeParseCellPtr((PG), findCell((PG), (IDX)), (OUT))
751 void sqlite3BtreeParseCell(
752   MemPage *pPage,         /* Page containing the cell */
753   int iCell,              /* The cell index.  First cell is 0 */
754   CellInfo *pInfo         /* Fill in this structure */
755 ){
756   parseCell(pPage, iCell, pInfo);
757 }
758 
759 /*
760 ** Compute the total number of bytes that a Cell needs in the cell
761 ** data area of the btree-page.  The return number includes the cell
762 ** data header and the local payload, but not any overflow page or
763 ** the space used by the cell pointer.
764 */
765 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
766   u8 *pIter = &pCell[pPage->childPtrSize];
767   u32 nSize;
768 
769 #ifdef SQLITE_DEBUG
770   /* The value returned by this function should always be the same as
771   ** the (CellInfo.nSize) value found by doing a full parse of the
772   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
773   ** this function verifies that this invariant is not violated. */
774   CellInfo debuginfo;
775   sqlite3BtreeParseCellPtr(pPage, pCell, &debuginfo);
776 #endif
777 
778   if( pPage->intKey ){
779     u8 *pEnd;
780     if( pPage->hasData ){
781       pIter += getVarint32(pIter, nSize);
782     }else{
783       nSize = 0;
784     }
785 
786     /* pIter now points at the 64-bit integer key value, a variable length
787     ** integer. The following block moves pIter to point at the first byte
788     ** past the end of the key value. */
789     pEnd = &pIter[9];
790     while( (*pIter++)&0x80 && pIter<pEnd );
791   }else{
792     pIter += getVarint32(pIter, nSize);
793   }
794 
795   if( nSize>pPage->maxLocal ){
796     int minLocal = pPage->minLocal;
797     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
798     if( nSize>pPage->maxLocal ){
799       nSize = minLocal;
800     }
801     nSize += 4;
802   }
803   nSize += (pIter - pCell);
804 
805   /* The minimum size of any cell is 4 bytes. */
806   if( nSize<4 ){
807     nSize = 4;
808   }
809 
810   assert( nSize==debuginfo.nSize );
811   return nSize;
812 }
813 #ifndef NDEBUG
814 static u16 cellSize(MemPage *pPage, int iCell){
815   return cellSizePtr(pPage, findCell(pPage, iCell));
816 }
817 #endif
818 
819 #ifndef SQLITE_OMIT_AUTOVACUUM
820 /*
821 ** If the cell pCell, part of page pPage contains a pointer
822 ** to an overflow page, insert an entry into the pointer-map
823 ** for the overflow page.
824 */
825 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
826   CellInfo info;
827   assert( pCell!=0 );
828   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
829   assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
830   if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
831     Pgno ovfl = get4byte(&pCell[info.iOverflow]);
832     return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
833   }
834   return SQLITE_OK;
835 }
836 /*
837 ** If the cell with index iCell on page pPage contains a pointer
838 ** to an overflow page, insert an entry into the pointer-map
839 ** for the overflow page.
840 */
841 static int ptrmapPutOvfl(MemPage *pPage, int iCell){
842   u8 *pCell;
843   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
844   pCell = findOverflowCell(pPage, iCell);
845   return ptrmapPutOvflPtr(pPage, pCell);
846 }
847 #endif
848 
849 
850 /*
851 ** Defragment the page given.  All Cells are moved to the
852 ** end of the page and all free space is collected into one
853 ** big FreeBlk that occurs in between the header and cell
854 ** pointer array and the cell content area.
855 */
856 static int defragmentPage(MemPage *pPage){
857   int i;                     /* Loop counter */
858   int pc;                    /* Address of a i-th cell */
859   int addr;                  /* Offset of first byte after cell pointer array */
860   int hdr;                   /* Offset to the page header */
861   int size;                  /* Size of a cell */
862   int usableSize;            /* Number of usable bytes on a page */
863   int cellOffset;            /* Offset to the cell pointer array */
864   int cbrk;                  /* Offset to the cell content area */
865   int nCell;                 /* Number of cells on the page */
866   unsigned char *data;       /* The page data */
867   unsigned char *temp;       /* Temp area for cell content */
868 
869   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
870   assert( pPage->pBt!=0 );
871   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
872   assert( pPage->nOverflow==0 );
873   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
874   temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
875   data = pPage->aData;
876   hdr = pPage->hdrOffset;
877   cellOffset = pPage->cellOffset;
878   nCell = pPage->nCell;
879   assert( nCell==get2byte(&data[hdr+3]) );
880   usableSize = pPage->pBt->usableSize;
881   cbrk = get2byte(&data[hdr+5]);
882   memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
883   cbrk = usableSize;
884   for(i=0; i<nCell; i++){
885     u8 *pAddr;     /* The i-th cell pointer */
886     pAddr = &data[cellOffset + i*2];
887     pc = get2byte(pAddr);
888     if( pc>=usableSize ){
889       return SQLITE_CORRUPT_BKPT;
890     }
891     size = cellSizePtr(pPage, &temp[pc]);
892     cbrk -= size;
893     if( cbrk<cellOffset+2*nCell || pc+size>usableSize ){
894       return SQLITE_CORRUPT_BKPT;
895     }
896     assert( cbrk+size<=usableSize && cbrk>=0 );
897     memcpy(&data[cbrk], &temp[pc], size);
898     put2byte(pAddr, cbrk);
899   }
900   assert( cbrk>=cellOffset+2*nCell );
901   put2byte(&data[hdr+5], cbrk);
902   data[hdr+1] = 0;
903   data[hdr+2] = 0;
904   data[hdr+7] = 0;
905   addr = cellOffset+2*nCell;
906   memset(&data[addr], 0, cbrk-addr);
907   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
908   if( cbrk-addr!=pPage->nFree ){
909     return SQLITE_CORRUPT_BKPT;
910   }
911   return SQLITE_OK;
912 }
913 
914 /*
915 ** Allocate nByte bytes of space from within the B-Tree page passed
916 ** as the first argument. Return the index into pPage->aData[] of the
917 ** first byte of allocated space.
918 **
919 ** The caller guarantees that the space between the end of the cell-offset
920 ** array and the start of the cell-content area is at least nByte bytes
921 ** in size. So this routine can never fail.
922 **
923 ** If there are already 60 or more bytes of fragments within the page,
924 ** the page is defragmented before returning. If this were not done there
925 ** is a chance that the number of fragmented bytes could eventually
926 ** overflow the single-byte field of the page-header in which this value
927 ** is stored.
928 */
929 static int allocateSpace(MemPage *pPage, int nByte){
930   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
931   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
932   int nFrag;                           /* Number of fragmented bytes on pPage */
933   int top;
934 
935   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
936   assert( pPage->pBt );
937   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
938   assert( nByte>=0 );  /* Minimum cell size is 4 */
939   assert( pPage->nFree>=nByte );
940   assert( pPage->nOverflow==0 );
941 
942   /* Assert that the space between the cell-offset array and the
943   ** cell-content area is greater than nByte bytes.
944   */
945   assert( nByte <= (
946       get2byte(&data[hdr+5])-(hdr+8+(pPage->leaf?0:4)+2*get2byte(&data[hdr+3]))
947   ));
948 
949   pPage->nFree -= (u16)nByte;
950   nFrag = data[hdr+7];
951   if( nFrag>=60 ){
952     defragmentPage(pPage);
953   }else{
954     /* Search the freelist looking for a free slot big enough to satisfy
955     ** the request. The allocation is made from the first free slot in
956     ** the list that is large enough to accomadate it.
957     */
958     int pc, addr;
959     for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
960       int size = get2byte(&data[pc+2]);     /* Size of free slot */
961       if( size>=nByte ){
962         int x = size - nByte;
963         if( x<4 ){
964           /* Remove the slot from the free-list. Update the number of
965           ** fragmented bytes within the page. */
966           memcpy(&data[addr], &data[pc], 2);
967           data[hdr+7] = (u8)(nFrag + x);
968         }else{
969           /* The slot remains on the free-list. Reduce its size to account
970           ** for the portion used by the new allocation. */
971           put2byte(&data[pc+2], x);
972         }
973         return pc + x;
974       }
975     }
976   }
977 
978   /* Allocate memory from the gap in between the cell pointer array
979   ** and the cell content area.
980   */
981   top = get2byte(&data[hdr+5]) - nByte;
982   put2byte(&data[hdr+5], top);
983   return top;
984 }
985 
986 /*
987 ** Return a section of the pPage->aData to the freelist.
988 ** The first byte of the new free block is pPage->aDisk[start]
989 ** and the size of the block is "size" bytes.
990 **
991 ** Most of the effort here is involved in coalesing adjacent
992 ** free blocks into a single big free block.
993 */
994 static int freeSpace(MemPage *pPage, int start, int size){
995   int addr, pbegin, hdr;
996   unsigned char *data = pPage->aData;
997 
998   assert( pPage->pBt!=0 );
999   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1000   assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
1001   assert( (start + size)<=pPage->pBt->usableSize );
1002   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1003   assert( size>=0 );   /* Minimum cell size is 4 */
1004 
1005 #ifdef SQLITE_SECURE_DELETE
1006   /* Overwrite deleted information with zeros when the SECURE_DELETE
1007   ** option is enabled at compile-time */
1008   memset(&data[start], 0, size);
1009 #endif
1010 
1011   /* Add the space back into the linked list of freeblocks */
1012   hdr = pPage->hdrOffset;
1013   addr = hdr + 1;
1014   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
1015     assert( pbegin<=pPage->pBt->usableSize-4 );
1016     if( pbegin<=addr ) {
1017       return SQLITE_CORRUPT_BKPT;
1018     }
1019     addr = pbegin;
1020   }
1021   if ( pbegin>pPage->pBt->usableSize-4 ) {
1022     return SQLITE_CORRUPT_BKPT;
1023   }
1024   assert( pbegin>addr || pbegin==0 );
1025   put2byte(&data[addr], start);
1026   put2byte(&data[start], pbegin);
1027   put2byte(&data[start+2], size);
1028   pPage->nFree += (u16)size;
1029 
1030   /* Coalesce adjacent free blocks */
1031   addr = pPage->hdrOffset + 1;
1032   while( (pbegin = get2byte(&data[addr]))>0 ){
1033     int pnext, psize, x;
1034     assert( pbegin>addr );
1035     assert( pbegin<=pPage->pBt->usableSize-4 );
1036     pnext = get2byte(&data[pbegin]);
1037     psize = get2byte(&data[pbegin+2]);
1038     if( pbegin + psize + 3 >= pnext && pnext>0 ){
1039       int frag = pnext - (pbegin+psize);
1040       if( (frag<0) || (frag>(int)data[pPage->hdrOffset+7]) ){
1041         return SQLITE_CORRUPT_BKPT;
1042       }
1043       data[pPage->hdrOffset+7] -= (u8)frag;
1044       x = get2byte(&data[pnext]);
1045       put2byte(&data[pbegin], x);
1046       x = pnext + get2byte(&data[pnext+2]) - pbegin;
1047       put2byte(&data[pbegin+2], x);
1048     }else{
1049       addr = pbegin;
1050     }
1051   }
1052 
1053   /* If the cell content area begins with a freeblock, remove it. */
1054   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
1055     int top;
1056     pbegin = get2byte(&data[hdr+1]);
1057     memcpy(&data[hdr+1], &data[pbegin], 2);
1058     top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
1059     put2byte(&data[hdr+5], top);
1060   }
1061   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1062   return SQLITE_OK;
1063 }
1064 
1065 /*
1066 ** Decode the flags byte (the first byte of the header) for a page
1067 ** and initialize fields of the MemPage structure accordingly.
1068 **
1069 ** Only the following combinations are supported.  Anything different
1070 ** indicates a corrupt database files:
1071 **
1072 **         PTF_ZERODATA
1073 **         PTF_ZERODATA | PTF_LEAF
1074 **         PTF_LEAFDATA | PTF_INTKEY
1075 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1076 */
1077 static int decodeFlags(MemPage *pPage, int flagByte){
1078   BtShared *pBt;     /* A copy of pPage->pBt */
1079 
1080   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1081   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1082   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1083   flagByte &= ~PTF_LEAF;
1084   pPage->childPtrSize = 4-4*pPage->leaf;
1085   pBt = pPage->pBt;
1086   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1087     pPage->intKey = 1;
1088     pPage->hasData = pPage->leaf;
1089     pPage->maxLocal = pBt->maxLeaf;
1090     pPage->minLocal = pBt->minLeaf;
1091   }else if( flagByte==PTF_ZERODATA ){
1092     pPage->intKey = 0;
1093     pPage->hasData = 0;
1094     pPage->maxLocal = pBt->maxLocal;
1095     pPage->minLocal = pBt->minLocal;
1096   }else{
1097     return SQLITE_CORRUPT_BKPT;
1098   }
1099   return SQLITE_OK;
1100 }
1101 
1102 /*
1103 ** Initialize the auxiliary information for a disk block.
1104 **
1105 ** Return SQLITE_OK on success.  If we see that the page does
1106 ** not contain a well-formed database page, then return
1107 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1108 ** guarantee that the page is well-formed.  It only shows that
1109 ** we failed to detect any corruption.
1110 */
1111 int sqlite3BtreeInitPage(MemPage *pPage){
1112 
1113   assert( pPage->pBt!=0 );
1114   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1115   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1116   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1117   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1118 
1119   if( !pPage->isInit ){
1120     u16 pc;            /* Address of a freeblock within pPage->aData[] */
1121     u8 hdr;            /* Offset to beginning of page header */
1122     u8 *data;          /* Equal to pPage->aData */
1123     BtShared *pBt;        /* The main btree structure */
1124     u16 usableSize;    /* Amount of usable space on each page */
1125     u16 cellOffset;    /* Offset from start of page to first cell pointer */
1126     u16 nFree;         /* Number of unused bytes on the page */
1127     u16 top;           /* First byte of the cell content area */
1128 
1129     pBt = pPage->pBt;
1130 
1131     hdr = pPage->hdrOffset;
1132     data = pPage->aData;
1133     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1134     assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1135     pPage->maskPage = pBt->pageSize - 1;
1136     pPage->nOverflow = 0;
1137     usableSize = pBt->usableSize;
1138     pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1139     top = get2byte(&data[hdr+5]);
1140     pPage->nCell = get2byte(&data[hdr+3]);
1141     if( pPage->nCell>MX_CELL(pBt) ){
1142       /* To many cells for a single page.  The page must be corrupt */
1143       return SQLITE_CORRUPT_BKPT;
1144     }
1145 
1146     /* Compute the total free space on the page */
1147     pc = get2byte(&data[hdr+1]);
1148     nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
1149     while( pc>0 ){
1150       u16 next, size;
1151       if( pc>usableSize-4 ){
1152         /* Free block is off the page */
1153         return SQLITE_CORRUPT_BKPT;
1154       }
1155       next = get2byte(&data[pc]);
1156       size = get2byte(&data[pc+2]);
1157       if( next>0 && next<=pc+size+3 ){
1158         /* Free blocks must be in accending order */
1159         return SQLITE_CORRUPT_BKPT;
1160       }
1161       nFree += size;
1162       pc = next;
1163     }
1164     pPage->nFree = (u16)nFree;
1165     if( nFree>=usableSize ){
1166       /* Free space cannot exceed total page size */
1167       return SQLITE_CORRUPT_BKPT;
1168     }
1169 
1170 #if 0
1171   /* Check that all the offsets in the cell offset array are within range.
1172   **
1173   ** Omitting this consistency check and using the pPage->maskPage mask
1174   ** to prevent overrunning the page buffer in findCell() results in a
1175   ** 2.5% performance gain.
1176   */
1177   {
1178     u8 *pOff;        /* Iterator used to check all cell offsets are in range */
1179     u8 *pEnd;        /* Pointer to end of cell offset array */
1180     u8 mask;         /* Mask of bits that must be zero in MSB of cell offsets */
1181     mask = ~(((u8)(pBt->pageSize>>8))-1);
1182     pEnd = &data[cellOffset + pPage->nCell*2];
1183     for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
1184     if( pOff!=pEnd ){
1185       return SQLITE_CORRUPT_BKPT;
1186     }
1187   }
1188 #endif
1189 
1190     pPage->isInit = 1;
1191   }
1192   return SQLITE_OK;
1193 }
1194 
1195 /*
1196 ** Set up a raw page so that it looks like a database page holding
1197 ** no entries.
1198 */
1199 static void zeroPage(MemPage *pPage, int flags){
1200   unsigned char *data = pPage->aData;
1201   BtShared *pBt = pPage->pBt;
1202   u8 hdr = pPage->hdrOffset;
1203   u16 first;
1204 
1205   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1206   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1207   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1208   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1209   assert( sqlite3_mutex_held(pBt->mutex) );
1210   /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
1211   data[hdr] = (char)flags;
1212   first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0);
1213   memset(&data[hdr+1], 0, 4);
1214   data[hdr+7] = 0;
1215   put2byte(&data[hdr+5], pBt->usableSize);
1216   pPage->nFree = pBt->usableSize - first;
1217   decodeFlags(pPage, flags);
1218   pPage->hdrOffset = hdr;
1219   pPage->cellOffset = first;
1220   pPage->nOverflow = 0;
1221   assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1222   pPage->maskPage = pBt->pageSize - 1;
1223   pPage->nCell = 0;
1224   pPage->isInit = 1;
1225 }
1226 
1227 
1228 /*
1229 ** Convert a DbPage obtained from the pager into a MemPage used by
1230 ** the btree layer.
1231 */
1232 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1233   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1234   pPage->aData = sqlite3PagerGetData(pDbPage);
1235   pPage->pDbPage = pDbPage;
1236   pPage->pBt = pBt;
1237   pPage->pgno = pgno;
1238   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1239   return pPage;
1240 }
1241 
1242 /*
1243 ** Get a page from the pager.  Initialize the MemPage.pBt and
1244 ** MemPage.aData elements if needed.
1245 **
1246 ** If the noContent flag is set, it means that we do not care about
1247 ** the content of the page at this time.  So do not go to the disk
1248 ** to fetch the content.  Just fill in the content with zeros for now.
1249 ** If in the future we call sqlite3PagerWrite() on this page, that
1250 ** means we have started to be concerned about content and the disk
1251 ** read should occur at that point.
1252 */
1253 int sqlite3BtreeGetPage(
1254   BtShared *pBt,       /* The btree */
1255   Pgno pgno,           /* Number of the page to fetch */
1256   MemPage **ppPage,    /* Return the page in this parameter */
1257   int noContent        /* Do not load page content if true */
1258 ){
1259   int rc;
1260   DbPage *pDbPage;
1261 
1262   assert( sqlite3_mutex_held(pBt->mutex) );
1263   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1264   if( rc ) return rc;
1265   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1266   return SQLITE_OK;
1267 }
1268 
1269 /*
1270 ** Retrieve a page from the pager cache. If the requested page is not
1271 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1272 ** MemPage.aData elements if needed.
1273 */
1274 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1275   DbPage *pDbPage;
1276   assert( sqlite3_mutex_held(pBt->mutex) );
1277   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1278   if( pDbPage ){
1279     return btreePageFromDbPage(pDbPage, pgno, pBt);
1280   }
1281   return 0;
1282 }
1283 
1284 /*
1285 ** Return the size of the database file in pages. If there is any kind of
1286 ** error, return ((unsigned int)-1).
1287 */
1288 static Pgno pagerPagecount(BtShared *pBt){
1289   int nPage = -1;
1290   int rc;
1291   assert( pBt->pPage1 );
1292   rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1293   assert( rc==SQLITE_OK || nPage==-1 );
1294   return (Pgno)nPage;
1295 }
1296 
1297 /*
1298 ** Get a page from the pager and initialize it.  This routine
1299 ** is just a convenience wrapper around separate calls to
1300 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
1301 */
1302 static int getAndInitPage(
1303   BtShared *pBt,          /* The database file */
1304   Pgno pgno,           /* Number of the page to get */
1305   MemPage **ppPage     /* Write the page pointer here */
1306 ){
1307   int rc;
1308   MemPage *pPage;
1309 
1310   assert( sqlite3_mutex_held(pBt->mutex) );
1311   if( pgno==0 ){
1312     return SQLITE_CORRUPT_BKPT;
1313   }
1314 
1315   /* It is often the case that the page we want is already in cache.
1316   ** If so, get it directly.  This saves us from having to call
1317   ** pagerPagecount() to make sure pgno is within limits, which results
1318   ** in a measureable performance improvements.
1319   */
1320   *ppPage = pPage = btreePageLookup(pBt, pgno);
1321   if( pPage ){
1322     /* Page is already in cache */
1323     rc = SQLITE_OK;
1324   }else{
1325     /* Page not in cache.  Acquire it. */
1326     if( pgno>pagerPagecount(pBt) ){
1327       return SQLITE_CORRUPT_BKPT;
1328     }
1329     rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1330     if( rc ) return rc;
1331     pPage = *ppPage;
1332   }
1333   if( !pPage->isInit ){
1334     rc = sqlite3BtreeInitPage(pPage);
1335   }
1336   if( rc!=SQLITE_OK ){
1337     releasePage(pPage);
1338     *ppPage = 0;
1339   }
1340   return rc;
1341 }
1342 
1343 /*
1344 ** Release a MemPage.  This should be called once for each prior
1345 ** call to sqlite3BtreeGetPage.
1346 */
1347 static void releasePage(MemPage *pPage){
1348   if( pPage ){
1349     assert( pPage->nOverflow==0 || sqlite3PagerPageRefcount(pPage->pDbPage)>1 );
1350     assert( pPage->aData );
1351     assert( pPage->pBt );
1352     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1353     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1354     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1355     sqlite3PagerUnref(pPage->pDbPage);
1356   }
1357 }
1358 
1359 /*
1360 ** During a rollback, when the pager reloads information into the cache
1361 ** so that the cache is restored to its original state at the start of
1362 ** the transaction, for each page restored this routine is called.
1363 **
1364 ** This routine needs to reset the extra data section at the end of the
1365 ** page to agree with the restored data.
1366 */
1367 static void pageReinit(DbPage *pData){
1368   MemPage *pPage;
1369   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1370   assert( sqlite3PagerPageRefcount(pData)>0 );
1371   if( pPage->isInit ){
1372     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1373     pPage->isInit = 0;
1374     if( sqlite3PagerPageRefcount(pData)>1 ){
1375       /* pPage might not be a btree page;  it might be an overflow page
1376       ** or ptrmap page or a free page.  In those cases, the following
1377       ** call to sqlite3BtreeInitPage() will likely return SQLITE_CORRUPT.
1378       ** But no harm is done by this.  And it is very important that
1379       ** sqlite3BtreeInitPage() be called on every btree page so we make
1380       ** the call for every page that comes in for re-initing. */
1381       sqlite3BtreeInitPage(pPage);
1382     }
1383   }
1384 }
1385 
1386 /*
1387 ** Invoke the busy handler for a btree.
1388 */
1389 static int btreeInvokeBusyHandler(void *pArg){
1390   BtShared *pBt = (BtShared*)pArg;
1391   assert( pBt->db );
1392   assert( sqlite3_mutex_held(pBt->db->mutex) );
1393   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1394 }
1395 
1396 /*
1397 ** Open a database file.
1398 **
1399 ** zFilename is the name of the database file.  If zFilename is NULL
1400 ** a new database with a random name is created.  This randomly named
1401 ** database file will be deleted when sqlite3BtreeClose() is called.
1402 ** If zFilename is ":memory:" then an in-memory database is created
1403 ** that is automatically destroyed when it is closed.
1404 **
1405 ** If the database is already opened in the same database connection
1406 ** and we are in shared cache mode, then the open will fail with an
1407 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
1408 ** objects in the same database connection since doing so will lead
1409 ** to problems with locking.
1410 */
1411 int sqlite3BtreeOpen(
1412   const char *zFilename,  /* Name of the file containing the BTree database */
1413   sqlite3 *db,            /* Associated database handle */
1414   Btree **ppBtree,        /* Pointer to new Btree object written here */
1415   int flags,              /* Options */
1416   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
1417 ){
1418   sqlite3_vfs *pVfs;             /* The VFS to use for this btree */
1419   BtShared *pBt = 0;             /* Shared part of btree structure */
1420   Btree *p;                      /* Handle to return */
1421   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
1422   int rc = SQLITE_OK;            /* Result code from this function */
1423   u8 nReserve;                   /* Byte of unused space on each page */
1424   unsigned char zDbHeader[100];  /* Database header content */
1425 
1426   /* Set the variable isMemdb to true for an in-memory database, or
1427   ** false for a file-based database. This symbol is only required if
1428   ** either of the shared-data or autovacuum features are compiled
1429   ** into the library.
1430   */
1431 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1432   #ifdef SQLITE_OMIT_MEMORYDB
1433     const int isMemdb = 0;
1434   #else
1435     const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1436   #endif
1437 #endif
1438 
1439   assert( db!=0 );
1440   assert( sqlite3_mutex_held(db->mutex) );
1441 
1442   pVfs = db->pVfs;
1443   p = sqlite3MallocZero(sizeof(Btree));
1444   if( !p ){
1445     return SQLITE_NOMEM;
1446   }
1447   p->inTrans = TRANS_NONE;
1448   p->db = db;
1449 
1450 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1451   /*
1452   ** If this Btree is a candidate for shared cache, try to find an
1453   ** existing BtShared object that we can share with
1454   */
1455   if( isMemdb==0 && zFilename && zFilename[0] ){
1456     if( sqlite3GlobalConfig.sharedCacheEnabled ){
1457       int nFullPathname = pVfs->mxPathname+1;
1458       char *zFullPathname = sqlite3Malloc(nFullPathname);
1459       sqlite3_mutex *mutexShared;
1460       p->sharable = 1;
1461       db->flags |= SQLITE_SharedCache;
1462       if( !zFullPathname ){
1463         sqlite3_free(p);
1464         return SQLITE_NOMEM;
1465       }
1466       sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1467       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1468       sqlite3_mutex_enter(mutexOpen);
1469       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1470       sqlite3_mutex_enter(mutexShared);
1471       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1472         assert( pBt->nRef>0 );
1473         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1474                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1475           int iDb;
1476           for(iDb=db->nDb-1; iDb>=0; iDb--){
1477             Btree *pExisting = db->aDb[iDb].pBt;
1478             if( pExisting && pExisting->pBt==pBt ){
1479               sqlite3_mutex_leave(mutexShared);
1480               sqlite3_mutex_leave(mutexOpen);
1481               sqlite3_free(zFullPathname);
1482               sqlite3_free(p);
1483               return SQLITE_CONSTRAINT;
1484             }
1485           }
1486           p->pBt = pBt;
1487           pBt->nRef++;
1488           break;
1489         }
1490       }
1491       sqlite3_mutex_leave(mutexShared);
1492       sqlite3_free(zFullPathname);
1493     }
1494 #ifdef SQLITE_DEBUG
1495     else{
1496       /* In debug mode, we mark all persistent databases as sharable
1497       ** even when they are not.  This exercises the locking code and
1498       ** gives more opportunity for asserts(sqlite3_mutex_held())
1499       ** statements to find locking problems.
1500       */
1501       p->sharable = 1;
1502     }
1503 #endif
1504   }
1505 #endif
1506   if( pBt==0 ){
1507     /*
1508     ** The following asserts make sure that structures used by the btree are
1509     ** the right size.  This is to guard against size changes that result
1510     ** when compiling on a different architecture.
1511     */
1512     assert( sizeof(i64)==8 || sizeof(i64)==4 );
1513     assert( sizeof(u64)==8 || sizeof(u64)==4 );
1514     assert( sizeof(u32)==4 );
1515     assert( sizeof(u16)==2 );
1516     assert( sizeof(Pgno)==4 );
1517 
1518     pBt = sqlite3MallocZero( sizeof(*pBt) );
1519     if( pBt==0 ){
1520       rc = SQLITE_NOMEM;
1521       goto btree_open_out;
1522     }
1523     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1524                           EXTRA_SIZE, flags, vfsFlags);
1525     if( rc==SQLITE_OK ){
1526       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1527     }
1528     if( rc!=SQLITE_OK ){
1529       goto btree_open_out;
1530     }
1531     pBt->db = db;
1532     sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
1533     p->pBt = pBt;
1534 
1535     sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1536     pBt->pCursor = 0;
1537     pBt->pPage1 = 0;
1538     pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1539     pBt->pageSize = get2byte(&zDbHeader[16]);
1540     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1541          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1542       pBt->pageSize = 0;
1543 #ifndef SQLITE_OMIT_AUTOVACUUM
1544       /* If the magic name ":memory:" will create an in-memory database, then
1545       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1546       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1547       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1548       ** regular file-name. In this case the auto-vacuum applies as per normal.
1549       */
1550       if( zFilename && !isMemdb ){
1551         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1552         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1553       }
1554 #endif
1555       nReserve = 0;
1556     }else{
1557       nReserve = zDbHeader[20];
1558       pBt->pageSizeFixed = 1;
1559 #ifndef SQLITE_OMIT_AUTOVACUUM
1560       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1561       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1562 #endif
1563     }
1564     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1565     if( rc ) goto btree_open_out;
1566     pBt->usableSize = pBt->pageSize - nReserve;
1567     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
1568 
1569 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1570     /* Add the new BtShared object to the linked list sharable BtShareds.
1571     */
1572     if( p->sharable ){
1573       sqlite3_mutex *mutexShared;
1574       pBt->nRef = 1;
1575       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1576       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1577         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1578         if( pBt->mutex==0 ){
1579           rc = SQLITE_NOMEM;
1580           db->mallocFailed = 0;
1581           goto btree_open_out;
1582         }
1583       }
1584       sqlite3_mutex_enter(mutexShared);
1585       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1586       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1587       sqlite3_mutex_leave(mutexShared);
1588     }
1589 #endif
1590   }
1591 
1592 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1593   /* If the new Btree uses a sharable pBtShared, then link the new
1594   ** Btree into the list of all sharable Btrees for the same connection.
1595   ** The list is kept in ascending order by pBt address.
1596   */
1597   if( p->sharable ){
1598     int i;
1599     Btree *pSib;
1600     for(i=0; i<db->nDb; i++){
1601       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1602         while( pSib->pPrev ){ pSib = pSib->pPrev; }
1603         if( p->pBt<pSib->pBt ){
1604           p->pNext = pSib;
1605           p->pPrev = 0;
1606           pSib->pPrev = p;
1607         }else{
1608           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1609             pSib = pSib->pNext;
1610           }
1611           p->pNext = pSib->pNext;
1612           p->pPrev = pSib;
1613           if( p->pNext ){
1614             p->pNext->pPrev = p;
1615           }
1616           pSib->pNext = p;
1617         }
1618         break;
1619       }
1620     }
1621   }
1622 #endif
1623   *ppBtree = p;
1624 
1625 btree_open_out:
1626   if( rc!=SQLITE_OK ){
1627     if( pBt && pBt->pPager ){
1628       sqlite3PagerClose(pBt->pPager);
1629     }
1630     sqlite3_free(pBt);
1631     sqlite3_free(p);
1632     *ppBtree = 0;
1633   }
1634   if( mutexOpen ){
1635     assert( sqlite3_mutex_held(mutexOpen) );
1636     sqlite3_mutex_leave(mutexOpen);
1637   }
1638   return rc;
1639 }
1640 
1641 /*
1642 ** Decrement the BtShared.nRef counter.  When it reaches zero,
1643 ** remove the BtShared structure from the sharing list.  Return
1644 ** true if the BtShared.nRef counter reaches zero and return
1645 ** false if it is still positive.
1646 */
1647 static int removeFromSharingList(BtShared *pBt){
1648 #ifndef SQLITE_OMIT_SHARED_CACHE
1649   sqlite3_mutex *pMaster;
1650   BtShared *pList;
1651   int removed = 0;
1652 
1653   assert( sqlite3_mutex_notheld(pBt->mutex) );
1654   pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1655   sqlite3_mutex_enter(pMaster);
1656   pBt->nRef--;
1657   if( pBt->nRef<=0 ){
1658     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1659       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
1660     }else{
1661       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
1662       while( ALWAYS(pList) && pList->pNext!=pBt ){
1663         pList=pList->pNext;
1664       }
1665       if( ALWAYS(pList) ){
1666         pList->pNext = pBt->pNext;
1667       }
1668     }
1669     if( SQLITE_THREADSAFE ){
1670       sqlite3_mutex_free(pBt->mutex);
1671     }
1672     removed = 1;
1673   }
1674   sqlite3_mutex_leave(pMaster);
1675   return removed;
1676 #else
1677   return 1;
1678 #endif
1679 }
1680 
1681 /*
1682 ** Make sure pBt->pTmpSpace points to an allocation of
1683 ** MX_CELL_SIZE(pBt) bytes.
1684 */
1685 static void allocateTempSpace(BtShared *pBt){
1686   if( !pBt->pTmpSpace ){
1687     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
1688   }
1689 }
1690 
1691 /*
1692 ** Free the pBt->pTmpSpace allocation
1693 */
1694 static void freeTempSpace(BtShared *pBt){
1695   sqlite3PageFree( pBt->pTmpSpace);
1696   pBt->pTmpSpace = 0;
1697 }
1698 
1699 /*
1700 ** Close an open database and invalidate all cursors.
1701 */
1702 int sqlite3BtreeClose(Btree *p){
1703   BtShared *pBt = p->pBt;
1704   BtCursor *pCur;
1705 
1706   /* Close all cursors opened via this handle.  */
1707   assert( sqlite3_mutex_held(p->db->mutex) );
1708   sqlite3BtreeEnter(p);
1709   pCur = pBt->pCursor;
1710   while( pCur ){
1711     BtCursor *pTmp = pCur;
1712     pCur = pCur->pNext;
1713     if( pTmp->pBtree==p ){
1714       sqlite3BtreeCloseCursor(pTmp);
1715     }
1716   }
1717 
1718   /* Rollback any active transaction and free the handle structure.
1719   ** The call to sqlite3BtreeRollback() drops any table-locks held by
1720   ** this handle.
1721   */
1722   sqlite3BtreeRollback(p);
1723   sqlite3BtreeLeave(p);
1724 
1725   /* If there are still other outstanding references to the shared-btree
1726   ** structure, return now. The remainder of this procedure cleans
1727   ** up the shared-btree.
1728   */
1729   assert( p->wantToLock==0 && p->locked==0 );
1730   if( !p->sharable || removeFromSharingList(pBt) ){
1731     /* The pBt is no longer on the sharing list, so we can access
1732     ** it without having to hold the mutex.
1733     **
1734     ** Clean out and delete the BtShared object.
1735     */
1736     assert( !pBt->pCursor );
1737     sqlite3PagerClose(pBt->pPager);
1738     if( pBt->xFreeSchema && pBt->pSchema ){
1739       pBt->xFreeSchema(pBt->pSchema);
1740     }
1741     sqlite3_free(pBt->pSchema);
1742     freeTempSpace(pBt);
1743     sqlite3_free(pBt);
1744   }
1745 
1746 #ifndef SQLITE_OMIT_SHARED_CACHE
1747   assert( p->wantToLock==0 );
1748   assert( p->locked==0 );
1749   if( p->pPrev ) p->pPrev->pNext = p->pNext;
1750   if( p->pNext ) p->pNext->pPrev = p->pPrev;
1751 #endif
1752 
1753   sqlite3_free(p);
1754   return SQLITE_OK;
1755 }
1756 
1757 /*
1758 ** Change the limit on the number of pages allowed in the cache.
1759 **
1760 ** The maximum number of cache pages is set to the absolute
1761 ** value of mxPage.  If mxPage is negative, the pager will
1762 ** operate asynchronously - it will not stop to do fsync()s
1763 ** to insure data is written to the disk surface before
1764 ** continuing.  Transactions still work if synchronous is off,
1765 ** and the database cannot be corrupted if this program
1766 ** crashes.  But if the operating system crashes or there is
1767 ** an abrupt power failure when synchronous is off, the database
1768 ** could be left in an inconsistent and unrecoverable state.
1769 ** Synchronous is on by default so database corruption is not
1770 ** normally a worry.
1771 */
1772 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1773   BtShared *pBt = p->pBt;
1774   assert( sqlite3_mutex_held(p->db->mutex) );
1775   sqlite3BtreeEnter(p);
1776   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
1777   sqlite3BtreeLeave(p);
1778   return SQLITE_OK;
1779 }
1780 
1781 /*
1782 ** Change the way data is synced to disk in order to increase or decrease
1783 ** how well the database resists damage due to OS crashes and power
1784 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
1785 ** there is a high probability of damage)  Level 2 is the default.  There
1786 ** is a very low but non-zero probability of damage.  Level 3 reduces the
1787 ** probability of damage to near zero but with a write performance reduction.
1788 */
1789 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
1790 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
1791   BtShared *pBt = p->pBt;
1792   assert( sqlite3_mutex_held(p->db->mutex) );
1793   sqlite3BtreeEnter(p);
1794   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
1795   sqlite3BtreeLeave(p);
1796   return SQLITE_OK;
1797 }
1798 #endif
1799 
1800 /*
1801 ** Return TRUE if the given btree is set to safety level 1.  In other
1802 ** words, return TRUE if no sync() occurs on the disk files.
1803 */
1804 int sqlite3BtreeSyncDisabled(Btree *p){
1805   BtShared *pBt = p->pBt;
1806   int rc;
1807   assert( sqlite3_mutex_held(p->db->mutex) );
1808   sqlite3BtreeEnter(p);
1809   assert( pBt && pBt->pPager );
1810   rc = sqlite3PagerNosync(pBt->pPager);
1811   sqlite3BtreeLeave(p);
1812   return rc;
1813 }
1814 
1815 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1816 /*
1817 ** Change the default pages size and the number of reserved bytes per page.
1818 ** Or, if the page size has already been fixed, return SQLITE_READONLY
1819 ** without changing anything.
1820 **
1821 ** The page size must be a power of 2 between 512 and 65536.  If the page
1822 ** size supplied does not meet this constraint then the page size is not
1823 ** changed.
1824 **
1825 ** Page sizes are constrained to be a power of two so that the region
1826 ** of the database file used for locking (beginning at PENDING_BYTE,
1827 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
1828 ** at the beginning of a page.
1829 **
1830 ** If parameter nReserve is less than zero, then the number of reserved
1831 ** bytes per page is left unchanged.
1832 **
1833 ** If the iFix!=0 then the pageSizeFixed flag is set so that the page size
1834 ** and autovacuum mode can no longer be changed.
1835 */
1836 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
1837   int rc = SQLITE_OK;
1838   BtShared *pBt = p->pBt;
1839   assert( nReserve>=-1 && nReserve<=255 );
1840   sqlite3BtreeEnter(p);
1841   if( pBt->pageSizeFixed ){
1842     sqlite3BtreeLeave(p);
1843     return SQLITE_READONLY;
1844   }
1845   if( nReserve<0 ){
1846     nReserve = pBt->pageSize - pBt->usableSize;
1847   }
1848   assert( nReserve>=0 && nReserve<=255 );
1849   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1850         ((pageSize-1)&pageSize)==0 ){
1851     assert( (pageSize & 7)==0 );
1852     assert( !pBt->pPage1 && !pBt->pCursor );
1853     pBt->pageSize = (u16)pageSize;
1854     freeTempSpace(pBt);
1855     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1856   }
1857   pBt->usableSize = pBt->pageSize - (u16)nReserve;
1858   if( iFix ) pBt->pageSizeFixed = 1;
1859   sqlite3BtreeLeave(p);
1860   return rc;
1861 }
1862 
1863 /*
1864 ** Return the currently defined page size
1865 */
1866 int sqlite3BtreeGetPageSize(Btree *p){
1867   return p->pBt->pageSize;
1868 }
1869 
1870 /*
1871 ** Return the number of bytes of space at the end of every page that
1872 ** are intentually left unused.  This is the "reserved" space that is
1873 ** sometimes used by extensions.
1874 */
1875 int sqlite3BtreeGetReserve(Btree *p){
1876   int n;
1877   sqlite3BtreeEnter(p);
1878   n = p->pBt->pageSize - p->pBt->usableSize;
1879   sqlite3BtreeLeave(p);
1880   return n;
1881 }
1882 
1883 /*
1884 ** Set the maximum page count for a database if mxPage is positive.
1885 ** No changes are made if mxPage is 0 or negative.
1886 ** Regardless of the value of mxPage, return the maximum page count.
1887 */
1888 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
1889   int n;
1890   sqlite3BtreeEnter(p);
1891   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
1892   sqlite3BtreeLeave(p);
1893   return n;
1894 }
1895 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1896 
1897 /*
1898 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1899 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1900 ** is disabled. The default value for the auto-vacuum property is
1901 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1902 */
1903 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1904 #ifdef SQLITE_OMIT_AUTOVACUUM
1905   return SQLITE_READONLY;
1906 #else
1907   BtShared *pBt = p->pBt;
1908   int rc = SQLITE_OK;
1909   u8 av = (u8)autoVacuum;
1910 
1911   sqlite3BtreeEnter(p);
1912   if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){
1913     rc = SQLITE_READONLY;
1914   }else{
1915     pBt->autoVacuum = av ?1:0;
1916     pBt->incrVacuum = av==2 ?1:0;
1917   }
1918   sqlite3BtreeLeave(p);
1919   return rc;
1920 #endif
1921 }
1922 
1923 /*
1924 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1925 ** enabled 1 is returned. Otherwise 0.
1926 */
1927 int sqlite3BtreeGetAutoVacuum(Btree *p){
1928 #ifdef SQLITE_OMIT_AUTOVACUUM
1929   return BTREE_AUTOVACUUM_NONE;
1930 #else
1931   int rc;
1932   sqlite3BtreeEnter(p);
1933   rc = (
1934     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1935     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1936     BTREE_AUTOVACUUM_INCR
1937   );
1938   sqlite3BtreeLeave(p);
1939   return rc;
1940 #endif
1941 }
1942 
1943 
1944 /*
1945 ** Get a reference to pPage1 of the database file.  This will
1946 ** also acquire a readlock on that file.
1947 **
1948 ** SQLITE_OK is returned on success.  If the file is not a
1949 ** well-formed database file, then SQLITE_CORRUPT is returned.
1950 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
1951 ** is returned if we run out of memory.
1952 */
1953 static int lockBtree(BtShared *pBt){
1954   int rc;
1955   MemPage *pPage1;
1956   int nPage;
1957 
1958   assert( sqlite3_mutex_held(pBt->mutex) );
1959   assert( pBt->pPage1==0 );
1960   rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
1961   if( rc!=SQLITE_OK ) return rc;
1962 
1963   /* Do some checking to help insure the file we opened really is
1964   ** a valid database file.
1965   */
1966   rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1967   if( rc!=SQLITE_OK ){
1968     goto page1_init_failed;
1969   }else if( nPage>0 ){
1970     int pageSize;
1971     int usableSize;
1972     u8 *page1 = pPage1->aData;
1973     rc = SQLITE_NOTADB;
1974     if( memcmp(page1, zMagicHeader, 16)!=0 ){
1975       goto page1_init_failed;
1976     }
1977     if( page1[18]>1 ){
1978       pBt->readOnly = 1;
1979     }
1980     if( page1[19]>1 ){
1981       goto page1_init_failed;
1982     }
1983 
1984     /* The maximum embedded fraction must be exactly 25%.  And the minimum
1985     ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
1986     ** The original design allowed these amounts to vary, but as of
1987     ** version 3.6.0, we require them to be fixed.
1988     */
1989     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
1990       goto page1_init_failed;
1991     }
1992     pageSize = get2byte(&page1[16]);
1993     if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1994         (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1995     ){
1996       goto page1_init_failed;
1997     }
1998     assert( (pageSize & 7)==0 );
1999     usableSize = pageSize - page1[20];
2000     if( pageSize!=pBt->pageSize ){
2001       /* After reading the first page of the database assuming a page size
2002       ** of BtShared.pageSize, we have discovered that the page-size is
2003       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2004       ** zero and return SQLITE_OK. The caller will call this function
2005       ** again with the correct page-size.
2006       */
2007       releasePage(pPage1);
2008       pBt->usableSize = (u16)usableSize;
2009       pBt->pageSize = (u16)pageSize;
2010       freeTempSpace(pBt);
2011       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
2012       if( rc ) goto page1_init_failed;
2013       return SQLITE_OK;
2014     }
2015     if( usableSize<500 ){
2016       goto page1_init_failed;
2017     }
2018     pBt->pageSize = (u16)pageSize;
2019     pBt->usableSize = (u16)usableSize;
2020 #ifndef SQLITE_OMIT_AUTOVACUUM
2021     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2022     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2023 #endif
2024   }
2025 
2026   /* maxLocal is the maximum amount of payload to store locally for
2027   ** a cell.  Make sure it is small enough so that at least minFanout
2028   ** cells can will fit on one page.  We assume a 10-byte page header.
2029   ** Besides the payload, the cell must store:
2030   **     2-byte pointer to the cell
2031   **     4-byte child pointer
2032   **     9-byte nKey value
2033   **     4-byte nData value
2034   **     4-byte overflow page pointer
2035   ** So a cell consists of a 2-byte poiner, a header which is as much as
2036   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2037   ** page pointer.
2038   */
2039   pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
2040   pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
2041   pBt->maxLeaf = pBt->usableSize - 35;
2042   pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
2043   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2044   pBt->pPage1 = pPage1;
2045   return SQLITE_OK;
2046 
2047 page1_init_failed:
2048   releasePage(pPage1);
2049   pBt->pPage1 = 0;
2050   return rc;
2051 }
2052 
2053 /*
2054 ** This routine works like lockBtree() except that it also invokes the
2055 ** busy callback if there is lock contention.
2056 */
2057 static int lockBtreeWithRetry(Btree *pRef){
2058   int rc = SQLITE_OK;
2059 
2060   assert( sqlite3BtreeHoldsMutex(pRef) );
2061   if( pRef->inTrans==TRANS_NONE ){
2062     u8 inTransaction = pRef->pBt->inTransaction;
2063     btreeIntegrity(pRef);
2064     rc = sqlite3BtreeBeginTrans(pRef, 0);
2065     pRef->pBt->inTransaction = inTransaction;
2066     pRef->inTrans = TRANS_NONE;
2067     if( rc==SQLITE_OK ){
2068       pRef->pBt->nTransaction--;
2069     }
2070     btreeIntegrity(pRef);
2071   }
2072   return rc;
2073 }
2074 
2075 
2076 /*
2077 ** If there are no outstanding cursors and we are not in the middle
2078 ** of a transaction but there is a read lock on the database, then
2079 ** this routine unrefs the first page of the database file which
2080 ** has the effect of releasing the read lock.
2081 **
2082 ** If there are any outstanding cursors, this routine is a no-op.
2083 **
2084 ** If there is a transaction in progress, this routine is a no-op.
2085 */
2086 static void unlockBtreeIfUnused(BtShared *pBt){
2087   assert( sqlite3_mutex_held(pBt->mutex) );
2088   if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
2089     if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
2090       assert( pBt->pPage1->aData );
2091       releasePage(pBt->pPage1);
2092     }
2093     pBt->pPage1 = 0;
2094   }
2095 }
2096 
2097 /*
2098 ** Create a new database by initializing the first page of the
2099 ** file.
2100 */
2101 static int newDatabase(BtShared *pBt){
2102   MemPage *pP1;
2103   unsigned char *data;
2104   int rc;
2105   int nPage;
2106 
2107   assert( sqlite3_mutex_held(pBt->mutex) );
2108   rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
2109   if( rc!=SQLITE_OK || nPage>0 ){
2110     return rc;
2111   }
2112   pP1 = pBt->pPage1;
2113   assert( pP1!=0 );
2114   data = pP1->aData;
2115   rc = sqlite3PagerWrite(pP1->pDbPage);
2116   if( rc ) return rc;
2117   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
2118   assert( sizeof(zMagicHeader)==16 );
2119   put2byte(&data[16], pBt->pageSize);
2120   data[18] = 1;
2121   data[19] = 1;
2122   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
2123   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
2124   data[21] = 64;
2125   data[22] = 32;
2126   data[23] = 32;
2127   memset(&data[24], 0, 100-24);
2128   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
2129   pBt->pageSizeFixed = 1;
2130 #ifndef SQLITE_OMIT_AUTOVACUUM
2131   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
2132   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
2133   put4byte(&data[36 + 4*4], pBt->autoVacuum);
2134   put4byte(&data[36 + 7*4], pBt->incrVacuum);
2135 #endif
2136   return SQLITE_OK;
2137 }
2138 
2139 /*
2140 ** Attempt to start a new transaction. A write-transaction
2141 ** is started if the second argument is nonzero, otherwise a read-
2142 ** transaction.  If the second argument is 2 or more and exclusive
2143 ** transaction is started, meaning that no other process is allowed
2144 ** to access the database.  A preexisting transaction may not be
2145 ** upgraded to exclusive by calling this routine a second time - the
2146 ** exclusivity flag only works for a new transaction.
2147 **
2148 ** A write-transaction must be started before attempting any
2149 ** changes to the database.  None of the following routines
2150 ** will work unless a transaction is started first:
2151 **
2152 **      sqlite3BtreeCreateTable()
2153 **      sqlite3BtreeCreateIndex()
2154 **      sqlite3BtreeClearTable()
2155 **      sqlite3BtreeDropTable()
2156 **      sqlite3BtreeInsert()
2157 **      sqlite3BtreeDelete()
2158 **      sqlite3BtreeUpdateMeta()
2159 **
2160 ** If an initial attempt to acquire the lock fails because of lock contention
2161 ** and the database was previously unlocked, then invoke the busy handler
2162 ** if there is one.  But if there was previously a read-lock, do not
2163 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
2164 ** returned when there is already a read-lock in order to avoid a deadlock.
2165 **
2166 ** Suppose there are two processes A and B.  A has a read lock and B has
2167 ** a reserved lock.  B tries to promote to exclusive but is blocked because
2168 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
2169 ** One or the other of the two processes must give way or there can be
2170 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
2171 ** when A already has a read lock, we encourage A to give up and let B
2172 ** proceed.
2173 */
2174 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
2175   sqlite3 *pBlock = 0;
2176   BtShared *pBt = p->pBt;
2177   int rc = SQLITE_OK;
2178 
2179   sqlite3BtreeEnter(p);
2180   btreeIntegrity(p);
2181 
2182   /* If the btree is already in a write-transaction, or it
2183   ** is already in a read-transaction and a read-transaction
2184   ** is requested, this is a no-op.
2185   */
2186   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
2187     goto trans_begun;
2188   }
2189 
2190   /* Write transactions are not possible on a read-only database */
2191   if( pBt->readOnly && wrflag ){
2192     rc = SQLITE_READONLY;
2193     goto trans_begun;
2194   }
2195 
2196 #ifndef SQLITE_OMIT_SHARED_CACHE
2197   /* If another database handle has already opened a write transaction
2198   ** on this shared-btree structure and a second write transaction is
2199   ** requested, return SQLITE_LOCKED.
2200   */
2201   if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){
2202     pBlock = pBt->pWriter->db;
2203   }else if( wrflag>1 ){
2204     BtLock *pIter;
2205     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
2206       if( pIter->pBtree!=p ){
2207         pBlock = pIter->pBtree->db;
2208         break;
2209       }
2210     }
2211   }
2212   if( pBlock ){
2213     sqlite3ConnectionBlocked(p->db, pBlock);
2214     rc = SQLITE_LOCKED_SHAREDCACHE;
2215     goto trans_begun;
2216   }
2217 #endif
2218 
2219   do {
2220     /* Call lockBtree() until either pBt->pPage1 is populated or
2221     ** lockBtree() returns something other than SQLITE_OK. lockBtree()
2222     ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
2223     ** reading page 1 it discovers that the page-size of the database
2224     ** file is not pBt->pageSize. In this case lockBtree() will update
2225     ** pBt->pageSize to the page-size of the file on disk.
2226     */
2227     while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
2228 
2229     if( rc==SQLITE_OK && wrflag ){
2230       if( pBt->readOnly ){
2231         rc = SQLITE_READONLY;
2232       }else{
2233         rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
2234         if( rc==SQLITE_OK ){
2235           rc = newDatabase(pBt);
2236         }
2237       }
2238     }
2239 
2240     if( rc!=SQLITE_OK ){
2241       unlockBtreeIfUnused(pBt);
2242     }
2243   }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
2244           btreeInvokeBusyHandler(pBt) );
2245 
2246   if( rc==SQLITE_OK ){
2247     if( p->inTrans==TRANS_NONE ){
2248       pBt->nTransaction++;
2249     }
2250     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2251     if( p->inTrans>pBt->inTransaction ){
2252       pBt->inTransaction = p->inTrans;
2253     }
2254 #ifndef SQLITE_OMIT_SHARED_CACHE
2255     if( wrflag ){
2256       assert( !pBt->pWriter );
2257       pBt->pWriter = p;
2258       pBt->isExclusive = (u8)(wrflag>1);
2259     }
2260 #endif
2261   }
2262 
2263 
2264 trans_begun:
2265   if( rc==SQLITE_OK && wrflag ){
2266     /* This call makes sure that the pager has the correct number of
2267     ** open savepoints. If the second parameter is greater than 0 and
2268     ** the sub-journal is not already open, then it will be opened here.
2269     */
2270     rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
2271   }
2272 
2273   btreeIntegrity(p);
2274   sqlite3BtreeLeave(p);
2275   return rc;
2276 }
2277 
2278 #ifndef SQLITE_OMIT_AUTOVACUUM
2279 
2280 /*
2281 ** Set the pointer-map entries for all children of page pPage. Also, if
2282 ** pPage contains cells that point to overflow pages, set the pointer
2283 ** map entries for the overflow pages as well.
2284 */
2285 static int setChildPtrmaps(MemPage *pPage){
2286   int i;                             /* Counter variable */
2287   int nCell;                         /* Number of cells in page pPage */
2288   int rc;                            /* Return code */
2289   BtShared *pBt = pPage->pBt;
2290   u8 isInitOrig = pPage->isInit;
2291   Pgno pgno = pPage->pgno;
2292 
2293   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2294   rc = sqlite3BtreeInitPage(pPage);
2295   if( rc!=SQLITE_OK ){
2296     goto set_child_ptrmaps_out;
2297   }
2298   nCell = pPage->nCell;
2299 
2300   for(i=0; i<nCell; i++){
2301     u8 *pCell = findCell(pPage, i);
2302 
2303     rc = ptrmapPutOvflPtr(pPage, pCell);
2304     if( rc!=SQLITE_OK ){
2305       goto set_child_ptrmaps_out;
2306     }
2307 
2308     if( !pPage->leaf ){
2309       Pgno childPgno = get4byte(pCell);
2310       rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2311       if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
2312     }
2313   }
2314 
2315   if( !pPage->leaf ){
2316     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2317     rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2318   }
2319 
2320 set_child_ptrmaps_out:
2321   pPage->isInit = isInitOrig;
2322   return rc;
2323 }
2324 
2325 /*
2326 ** Somewhere on pPage, which is guaranteed to be a btree page, not an overflow
2327 ** page, is a pointer to page iFrom. Modify this pointer so that it points to
2328 ** iTo. Parameter eType describes the type of pointer to be modified, as
2329 ** follows:
2330 **
2331 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
2332 **                   page of pPage.
2333 **
2334 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2335 **                   page pointed to by one of the cells on pPage.
2336 **
2337 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2338 **                   overflow page in the list.
2339 */
2340 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2341   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2342   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2343   if( eType==PTRMAP_OVERFLOW2 ){
2344     /* The pointer is always the first 4 bytes of the page in this case.  */
2345     if( get4byte(pPage->aData)!=iFrom ){
2346       return SQLITE_CORRUPT_BKPT;
2347     }
2348     put4byte(pPage->aData, iTo);
2349   }else{
2350     u8 isInitOrig = pPage->isInit;
2351     int i;
2352     int nCell;
2353 
2354     sqlite3BtreeInitPage(pPage);
2355     nCell = pPage->nCell;
2356 
2357     for(i=0; i<nCell; i++){
2358       u8 *pCell = findCell(pPage, i);
2359       if( eType==PTRMAP_OVERFLOW1 ){
2360         CellInfo info;
2361         sqlite3BtreeParseCellPtr(pPage, pCell, &info);
2362         if( info.iOverflow ){
2363           if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2364             put4byte(&pCell[info.iOverflow], iTo);
2365             break;
2366           }
2367         }
2368       }else{
2369         if( get4byte(pCell)==iFrom ){
2370           put4byte(pCell, iTo);
2371           break;
2372         }
2373       }
2374     }
2375 
2376     if( i==nCell ){
2377       if( eType!=PTRMAP_BTREE ||
2378           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2379         return SQLITE_CORRUPT_BKPT;
2380       }
2381       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2382     }
2383 
2384     pPage->isInit = isInitOrig;
2385   }
2386   return SQLITE_OK;
2387 }
2388 
2389 
2390 /*
2391 ** Move the open database page pDbPage to location iFreePage in the
2392 ** database. The pDbPage reference remains valid.
2393 */
2394 static int relocatePage(
2395   BtShared *pBt,           /* Btree */
2396   MemPage *pDbPage,        /* Open page to move */
2397   u8 eType,                /* Pointer map 'type' entry for pDbPage */
2398   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
2399   Pgno iFreePage,          /* The location to move pDbPage to */
2400   int isCommit
2401 ){
2402   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
2403   Pgno iDbPage = pDbPage->pgno;
2404   Pager *pPager = pBt->pPager;
2405   int rc;
2406 
2407   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2408       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2409   assert( sqlite3_mutex_held(pBt->mutex) );
2410   assert( pDbPage->pBt==pBt );
2411 
2412   /* Move page iDbPage from its current location to page number iFreePage */
2413   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2414       iDbPage, iFreePage, iPtrPage, eType));
2415   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
2416   if( rc!=SQLITE_OK ){
2417     return rc;
2418   }
2419   pDbPage->pgno = iFreePage;
2420 
2421   /* If pDbPage was a btree-page, then it may have child pages and/or cells
2422   ** that point to overflow pages. The pointer map entries for all these
2423   ** pages need to be changed.
2424   **
2425   ** If pDbPage is an overflow page, then the first 4 bytes may store a
2426   ** pointer to a subsequent overflow page. If this is the case, then
2427   ** the pointer map needs to be updated for the subsequent overflow page.
2428   */
2429   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2430     rc = setChildPtrmaps(pDbPage);
2431     if( rc!=SQLITE_OK ){
2432       return rc;
2433     }
2434   }else{
2435     Pgno nextOvfl = get4byte(pDbPage->aData);
2436     if( nextOvfl!=0 ){
2437       rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2438       if( rc!=SQLITE_OK ){
2439         return rc;
2440       }
2441     }
2442   }
2443 
2444   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2445   ** that it points at iFreePage. Also fix the pointer map entry for
2446   ** iPtrPage.
2447   */
2448   if( eType!=PTRMAP_ROOTPAGE ){
2449     rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2450     if( rc!=SQLITE_OK ){
2451       return rc;
2452     }
2453     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2454     if( rc!=SQLITE_OK ){
2455       releasePage(pPtrPage);
2456       return rc;
2457     }
2458     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2459     releasePage(pPtrPage);
2460     if( rc==SQLITE_OK ){
2461       rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2462     }
2463   }
2464   return rc;
2465 }
2466 
2467 /* Forward declaration required by incrVacuumStep(). */
2468 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2469 
2470 /*
2471 ** Perform a single step of an incremental-vacuum. If successful,
2472 ** return SQLITE_OK. If there is no work to do (and therefore no
2473 ** point in calling this function again), return SQLITE_DONE.
2474 **
2475 ** More specificly, this function attempts to re-organize the
2476 ** database so that the last page of the file currently in use
2477 ** is no longer in use.
2478 **
2479 ** If the nFin parameter is non-zero, the implementation assumes
2480 ** that the caller will keep calling incrVacuumStep() until
2481 ** it returns SQLITE_DONE or an error, and that nFin is the
2482 ** number of pages the database file will contain after this
2483 ** process is complete.
2484 */
2485 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){
2486   Pgno nFreeList;           /* Number of pages still on the free-list */
2487 
2488   assert( sqlite3_mutex_held(pBt->mutex) );
2489   assert( iLastPg>nFin );
2490 
2491   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2492     int rc;
2493     u8 eType;
2494     Pgno iPtrPage;
2495 
2496     nFreeList = get4byte(&pBt->pPage1->aData[36]);
2497     if( nFreeList==0 ){
2498       return SQLITE_DONE;
2499     }
2500 
2501     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2502     if( rc!=SQLITE_OK ){
2503       return rc;
2504     }
2505     if( eType==PTRMAP_ROOTPAGE ){
2506       return SQLITE_CORRUPT_BKPT;
2507     }
2508 
2509     if( eType==PTRMAP_FREEPAGE ){
2510       if( nFin==0 ){
2511         /* Remove the page from the files free-list. This is not required
2512         ** if nFin is non-zero. In that case, the free-list will be
2513         ** truncated to zero after this function returns, so it doesn't
2514         ** matter if it still contains some garbage entries.
2515         */
2516         Pgno iFreePg;
2517         MemPage *pFreePg;
2518         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2519         if( rc!=SQLITE_OK ){
2520           return rc;
2521         }
2522         assert( iFreePg==iLastPg );
2523         releasePage(pFreePg);
2524       }
2525     } else {
2526       Pgno iFreePg;             /* Index of free page to move pLastPg to */
2527       MemPage *pLastPg;
2528 
2529       rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
2530       if( rc!=SQLITE_OK ){
2531         return rc;
2532       }
2533 
2534       /* If nFin is zero, this loop runs exactly once and page pLastPg
2535       ** is swapped with the first free page pulled off the free list.
2536       **
2537       ** On the other hand, if nFin is greater than zero, then keep
2538       ** looping until a free-page located within the first nFin pages
2539       ** of the file is found.
2540       */
2541       do {
2542         MemPage *pFreePg;
2543         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2544         if( rc!=SQLITE_OK ){
2545           releasePage(pLastPg);
2546           return rc;
2547         }
2548         releasePage(pFreePg);
2549       }while( nFin!=0 && iFreePg>nFin );
2550       assert( iFreePg<iLastPg );
2551 
2552       rc = sqlite3PagerWrite(pLastPg->pDbPage);
2553       if( rc==SQLITE_OK ){
2554         rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
2555       }
2556       releasePage(pLastPg);
2557       if( rc!=SQLITE_OK ){
2558         return rc;
2559       }
2560     }
2561   }
2562 
2563   if( nFin==0 ){
2564     iLastPg--;
2565     while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){
2566       if( PTRMAP_ISPAGE(pBt, iLastPg) ){
2567         MemPage *pPg;
2568         int rc = sqlite3BtreeGetPage(pBt, iLastPg, &pPg, 0);
2569         if( rc!=SQLITE_OK ){
2570           return rc;
2571         }
2572         rc = sqlite3PagerWrite(pPg->pDbPage);
2573         releasePage(pPg);
2574         if( rc!=SQLITE_OK ){
2575           return rc;
2576         }
2577       }
2578       iLastPg--;
2579     }
2580     sqlite3PagerTruncateImage(pBt->pPager, iLastPg);
2581   }
2582   return SQLITE_OK;
2583 }
2584 
2585 /*
2586 ** A write-transaction must be opened before calling this function.
2587 ** It performs a single unit of work towards an incremental vacuum.
2588 **
2589 ** If the incremental vacuum is finished after this function has run,
2590 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
2591 ** SQLITE_OK is returned. Otherwise an SQLite error code.
2592 */
2593 int sqlite3BtreeIncrVacuum(Btree *p){
2594   int rc;
2595   BtShared *pBt = p->pBt;
2596 
2597   sqlite3BtreeEnter(p);
2598   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2599   if( !pBt->autoVacuum ){
2600     rc = SQLITE_DONE;
2601   }else{
2602     invalidateAllOverflowCache(pBt);
2603     rc = incrVacuumStep(pBt, 0, pagerPagecount(pBt));
2604   }
2605   sqlite3BtreeLeave(p);
2606   return rc;
2607 }
2608 
2609 /*
2610 ** This routine is called prior to sqlite3PagerCommit when a transaction
2611 ** is commited for an auto-vacuum database.
2612 **
2613 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2614 ** the database file should be truncated to during the commit process.
2615 ** i.e. the database has been reorganized so that only the first *pnTrunc
2616 ** pages are in use.
2617 */
2618 static int autoVacuumCommit(BtShared *pBt){
2619   int rc = SQLITE_OK;
2620   Pager *pPager = pBt->pPager;
2621   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
2622 
2623   assert( sqlite3_mutex_held(pBt->mutex) );
2624   invalidateAllOverflowCache(pBt);
2625   assert(pBt->autoVacuum);
2626   if( !pBt->incrVacuum ){
2627     Pgno nFin;
2628     Pgno nFree;
2629     Pgno nPtrmap;
2630     Pgno iFree;
2631     const int pgsz = pBt->pageSize;
2632     Pgno nOrig = pagerPagecount(pBt);
2633 
2634     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
2635       /* It is not possible to create a database for which the final page
2636       ** is either a pointer-map page or the pending-byte page. If one
2637       ** is encountered, this indicates corruption.
2638       */
2639       return SQLITE_CORRUPT_BKPT;
2640     }
2641 
2642     nFree = get4byte(&pBt->pPage1->aData[36]);
2643     nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
2644     nFin = nOrig - nFree - nPtrmap;
2645     if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
2646       nFin--;
2647     }
2648     while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2649       nFin--;
2650     }
2651 
2652     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
2653       rc = incrVacuumStep(pBt, nFin, iFree);
2654     }
2655     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
2656       rc = SQLITE_OK;
2657       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2658       put4byte(&pBt->pPage1->aData[32], 0);
2659       put4byte(&pBt->pPage1->aData[36], 0);
2660       sqlite3PagerTruncateImage(pBt->pPager, nFin);
2661     }
2662     if( rc!=SQLITE_OK ){
2663       sqlite3PagerRollback(pPager);
2664     }
2665   }
2666 
2667   assert( nRef==sqlite3PagerRefcount(pPager) );
2668   return rc;
2669 }
2670 
2671 #endif /* ifndef SQLITE_OMIT_AUTOVACUUM */
2672 
2673 /*
2674 ** This routine does the first phase of a two-phase commit.  This routine
2675 ** causes a rollback journal to be created (if it does not already exist)
2676 ** and populated with enough information so that if a power loss occurs
2677 ** the database can be restored to its original state by playing back
2678 ** the journal.  Then the contents of the journal are flushed out to
2679 ** the disk.  After the journal is safely on oxide, the changes to the
2680 ** database are written into the database file and flushed to oxide.
2681 ** At the end of this call, the rollback journal still exists on the
2682 ** disk and we are still holding all locks, so the transaction has not
2683 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
2684 ** commit process.
2685 **
2686 ** This call is a no-op if no write-transaction is currently active on pBt.
2687 **
2688 ** Otherwise, sync the database file for the btree pBt. zMaster points to
2689 ** the name of a master journal file that should be written into the
2690 ** individual journal file, or is NULL, indicating no master journal file
2691 ** (single database transaction).
2692 **
2693 ** When this is called, the master journal should already have been
2694 ** created, populated with this journal pointer and synced to disk.
2695 **
2696 ** Once this is routine has returned, the only thing required to commit
2697 ** the write-transaction for this database file is to delete the journal.
2698 */
2699 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2700   int rc = SQLITE_OK;
2701   if( p->inTrans==TRANS_WRITE ){
2702     BtShared *pBt = p->pBt;
2703     sqlite3BtreeEnter(p);
2704 #ifndef SQLITE_OMIT_AUTOVACUUM
2705     if( pBt->autoVacuum ){
2706       rc = autoVacuumCommit(pBt);
2707       if( rc!=SQLITE_OK ){
2708         sqlite3BtreeLeave(p);
2709         return rc;
2710       }
2711     }
2712 #endif
2713     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
2714     sqlite3BtreeLeave(p);
2715   }
2716   return rc;
2717 }
2718 
2719 /*
2720 ** Commit the transaction currently in progress.
2721 **
2722 ** This routine implements the second phase of a 2-phase commit.  The
2723 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
2724 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
2725 ** routine did all the work of writing information out to disk and flushing the
2726 ** contents so that they are written onto the disk platter.  All this
2727 ** routine has to do is delete or truncate or zero the header in the
2728 ** the rollback journal (which causes the transaction to commit) and
2729 ** drop locks.
2730 **
2731 ** This will release the write lock on the database file.  If there
2732 ** are no active cursors, it also releases the read lock.
2733 */
2734 int sqlite3BtreeCommitPhaseTwo(Btree *p){
2735   BtShared *pBt = p->pBt;
2736 
2737   sqlite3BtreeEnter(p);
2738   btreeIntegrity(p);
2739 
2740   /* If the handle has a write-transaction open, commit the shared-btrees
2741   ** transaction and set the shared state to TRANS_READ.
2742   */
2743   if( p->inTrans==TRANS_WRITE ){
2744     int rc;
2745     assert( pBt->inTransaction==TRANS_WRITE );
2746     assert( pBt->nTransaction>0 );
2747     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
2748     if( rc!=SQLITE_OK ){
2749       sqlite3BtreeLeave(p);
2750       return rc;
2751     }
2752     pBt->inTransaction = TRANS_READ;
2753   }
2754 
2755   /* If the handle has any kind of transaction open, decrement the transaction
2756   ** count of the shared btree. If the transaction count reaches 0, set
2757   ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2758   ** will unlock the pager.
2759   */
2760   if( p->inTrans!=TRANS_NONE ){
2761     clearAllSharedCacheTableLocks(p);
2762     pBt->nTransaction--;
2763     if( 0==pBt->nTransaction ){
2764       pBt->inTransaction = TRANS_NONE;
2765     }
2766   }
2767 
2768   /* Set the current transaction state to TRANS_NONE and unlock
2769   ** the pager if this call closed the only read or write transaction.
2770   */
2771   btreeClearHasContent(pBt);
2772   p->inTrans = TRANS_NONE;
2773   unlockBtreeIfUnused(pBt);
2774 
2775   btreeIntegrity(p);
2776   sqlite3BtreeLeave(p);
2777   return SQLITE_OK;
2778 }
2779 
2780 /*
2781 ** Do both phases of a commit.
2782 */
2783 int sqlite3BtreeCommit(Btree *p){
2784   int rc;
2785   sqlite3BtreeEnter(p);
2786   rc = sqlite3BtreeCommitPhaseOne(p, 0);
2787   if( rc==SQLITE_OK ){
2788     rc = sqlite3BtreeCommitPhaseTwo(p);
2789   }
2790   sqlite3BtreeLeave(p);
2791   return rc;
2792 }
2793 
2794 #ifndef NDEBUG
2795 /*
2796 ** Return the number of write-cursors open on this handle. This is for use
2797 ** in assert() expressions, so it is only compiled if NDEBUG is not
2798 ** defined.
2799 **
2800 ** For the purposes of this routine, a write-cursor is any cursor that
2801 ** is capable of writing to the databse.  That means the cursor was
2802 ** originally opened for writing and the cursor has not be disabled
2803 ** by having its state changed to CURSOR_FAULT.
2804 */
2805 static int countWriteCursors(BtShared *pBt){
2806   BtCursor *pCur;
2807   int r = 0;
2808   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2809     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
2810   }
2811   return r;
2812 }
2813 #endif
2814 
2815 /*
2816 ** This routine sets the state to CURSOR_FAULT and the error
2817 ** code to errCode for every cursor on BtShared that pBtree
2818 ** references.
2819 **
2820 ** Every cursor is tripped, including cursors that belong
2821 ** to other database connections that happen to be sharing
2822 ** the cache with pBtree.
2823 **
2824 ** This routine gets called when a rollback occurs.
2825 ** All cursors using the same cache must be tripped
2826 ** to prevent them from trying to use the btree after
2827 ** the rollback.  The rollback may have deleted tables
2828 ** or moved root pages, so it is not sufficient to
2829 ** save the state of the cursor.  The cursor must be
2830 ** invalidated.
2831 */
2832 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
2833   BtCursor *p;
2834   sqlite3BtreeEnter(pBtree);
2835   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
2836     int i;
2837     sqlite3BtreeClearCursor(p);
2838     p->eState = CURSOR_FAULT;
2839     p->skip = errCode;
2840     for(i=0; i<=p->iPage; i++){
2841       releasePage(p->apPage[i]);
2842       p->apPage[i] = 0;
2843     }
2844   }
2845   sqlite3BtreeLeave(pBtree);
2846 }
2847 
2848 /*
2849 ** Rollback the transaction in progress.  All cursors will be
2850 ** invalided by this operation.  Any attempt to use a cursor
2851 ** that was open at the beginning of this operation will result
2852 ** in an error.
2853 **
2854 ** This will release the write lock on the database file.  If there
2855 ** are no active cursors, it also releases the read lock.
2856 */
2857 int sqlite3BtreeRollback(Btree *p){
2858   int rc;
2859   BtShared *pBt = p->pBt;
2860   MemPage *pPage1;
2861 
2862   sqlite3BtreeEnter(p);
2863   rc = saveAllCursors(pBt, 0, 0);
2864 #ifndef SQLITE_OMIT_SHARED_CACHE
2865   if( rc!=SQLITE_OK ){
2866     /* This is a horrible situation. An IO or malloc() error occurred whilst
2867     ** trying to save cursor positions. If this is an automatic rollback (as
2868     ** the result of a constraint, malloc() failure or IO error) then
2869     ** the cache may be internally inconsistent (not contain valid trees) so
2870     ** we cannot simply return the error to the caller. Instead, abort
2871     ** all queries that may be using any of the cursors that failed to save.
2872     */
2873     sqlite3BtreeTripAllCursors(p, rc);
2874   }
2875 #endif
2876   btreeIntegrity(p);
2877 
2878   if( p->inTrans==TRANS_WRITE ){
2879     int rc2;
2880 
2881     assert( TRANS_WRITE==pBt->inTransaction );
2882     rc2 = sqlite3PagerRollback(pBt->pPager);
2883     if( rc2!=SQLITE_OK ){
2884       rc = rc2;
2885     }
2886 
2887     /* The rollback may have destroyed the pPage1->aData value.  So
2888     ** call sqlite3BtreeGetPage() on page 1 again to make
2889     ** sure pPage1->aData is set correctly. */
2890     if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
2891       releasePage(pPage1);
2892     }
2893     assert( countWriteCursors(pBt)==0 );
2894     pBt->inTransaction = TRANS_READ;
2895   }
2896 
2897   if( p->inTrans!=TRANS_NONE ){
2898     clearAllSharedCacheTableLocks(p);
2899     assert( pBt->nTransaction>0 );
2900     pBt->nTransaction--;
2901     if( 0==pBt->nTransaction ){
2902       pBt->inTransaction = TRANS_NONE;
2903     }
2904   }
2905 
2906   btreeClearHasContent(pBt);
2907   p->inTrans = TRANS_NONE;
2908   unlockBtreeIfUnused(pBt);
2909 
2910   btreeIntegrity(p);
2911   sqlite3BtreeLeave(p);
2912   return rc;
2913 }
2914 
2915 /*
2916 ** Start a statement subtransaction. The subtransaction can can be rolled
2917 ** back independently of the main transaction. You must start a transaction
2918 ** before starting a subtransaction. The subtransaction is ended automatically
2919 ** if the main transaction commits or rolls back.
2920 **
2921 ** Statement subtransactions are used around individual SQL statements
2922 ** that are contained within a BEGIN...COMMIT block.  If a constraint
2923 ** error occurs within the statement, the effect of that one statement
2924 ** can be rolled back without having to rollback the entire transaction.
2925 **
2926 ** A statement sub-transaction is implemented as an anonymous savepoint. The
2927 ** value passed as the second parameter is the total number of savepoints,
2928 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
2929 ** are no active savepoints and no other statement-transactions open,
2930 ** iStatement is 1. This anonymous savepoint can be released or rolled back
2931 ** using the sqlite3BtreeSavepoint() function.
2932 */
2933 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
2934   int rc;
2935   BtShared *pBt = p->pBt;
2936   sqlite3BtreeEnter(p);
2937   assert( p->inTrans==TRANS_WRITE );
2938   assert( pBt->readOnly==0 );
2939   assert( iStatement>0 );
2940   assert( iStatement>p->db->nSavepoint );
2941   if( NEVER(p->inTrans!=TRANS_WRITE || pBt->readOnly) ){
2942     rc = SQLITE_INTERNAL;
2943   }else{
2944     assert( pBt->inTransaction==TRANS_WRITE );
2945     /* At the pager level, a statement transaction is a savepoint with
2946     ** an index greater than all savepoints created explicitly using
2947     ** SQL statements. It is illegal to open, release or rollback any
2948     ** such savepoints while the statement transaction savepoint is active.
2949     */
2950     rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
2951   }
2952   sqlite3BtreeLeave(p);
2953   return rc;
2954 }
2955 
2956 /*
2957 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
2958 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
2959 ** savepoint identified by parameter iSavepoint, depending on the value
2960 ** of op.
2961 **
2962 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
2963 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
2964 ** contents of the entire transaction are rolled back. This is different
2965 ** from a normal transaction rollback, as no locks are released and the
2966 ** transaction remains open.
2967 */
2968 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
2969   int rc = SQLITE_OK;
2970   if( p && p->inTrans==TRANS_WRITE ){
2971     BtShared *pBt = p->pBt;
2972     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
2973     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
2974     sqlite3BtreeEnter(p);
2975     rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
2976     if( rc==SQLITE_OK ){
2977       rc = newDatabase(pBt);
2978     }
2979     sqlite3BtreeLeave(p);
2980   }
2981   return rc;
2982 }
2983 
2984 /*
2985 ** Create a new cursor for the BTree whose root is on the page
2986 ** iTable.  The act of acquiring a cursor gets a read lock on
2987 ** the database file.
2988 **
2989 ** If wrFlag==0, then the cursor can only be used for reading.
2990 ** If wrFlag==1, then the cursor can be used for reading or for
2991 ** writing if other conditions for writing are also met.  These
2992 ** are the conditions that must be met in order for writing to
2993 ** be allowed:
2994 **
2995 ** 1:  The cursor must have been opened with wrFlag==1
2996 **
2997 ** 2:  Other database connections that share the same pager cache
2998 **     but which are not in the READ_UNCOMMITTED state may not have
2999 **     cursors open with wrFlag==0 on the same table.  Otherwise
3000 **     the changes made by this write cursor would be visible to
3001 **     the read cursors in the other database connection.
3002 **
3003 ** 3:  The database must be writable (not on read-only media)
3004 **
3005 ** 4:  There must be an active transaction.
3006 **
3007 ** No checking is done to make sure that page iTable really is the
3008 ** root page of a b-tree.  If it is not, then the cursor acquired
3009 ** will not work correctly.
3010 **
3011 ** It is assumed that the sqlite3BtreeCursorSize() bytes of memory
3012 ** pointed to by pCur have been zeroed by the caller.
3013 */
3014 static int btreeCursor(
3015   Btree *p,                              /* The btree */
3016   int iTable,                            /* Root page of table to open */
3017   int wrFlag,                            /* 1 to write. 0 read-only */
3018   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
3019   BtCursor *pCur                         /* Space for new cursor */
3020 ){
3021   int rc;
3022   Pgno nPage;
3023   BtShared *pBt = p->pBt;
3024 
3025   assert( sqlite3BtreeHoldsMutex(p) );
3026   assert( wrFlag==0 || wrFlag==1 );
3027   if( wrFlag ){
3028     assert( !pBt->readOnly );
3029     if( NEVER(pBt->readOnly) ){
3030       return SQLITE_READONLY;
3031     }
3032     rc = checkForReadConflicts(p, iTable, 0, 0);
3033     if( rc!=SQLITE_OK ){
3034       assert( rc==SQLITE_LOCKED_SHAREDCACHE );
3035       return rc;
3036     }
3037   }
3038 
3039   if( pBt->pPage1==0 ){
3040     rc = lockBtreeWithRetry(p);
3041     if( rc!=SQLITE_OK ){
3042       return rc;
3043     }
3044   }
3045   pCur->pgnoRoot = (Pgno)iTable;
3046   rc = sqlite3PagerPagecount(pBt->pPager, (int *)&nPage);
3047   if( rc!=SQLITE_OK ){
3048     return rc;
3049   }
3050   if( iTable==1 && nPage==0 ){
3051     rc = SQLITE_EMPTY;
3052     goto create_cursor_exception;
3053   }
3054   rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
3055   if( rc!=SQLITE_OK ){
3056     goto create_cursor_exception;
3057   }
3058 
3059   /* Now that no other errors can occur, finish filling in the BtCursor
3060   ** variables, link the cursor into the BtShared list and set *ppCur (the
3061   ** output argument to this function).
3062   */
3063   pCur->pKeyInfo = pKeyInfo;
3064   pCur->pBtree = p;
3065   pCur->pBt = pBt;
3066   pCur->wrFlag = (u8)wrFlag;
3067   pCur->pNext = pBt->pCursor;
3068   if( pCur->pNext ){
3069     pCur->pNext->pPrev = pCur;
3070   }
3071   pBt->pCursor = pCur;
3072   pCur->eState = CURSOR_INVALID;
3073   pCur->cachedRowid = 0;
3074 
3075   return SQLITE_OK;
3076 
3077 create_cursor_exception:
3078   releasePage(pCur->apPage[0]);
3079   unlockBtreeIfUnused(pBt);
3080   return rc;
3081 }
3082 int sqlite3BtreeCursor(
3083   Btree *p,                                   /* The btree */
3084   int iTable,                                 /* Root page of table to open */
3085   int wrFlag,                                 /* 1 to write. 0 read-only */
3086   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
3087   BtCursor *pCur                              /* Write new cursor here */
3088 ){
3089   int rc;
3090   sqlite3BtreeEnter(p);
3091   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
3092   sqlite3BtreeLeave(p);
3093   return rc;
3094 }
3095 
3096 /*
3097 ** Return the size of a BtCursor object in bytes.
3098 **
3099 ** This interfaces is needed so that users of cursors can preallocate
3100 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
3101 ** to users so they cannot do the sizeof() themselves - they must call
3102 ** this routine.
3103 */
3104 int sqlite3BtreeCursorSize(void){
3105   return sizeof(BtCursor);
3106 }
3107 
3108 /*
3109 ** Set the cached rowid value of every cursor in the same database file
3110 ** as pCur and having the same root page number as pCur.  The value is
3111 ** set to iRowid.
3112 **
3113 ** Only positive rowid values are considered valid for this cache.
3114 ** The cache is initialized to zero, indicating an invalid cache.
3115 ** A btree will work fine with zero or negative rowids.  We just cannot
3116 ** cache zero or negative rowids, which means tables that use zero or
3117 ** negative rowids might run a little slower.  But in practice, zero
3118 ** or negative rowids are very uncommon so this should not be a problem.
3119 */
3120 void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){
3121   BtCursor *p;
3122   for(p=pCur->pBt->pCursor; p; p=p->pNext){
3123     if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid;
3124   }
3125   assert( pCur->cachedRowid==iRowid );
3126 }
3127 
3128 /*
3129 ** Return the cached rowid for the given cursor.  A negative or zero
3130 ** return value indicates that the rowid cache is invalid and should be
3131 ** ignored.  If the rowid cache has never before been set, then a
3132 ** zero is returned.
3133 */
3134 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){
3135   return pCur->cachedRowid;
3136 }
3137 
3138 /*
3139 ** Close a cursor.  The read lock on the database file is released
3140 ** when the last cursor is closed.
3141 */
3142 int sqlite3BtreeCloseCursor(BtCursor *pCur){
3143   Btree *pBtree = pCur->pBtree;
3144   if( pBtree ){
3145     int i;
3146     BtShared *pBt = pCur->pBt;
3147     sqlite3BtreeEnter(pBtree);
3148     sqlite3BtreeClearCursor(pCur);
3149     if( pCur->pPrev ){
3150       pCur->pPrev->pNext = pCur->pNext;
3151     }else{
3152       pBt->pCursor = pCur->pNext;
3153     }
3154     if( pCur->pNext ){
3155       pCur->pNext->pPrev = pCur->pPrev;
3156     }
3157     for(i=0; i<=pCur->iPage; i++){
3158       releasePage(pCur->apPage[i]);
3159     }
3160     unlockBtreeIfUnused(pBt);
3161     invalidateOverflowCache(pCur);
3162     /* sqlite3_free(pCur); */
3163     sqlite3BtreeLeave(pBtree);
3164   }
3165   return SQLITE_OK;
3166 }
3167 
3168 /*
3169 ** Make a temporary cursor by filling in the fields of pTempCur.
3170 ** The temporary cursor is not on the cursor list for the Btree.
3171 */
3172 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
3173   int i;
3174   assert( cursorHoldsMutex(pCur) );
3175   memcpy(pTempCur, pCur, sizeof(BtCursor));
3176   pTempCur->pNext = 0;
3177   pTempCur->pPrev = 0;
3178   for(i=0; i<=pTempCur->iPage; i++){
3179     sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
3180   }
3181   assert( pTempCur->pKey==0 );
3182 }
3183 
3184 /*
3185 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
3186 ** function above.
3187 */
3188 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
3189   int i;
3190   assert( cursorHoldsMutex(pCur) );
3191   for(i=0; i<=pCur->iPage; i++){
3192     sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
3193   }
3194   sqlite3_free(pCur->pKey);
3195 }
3196 
3197 
3198 
3199 /*
3200 ** Make sure the BtCursor* given in the argument has a valid
3201 ** BtCursor.info structure.  If it is not already valid, call
3202 ** sqlite3BtreeParseCell() to fill it in.
3203 **
3204 ** BtCursor.info is a cache of the information in the current cell.
3205 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
3206 **
3207 ** 2007-06-25:  There is a bug in some versions of MSVC that cause the
3208 ** compiler to crash when getCellInfo() is implemented as a macro.
3209 ** But there is a measureable speed advantage to using the macro on gcc
3210 ** (when less compiler optimizations like -Os or -O0 are used and the
3211 ** compiler is not doing agressive inlining.)  So we use a real function
3212 ** for MSVC and a macro for everything else.  Ticket #2457.
3213 */
3214 #ifndef NDEBUG
3215   static void assertCellInfo(BtCursor *pCur){
3216     CellInfo info;
3217     int iPage = pCur->iPage;
3218     memset(&info, 0, sizeof(info));
3219     sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
3220     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
3221   }
3222 #else
3223   #define assertCellInfo(x)
3224 #endif
3225 #ifdef _MSC_VER
3226   /* Use a real function in MSVC to work around bugs in that compiler. */
3227   static void getCellInfo(BtCursor *pCur){
3228     if( pCur->info.nSize==0 ){
3229       int iPage = pCur->iPage;
3230       sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
3231       pCur->validNKey = 1;
3232     }else{
3233       assertCellInfo(pCur);
3234     }
3235   }
3236 #else /* if not _MSC_VER */
3237   /* Use a macro in all other compilers so that the function is inlined */
3238 #define getCellInfo(pCur)                                                      \
3239   if( pCur->info.nSize==0 ){                                                   \
3240     int iPage = pCur->iPage;                                                   \
3241     sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
3242     pCur->validNKey = 1;                                                       \
3243   }else{                                                                       \
3244     assertCellInfo(pCur);                                                      \
3245   }
3246 #endif /* _MSC_VER */
3247 
3248 /*
3249 ** Set *pSize to the size of the buffer needed to hold the value of
3250 ** the key for the current entry.  If the cursor is not pointing
3251 ** to a valid entry, *pSize is set to 0.
3252 **
3253 ** For a table with the INTKEY flag set, this routine returns the key
3254 ** itself, not the number of bytes in the key.
3255 */
3256 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
3257   int rc;
3258 
3259   assert( cursorHoldsMutex(pCur) );
3260   rc = restoreCursorPosition(pCur);
3261   if( rc==SQLITE_OK ){
3262     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3263     if( pCur->eState==CURSOR_INVALID ){
3264       *pSize = 0;
3265     }else{
3266       getCellInfo(pCur);
3267       *pSize = pCur->info.nKey;
3268     }
3269   }
3270   return rc;
3271 }
3272 
3273 /*
3274 ** Set *pSize to the number of bytes of data in the entry the
3275 ** cursor currently points to.  Always return SQLITE_OK.
3276 ** Failure is not possible.  If the cursor is not currently
3277 ** pointing to an entry (which can happen, for example, if
3278 ** the database is empty) then *pSize is set to 0.
3279 */
3280 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
3281   int rc;
3282 
3283   assert( cursorHoldsMutex(pCur) );
3284   rc = restoreCursorPosition(pCur);
3285   if( rc==SQLITE_OK ){
3286     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3287     if( pCur->eState==CURSOR_INVALID ){
3288       /* Not pointing at a valid entry - set *pSize to 0. */
3289       *pSize = 0;
3290     }else{
3291       getCellInfo(pCur);
3292       *pSize = pCur->info.nData;
3293     }
3294   }
3295   return rc;
3296 }
3297 
3298 /*
3299 ** Given the page number of an overflow page in the database (parameter
3300 ** ovfl), this function finds the page number of the next page in the
3301 ** linked list of overflow pages. If possible, it uses the auto-vacuum
3302 ** pointer-map data instead of reading the content of page ovfl to do so.
3303 **
3304 ** If an error occurs an SQLite error code is returned. Otherwise:
3305 **
3306 ** The page number of the next overflow page in the linked list is
3307 ** written to *pPgnoNext. If page ovfl is the last page in its linked
3308 ** list, *pPgnoNext is set to zero.
3309 **
3310 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
3311 ** to page number pOvfl was obtained, then *ppPage is set to point to that
3312 ** reference. It is the responsibility of the caller to call releasePage()
3313 ** on *ppPage to free the reference. In no reference was obtained (because
3314 ** the pointer-map was used to obtain the value for *pPgnoNext), then
3315 ** *ppPage is set to zero.
3316 */
3317 static int getOverflowPage(
3318   BtShared *pBt,
3319   Pgno ovfl,                   /* Overflow page */
3320   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
3321   Pgno *pPgnoNext              /* OUT: Next overflow page number */
3322 ){
3323   Pgno next = 0;
3324   MemPage *pPage = 0;
3325   int rc = SQLITE_OK;
3326 
3327   assert( sqlite3_mutex_held(pBt->mutex) );
3328   assert(pPgnoNext);
3329 
3330 #ifndef SQLITE_OMIT_AUTOVACUUM
3331   /* Try to find the next page in the overflow list using the
3332   ** autovacuum pointer-map pages. Guess that the next page in
3333   ** the overflow list is page number (ovfl+1). If that guess turns
3334   ** out to be wrong, fall back to loading the data of page
3335   ** number ovfl to determine the next page number.
3336   */
3337   if( pBt->autoVacuum ){
3338     Pgno pgno;
3339     Pgno iGuess = ovfl+1;
3340     u8 eType;
3341 
3342     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3343       iGuess++;
3344     }
3345 
3346     if( iGuess<=pagerPagecount(pBt) ){
3347       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3348       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3349         next = iGuess;
3350         rc = SQLITE_DONE;
3351       }
3352     }
3353   }
3354 #endif
3355 
3356   if( rc==SQLITE_OK ){
3357     rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, 0);
3358     assert(rc==SQLITE_OK || pPage==0);
3359     if( next==0 && rc==SQLITE_OK ){
3360       next = get4byte(pPage->aData);
3361     }
3362   }
3363 
3364   *pPgnoNext = next;
3365   if( ppPage ){
3366     *ppPage = pPage;
3367   }else{
3368     releasePage(pPage);
3369   }
3370   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
3371 }
3372 
3373 /*
3374 ** Copy data from a buffer to a page, or from a page to a buffer.
3375 **
3376 ** pPayload is a pointer to data stored on database page pDbPage.
3377 ** If argument eOp is false, then nByte bytes of data are copied
3378 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3379 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3380 ** of data are copied from the buffer pBuf to pPayload.
3381 **
3382 ** SQLITE_OK is returned on success, otherwise an error code.
3383 */
3384 static int copyPayload(
3385   void *pPayload,           /* Pointer to page data */
3386   void *pBuf,               /* Pointer to buffer */
3387   int nByte,                /* Number of bytes to copy */
3388   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
3389   DbPage *pDbPage           /* Page containing pPayload */
3390 ){
3391   if( eOp ){
3392     /* Copy data from buffer to page (a write operation) */
3393     int rc = sqlite3PagerWrite(pDbPage);
3394     if( rc!=SQLITE_OK ){
3395       return rc;
3396     }
3397     memcpy(pPayload, pBuf, nByte);
3398   }else{
3399     /* Copy data from page to buffer (a read operation) */
3400     memcpy(pBuf, pPayload, nByte);
3401   }
3402   return SQLITE_OK;
3403 }
3404 
3405 /*
3406 ** This function is used to read or overwrite payload information
3407 ** for the entry that the pCur cursor is pointing to. If the eOp
3408 ** parameter is 0, this is a read operation (data copied into
3409 ** buffer pBuf). If it is non-zero, a write (data copied from
3410 ** buffer pBuf).
3411 **
3412 ** A total of "amt" bytes are read or written beginning at "offset".
3413 ** Data is read to or from the buffer pBuf.
3414 **
3415 ** This routine does not make a distinction between key and data.
3416 ** It just reads or writes bytes from the payload area.  Data might
3417 ** appear on the main page or be scattered out on multiple overflow
3418 ** pages.
3419 **
3420 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3421 ** cursor entry uses one or more overflow pages, this function
3422 ** allocates space for and lazily populates the overflow page-list
3423 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3424 ** cache to make seeking to the supplied offset more efficient.
3425 **
3426 ** Once an overflow page-list cache has been allocated, it may be
3427 ** invalidated if some other cursor writes to the same table, or if
3428 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3429 ** mode, the following events may invalidate an overflow page-list cache.
3430 **
3431 **   * An incremental vacuum,
3432 **   * A commit in auto_vacuum="full" mode,
3433 **   * Creating a table (may require moving an overflow page).
3434 */
3435 static int accessPayload(
3436   BtCursor *pCur,      /* Cursor pointing to entry to read from */
3437   u32 offset,          /* Begin reading this far into payload */
3438   u32 amt,             /* Read this many bytes */
3439   unsigned char *pBuf, /* Write the bytes into this buffer */
3440   int skipKey,         /* offset begins at data if this is true */
3441   int eOp              /* zero to read. non-zero to write. */
3442 ){
3443   unsigned char *aPayload;
3444   int rc = SQLITE_OK;
3445   u32 nKey;
3446   int iIdx = 0;
3447   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3448   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
3449 
3450   assert( pPage );
3451   assert( pCur->eState==CURSOR_VALID );
3452   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3453   assert( cursorHoldsMutex(pCur) );
3454 
3455   getCellInfo(pCur);
3456   aPayload = pCur->info.pCell + pCur->info.nHeader;
3457   nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey);
3458 
3459   if( skipKey ){
3460     offset += nKey;
3461   }
3462   if( offset+amt > nKey+pCur->info.nData
3463    || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
3464   ){
3465     /* Trying to read or write past the end of the data is an error */
3466     return SQLITE_CORRUPT_BKPT;
3467   }
3468 
3469   /* Check if data must be read/written to/from the btree page itself. */
3470   if( offset<pCur->info.nLocal ){
3471     int a = amt;
3472     if( a+offset>pCur->info.nLocal ){
3473       a = pCur->info.nLocal - offset;
3474     }
3475     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3476     offset = 0;
3477     pBuf += a;
3478     amt -= a;
3479   }else{
3480     offset -= pCur->info.nLocal;
3481   }
3482 
3483   if( rc==SQLITE_OK && amt>0 ){
3484     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
3485     Pgno nextPage;
3486 
3487     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3488 
3489 #ifndef SQLITE_OMIT_INCRBLOB
3490     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3491     ** has not been allocated, allocate it now. The array is sized at
3492     ** one entry for each overflow page in the overflow chain. The
3493     ** page number of the first overflow page is stored in aOverflow[0],
3494     ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3495     ** (the cache is lazily populated).
3496     */
3497     if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3498       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3499       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3500       if( nOvfl && !pCur->aOverflow ){
3501         rc = SQLITE_NOMEM;
3502       }
3503     }
3504 
3505     /* If the overflow page-list cache has been allocated and the
3506     ** entry for the first required overflow page is valid, skip
3507     ** directly to it.
3508     */
3509     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3510       iIdx = (offset/ovflSize);
3511       nextPage = pCur->aOverflow[iIdx];
3512       offset = (offset%ovflSize);
3513     }
3514 #endif
3515 
3516     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3517 
3518 #ifndef SQLITE_OMIT_INCRBLOB
3519       /* If required, populate the overflow page-list cache. */
3520       if( pCur->aOverflow ){
3521         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3522         pCur->aOverflow[iIdx] = nextPage;
3523       }
3524 #endif
3525 
3526       if( offset>=ovflSize ){
3527         /* The only reason to read this page is to obtain the page
3528         ** number for the next page in the overflow chain. The page
3529         ** data is not required. So first try to lookup the overflow
3530         ** page-list cache, if any, then fall back to the getOverflowPage()
3531         ** function.
3532         */
3533 #ifndef SQLITE_OMIT_INCRBLOB
3534         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3535           nextPage = pCur->aOverflow[iIdx+1];
3536         } else
3537 #endif
3538           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3539         offset -= ovflSize;
3540       }else{
3541         /* Need to read this page properly. It contains some of the
3542         ** range of data that is being read (eOp==0) or written (eOp!=0).
3543         */
3544         DbPage *pDbPage;
3545         int a = amt;
3546         rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3547         if( rc==SQLITE_OK ){
3548           aPayload = sqlite3PagerGetData(pDbPage);
3549           nextPage = get4byte(aPayload);
3550           if( a + offset > ovflSize ){
3551             a = ovflSize - offset;
3552           }
3553           rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3554           sqlite3PagerUnref(pDbPage);
3555           offset = 0;
3556           amt -= a;
3557           pBuf += a;
3558         }
3559       }
3560     }
3561   }
3562 
3563   if( rc==SQLITE_OK && amt>0 ){
3564     return SQLITE_CORRUPT_BKPT;
3565   }
3566   return rc;
3567 }
3568 
3569 /*
3570 ** Read part of the key associated with cursor pCur.  Exactly
3571 ** "amt" bytes will be transfered into pBuf[].  The transfer
3572 ** begins at "offset".
3573 **
3574 ** Return SQLITE_OK on success or an error code if anything goes
3575 ** wrong.  An error is returned if "offset+amt" is larger than
3576 ** the available payload.
3577 */
3578 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3579   int rc;
3580 
3581   assert( cursorHoldsMutex(pCur) );
3582   rc = restoreCursorPosition(pCur);
3583   if( rc==SQLITE_OK ){
3584     assert( pCur->eState==CURSOR_VALID );
3585     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3586     if( pCur->apPage[0]->intKey ){
3587       return SQLITE_CORRUPT_BKPT;
3588     }
3589     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3590     rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
3591   }
3592   return rc;
3593 }
3594 
3595 /*
3596 ** Read part of the data associated with cursor pCur.  Exactly
3597 ** "amt" bytes will be transfered into pBuf[].  The transfer
3598 ** begins at "offset".
3599 **
3600 ** Return SQLITE_OK on success or an error code if anything goes
3601 ** wrong.  An error is returned if "offset+amt" is larger than
3602 ** the available payload.
3603 */
3604 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3605   int rc;
3606 
3607 #ifndef SQLITE_OMIT_INCRBLOB
3608   if ( pCur->eState==CURSOR_INVALID ){
3609     return SQLITE_ABORT;
3610   }
3611 #endif
3612 
3613   assert( cursorHoldsMutex(pCur) );
3614   rc = restoreCursorPosition(pCur);
3615   if( rc==SQLITE_OK ){
3616     assert( pCur->eState==CURSOR_VALID );
3617     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3618     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3619     rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
3620   }
3621   return rc;
3622 }
3623 
3624 /*
3625 ** Return a pointer to payload information from the entry that the
3626 ** pCur cursor is pointing to.  The pointer is to the beginning of
3627 ** the key if skipKey==0 and it points to the beginning of data if
3628 ** skipKey==1.  The number of bytes of available key/data is written
3629 ** into *pAmt.  If *pAmt==0, then the value returned will not be
3630 ** a valid pointer.
3631 **
3632 ** This routine is an optimization.  It is common for the entire key
3633 ** and data to fit on the local page and for there to be no overflow
3634 ** pages.  When that is so, this routine can be used to access the
3635 ** key and data without making a copy.  If the key and/or data spills
3636 ** onto overflow pages, then accessPayload() must be used to reassemble
3637 ** the key/data and copy it into a preallocated buffer.
3638 **
3639 ** The pointer returned by this routine looks directly into the cached
3640 ** page of the database.  The data might change or move the next time
3641 ** any btree routine is called.
3642 */
3643 static const unsigned char *fetchPayload(
3644   BtCursor *pCur,      /* Cursor pointing to entry to read from */
3645   int *pAmt,           /* Write the number of available bytes here */
3646   int skipKey          /* read beginning at data if this is true */
3647 ){
3648   unsigned char *aPayload;
3649   MemPage *pPage;
3650   u32 nKey;
3651   u32 nLocal;
3652 
3653   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
3654   assert( pCur->eState==CURSOR_VALID );
3655   assert( cursorHoldsMutex(pCur) );
3656   pPage = pCur->apPage[pCur->iPage];
3657   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3658   getCellInfo(pCur);
3659   aPayload = pCur->info.pCell;
3660   aPayload += pCur->info.nHeader;
3661   if( pPage->intKey ){
3662     nKey = 0;
3663   }else{
3664     nKey = (int)pCur->info.nKey;
3665   }
3666   if( skipKey ){
3667     aPayload += nKey;
3668     nLocal = pCur->info.nLocal - nKey;
3669   }else{
3670     nLocal = pCur->info.nLocal;
3671     if( nLocal>nKey ){
3672       nLocal = nKey;
3673     }
3674   }
3675   *pAmt = nLocal;
3676   return aPayload;
3677 }
3678 
3679 
3680 /*
3681 ** For the entry that cursor pCur is point to, return as
3682 ** many bytes of the key or data as are available on the local
3683 ** b-tree page.  Write the number of available bytes into *pAmt.
3684 **
3685 ** The pointer returned is ephemeral.  The key/data may move
3686 ** or be destroyed on the next call to any Btree routine,
3687 ** including calls from other threads against the same cache.
3688 ** Hence, a mutex on the BtShared should be held prior to calling
3689 ** this routine.
3690 **
3691 ** These routines is used to get quick access to key and data
3692 ** in the common case where no overflow pages are used.
3693 */
3694 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
3695   assert( cursorHoldsMutex(pCur) );
3696   if( pCur->eState==CURSOR_VALID ){
3697     return (const void*)fetchPayload(pCur, pAmt, 0);
3698   }
3699   return 0;
3700 }
3701 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
3702   assert( cursorHoldsMutex(pCur) );
3703   if( pCur->eState==CURSOR_VALID ){
3704     return (const void*)fetchPayload(pCur, pAmt, 1);
3705   }
3706   return 0;
3707 }
3708 
3709 
3710 /*
3711 ** Move the cursor down to a new child page.  The newPgno argument is the
3712 ** page number of the child page to move to.
3713 */
3714 static int moveToChild(BtCursor *pCur, u32 newPgno){
3715   int rc;
3716   int i = pCur->iPage;
3717   MemPage *pNewPage;
3718   BtShared *pBt = pCur->pBt;
3719 
3720   assert( cursorHoldsMutex(pCur) );
3721   assert( pCur->eState==CURSOR_VALID );
3722   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
3723   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
3724     return SQLITE_CORRUPT_BKPT;
3725   }
3726   rc = getAndInitPage(pBt, newPgno, &pNewPage);
3727   if( rc ) return rc;
3728   pCur->apPage[i+1] = pNewPage;
3729   pCur->aiIdx[i+1] = 0;
3730   pCur->iPage++;
3731 
3732   pCur->info.nSize = 0;
3733   pCur->validNKey = 0;
3734   if( pNewPage->nCell<1 ){
3735     return SQLITE_CORRUPT_BKPT;
3736   }
3737   return SQLITE_OK;
3738 }
3739 
3740 #ifndef NDEBUG
3741 /*
3742 ** Page pParent is an internal (non-leaf) tree page. This function
3743 ** asserts that page number iChild is the left-child if the iIdx'th
3744 ** cell in page pParent. Or, if iIdx is equal to the total number of
3745 ** cells in pParent, that page number iChild is the right-child of
3746 ** the page.
3747 */
3748 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
3749   assert( iIdx<=pParent->nCell );
3750   if( iIdx==pParent->nCell ){
3751     assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
3752   }else{
3753     assert( get4byte(findCell(pParent, iIdx))==iChild );
3754   }
3755 }
3756 #else
3757 #  define assertParentIndex(x,y,z)
3758 #endif
3759 
3760 /*
3761 ** Move the cursor up to the parent page.
3762 **
3763 ** pCur->idx is set to the cell index that contains the pointer
3764 ** to the page we are coming from.  If we are coming from the
3765 ** right-most child page then pCur->idx is set to one more than
3766 ** the largest cell index.
3767 */
3768 void sqlite3BtreeMoveToParent(BtCursor *pCur){
3769   assert( cursorHoldsMutex(pCur) );
3770   assert( pCur->eState==CURSOR_VALID );
3771   assert( pCur->iPage>0 );
3772   assert( pCur->apPage[pCur->iPage] );
3773   assertParentIndex(
3774     pCur->apPage[pCur->iPage-1],
3775     pCur->aiIdx[pCur->iPage-1],
3776     pCur->apPage[pCur->iPage]->pgno
3777   );
3778   releasePage(pCur->apPage[pCur->iPage]);
3779   pCur->iPage--;
3780   pCur->info.nSize = 0;
3781   pCur->validNKey = 0;
3782 }
3783 
3784 /*
3785 ** Move the cursor to the root page
3786 */
3787 static int moveToRoot(BtCursor *pCur){
3788   MemPage *pRoot;
3789   int rc = SQLITE_OK;
3790   Btree *p = pCur->pBtree;
3791   BtShared *pBt = p->pBt;
3792 
3793   assert( cursorHoldsMutex(pCur) );
3794   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
3795   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
3796   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
3797   if( pCur->eState>=CURSOR_REQUIRESEEK ){
3798     if( pCur->eState==CURSOR_FAULT ){
3799       return pCur->skip;
3800     }
3801     sqlite3BtreeClearCursor(pCur);
3802   }
3803 
3804   if( pCur->iPage>=0 ){
3805     int i;
3806     for(i=1; i<=pCur->iPage; i++){
3807       releasePage(pCur->apPage[i]);
3808     }
3809   }else{
3810     if(
3811       SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
3812     ){
3813       pCur->eState = CURSOR_INVALID;
3814       return rc;
3815     }
3816   }
3817 
3818   pRoot = pCur->apPage[0];
3819   assert( pRoot->pgno==pCur->pgnoRoot );
3820   pCur->iPage = 0;
3821   pCur->aiIdx[0] = 0;
3822   pCur->info.nSize = 0;
3823   pCur->atLast = 0;
3824   pCur->validNKey = 0;
3825 
3826   if( pRoot->nCell==0 && !pRoot->leaf ){
3827     Pgno subpage;
3828     assert( pRoot->pgno==1 );
3829     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
3830     assert( subpage>0 );
3831     pCur->eState = CURSOR_VALID;
3832     rc = moveToChild(pCur, subpage);
3833   }else{
3834     pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
3835   }
3836   return rc;
3837 }
3838 
3839 /*
3840 ** Move the cursor down to the left-most leaf entry beneath the
3841 ** entry to which it is currently pointing.
3842 **
3843 ** The left-most leaf is the one with the smallest key - the first
3844 ** in ascending order.
3845 */
3846 static int moveToLeftmost(BtCursor *pCur){
3847   Pgno pgno;
3848   int rc = SQLITE_OK;
3849   MemPage *pPage;
3850 
3851   assert( cursorHoldsMutex(pCur) );
3852   assert( pCur->eState==CURSOR_VALID );
3853   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
3854     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3855     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
3856     rc = moveToChild(pCur, pgno);
3857   }
3858   return rc;
3859 }
3860 
3861 /*
3862 ** Move the cursor down to the right-most leaf entry beneath the
3863 ** page to which it is currently pointing.  Notice the difference
3864 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
3865 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3866 ** finds the right-most entry beneath the *page*.
3867 **
3868 ** The right-most entry is the one with the largest key - the last
3869 ** key in ascending order.
3870 */
3871 static int moveToRightmost(BtCursor *pCur){
3872   Pgno pgno;
3873   int rc = SQLITE_OK;
3874   MemPage *pPage = 0;
3875 
3876   assert( cursorHoldsMutex(pCur) );
3877   assert( pCur->eState==CURSOR_VALID );
3878   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
3879     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3880     pCur->aiIdx[pCur->iPage] = pPage->nCell;
3881     rc = moveToChild(pCur, pgno);
3882   }
3883   if( rc==SQLITE_OK ){
3884     pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
3885     pCur->info.nSize = 0;
3886     pCur->validNKey = 0;
3887   }
3888   return rc;
3889 }
3890 
3891 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
3892 ** on success.  Set *pRes to 0 if the cursor actually points to something
3893 ** or set *pRes to 1 if the table is empty.
3894 */
3895 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
3896   int rc;
3897 
3898   assert( cursorHoldsMutex(pCur) );
3899   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3900   rc = moveToRoot(pCur);
3901   if( rc==SQLITE_OK ){
3902     if( pCur->eState==CURSOR_INVALID ){
3903       assert( pCur->apPage[pCur->iPage]->nCell==0 );
3904       *pRes = 1;
3905       rc = SQLITE_OK;
3906     }else{
3907       assert( pCur->apPage[pCur->iPage]->nCell>0 );
3908       *pRes = 0;
3909       rc = moveToLeftmost(pCur);
3910     }
3911   }
3912   return rc;
3913 }
3914 
3915 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
3916 ** on success.  Set *pRes to 0 if the cursor actually points to something
3917 ** or set *pRes to 1 if the table is empty.
3918 */
3919 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
3920   int rc;
3921 
3922   assert( cursorHoldsMutex(pCur) );
3923   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3924   rc = moveToRoot(pCur);
3925   if( rc==SQLITE_OK ){
3926     if( CURSOR_INVALID==pCur->eState ){
3927       assert( pCur->apPage[pCur->iPage]->nCell==0 );
3928       *pRes = 1;
3929     }else{
3930       assert( pCur->eState==CURSOR_VALID );
3931       *pRes = 0;
3932       rc = moveToRightmost(pCur);
3933       pCur->atLast = rc==SQLITE_OK ?1:0;
3934     }
3935   }
3936   return rc;
3937 }
3938 
3939 /* Move the cursor so that it points to an entry near the key
3940 ** specified by pIdxKey or intKey.   Return a success code.
3941 **
3942 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
3943 ** must be NULL.  For index tables, pIdxKey is used and intKey
3944 ** is ignored.
3945 **
3946 ** If an exact match is not found, then the cursor is always
3947 ** left pointing at a leaf page which would hold the entry if it
3948 ** were present.  The cursor might point to an entry that comes
3949 ** before or after the key.
3950 **
3951 ** An integer is written into *pRes which is the result of
3952 ** comparing the key with the entry to which the cursor is
3953 ** pointing.  The meaning of the integer written into
3954 ** *pRes is as follows:
3955 **
3956 **     *pRes<0      The cursor is left pointing at an entry that
3957 **                  is smaller than intKey/pIdxKey or if the table is empty
3958 **                  and the cursor is therefore left point to nothing.
3959 **
3960 **     *pRes==0     The cursor is left pointing at an entry that
3961 **                  exactly matches intKey/pIdxKey.
3962 **
3963 **     *pRes>0      The cursor is left pointing at an entry that
3964 **                  is larger than intKey/pIdxKey.
3965 **
3966 */
3967 int sqlite3BtreeMovetoUnpacked(
3968   BtCursor *pCur,          /* The cursor to be moved */
3969   UnpackedRecord *pIdxKey, /* Unpacked index key */
3970   i64 intKey,              /* The table key */
3971   int biasRight,           /* If true, bias the search to the high end */
3972   int *pRes                /* Write search results here */
3973 ){
3974   int rc;
3975 
3976   assert( cursorHoldsMutex(pCur) );
3977   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3978 
3979   /* If the cursor is already positioned at the point we are trying
3980   ** to move to, then just return without doing any work */
3981   if( pCur->eState==CURSOR_VALID && pCur->validNKey
3982    && pCur->apPage[0]->intKey
3983   ){
3984     if( pCur->info.nKey==intKey ){
3985       *pRes = 0;
3986       return SQLITE_OK;
3987     }
3988     if( pCur->atLast && pCur->info.nKey<intKey ){
3989       *pRes = -1;
3990       return SQLITE_OK;
3991     }
3992   }
3993 
3994   rc = moveToRoot(pCur);
3995   if( rc ){
3996     return rc;
3997   }
3998   assert( pCur->apPage[pCur->iPage] );
3999   assert( pCur->apPage[pCur->iPage]->isInit );
4000   if( pCur->eState==CURSOR_INVALID ){
4001     *pRes = -1;
4002     assert( pCur->apPage[pCur->iPage]->nCell==0 );
4003     return SQLITE_OK;
4004   }
4005   assert( pCur->apPage[0]->intKey || pIdxKey );
4006   for(;;){
4007     int lwr, upr;
4008     Pgno chldPg;
4009     MemPage *pPage = pCur->apPage[pCur->iPage];
4010     int c = -1;  /* pRes return if table is empty must be -1 */
4011     lwr = 0;
4012     upr = pPage->nCell-1;
4013     if( (!pPage->intKey && pIdxKey==0) || upr<0 ){
4014       rc = SQLITE_CORRUPT_BKPT;
4015       goto moveto_finish;
4016     }
4017     if( biasRight ){
4018       pCur->aiIdx[pCur->iPage] = (u16)upr;
4019     }else{
4020       pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2);
4021     }
4022     for(;;){
4023       void *pCellKey;
4024       i64 nCellKey;
4025       int idx = pCur->aiIdx[pCur->iPage];
4026       pCur->info.nSize = 0;
4027       pCur->validNKey = 1;
4028       if( pPage->intKey ){
4029         u8 *pCell;
4030         pCell = findCell(pPage, idx) + pPage->childPtrSize;
4031         if( pPage->hasData ){
4032           u32 dummy;
4033           pCell += getVarint32(pCell, dummy);
4034         }
4035         getVarint(pCell, (u64*)&nCellKey);
4036         if( nCellKey==intKey ){
4037           c = 0;
4038         }else if( nCellKey<intKey ){
4039           c = -1;
4040         }else{
4041           assert( nCellKey>intKey );
4042           c = +1;
4043         }
4044       }else{
4045         int available;
4046         pCellKey = (void *)fetchPayload(pCur, &available, 0);
4047         nCellKey = pCur->info.nKey;
4048         if( available>=nCellKey ){
4049           c = sqlite3VdbeRecordCompare((int)nCellKey, pCellKey, pIdxKey);
4050         }else{
4051           pCellKey = sqlite3Malloc( (int)nCellKey );
4052           if( pCellKey==0 ){
4053             rc = SQLITE_NOMEM;
4054             goto moveto_finish;
4055           }
4056           rc = sqlite3BtreeKey(pCur, 0, (int)nCellKey, (void*)pCellKey);
4057           c = sqlite3VdbeRecordCompare((int)nCellKey, pCellKey, pIdxKey);
4058           sqlite3_free(pCellKey);
4059           if( rc ) goto moveto_finish;
4060         }
4061       }
4062       if( c==0 ){
4063         pCur->info.nKey = nCellKey;
4064         if( pPage->intKey && !pPage->leaf ){
4065           lwr = idx;
4066           upr = lwr - 1;
4067           break;
4068         }else{
4069           *pRes = 0;
4070           rc = SQLITE_OK;
4071           goto moveto_finish;
4072         }
4073       }
4074       if( c<0 ){
4075         lwr = idx+1;
4076       }else{
4077         upr = idx-1;
4078       }
4079       if( lwr>upr ){
4080         pCur->info.nKey = nCellKey;
4081         break;
4082       }
4083       pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2);
4084     }
4085     assert( lwr==upr+1 );
4086     assert( pPage->isInit );
4087     if( pPage->leaf ){
4088       chldPg = 0;
4089     }else if( lwr>=pPage->nCell ){
4090       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4091     }else{
4092       chldPg = get4byte(findCell(pPage, lwr));
4093     }
4094     if( chldPg==0 ){
4095       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4096       if( pRes ) *pRes = c;
4097       rc = SQLITE_OK;
4098       goto moveto_finish;
4099     }
4100     pCur->aiIdx[pCur->iPage] = (u16)lwr;
4101     pCur->info.nSize = 0;
4102     pCur->validNKey = 0;
4103     rc = moveToChild(pCur, chldPg);
4104     if( rc ) goto moveto_finish;
4105   }
4106 moveto_finish:
4107   return rc;
4108 }
4109 
4110 /*
4111 ** In this version of BtreeMoveto, pKey is a packed index record
4112 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
4113 ** record and then call BtreeMovetoUnpacked() to do the work.
4114 */
4115 int sqlite3BtreeMoveto(
4116   BtCursor *pCur,     /* Cursor open on the btree to be searched */
4117   const void *pKey,   /* Packed key if the btree is an index */
4118   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
4119   int bias,           /* Bias search to the high end */
4120   int *pRes           /* Write search results here */
4121 ){
4122   int rc;                    /* Status code */
4123   UnpackedRecord *pIdxKey;   /* Unpacked index key */
4124   char aSpace[150];          /* Temp space for pIdxKey - to avoid a malloc */
4125 
4126 
4127   if( pKey ){
4128     assert( nKey==(i64)(int)nKey );
4129     pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey,
4130                                       aSpace, sizeof(aSpace));
4131     if( pIdxKey==0 ) return SQLITE_NOMEM;
4132   }else{
4133     pIdxKey = 0;
4134   }
4135   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
4136   if( pKey ){
4137     sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
4138   }
4139   return rc;
4140 }
4141 
4142 
4143 /*
4144 ** Return TRUE if the cursor is not pointing at an entry of the table.
4145 **
4146 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
4147 ** past the last entry in the table or sqlite3BtreePrev() moves past
4148 ** the first entry.  TRUE is also returned if the table is empty.
4149 */
4150 int sqlite3BtreeEof(BtCursor *pCur){
4151   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
4152   ** have been deleted? This API will need to change to return an error code
4153   ** as well as the boolean result value.
4154   */
4155   return (CURSOR_VALID!=pCur->eState);
4156 }
4157 
4158 /*
4159 ** Return the database connection handle for a cursor.
4160 */
4161 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
4162   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4163   return pCur->pBtree->db;
4164 }
4165 
4166 /*
4167 ** Advance the cursor to the next entry in the database.  If
4168 ** successful then set *pRes=0.  If the cursor
4169 ** was already pointing to the last entry in the database before
4170 ** this routine was called, then set *pRes=1.
4171 */
4172 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
4173   int rc;
4174   int idx;
4175   MemPage *pPage;
4176 
4177   assert( cursorHoldsMutex(pCur) );
4178   rc = restoreCursorPosition(pCur);
4179   if( rc!=SQLITE_OK ){
4180     return rc;
4181   }
4182   assert( pRes!=0 );
4183   if( CURSOR_INVALID==pCur->eState ){
4184     *pRes = 1;
4185     return SQLITE_OK;
4186   }
4187   if( pCur->skip>0 ){
4188     pCur->skip = 0;
4189     *pRes = 0;
4190     return SQLITE_OK;
4191   }
4192   pCur->skip = 0;
4193 
4194   pPage = pCur->apPage[pCur->iPage];
4195   idx = ++pCur->aiIdx[pCur->iPage];
4196   assert( pPage->isInit );
4197   assert( idx<=pPage->nCell );
4198 
4199   pCur->info.nSize = 0;
4200   pCur->validNKey = 0;
4201   if( idx>=pPage->nCell ){
4202     if( !pPage->leaf ){
4203       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
4204       if( rc ) return rc;
4205       rc = moveToLeftmost(pCur);
4206       *pRes = 0;
4207       return rc;
4208     }
4209     do{
4210       if( pCur->iPage==0 ){
4211         *pRes = 1;
4212         pCur->eState = CURSOR_INVALID;
4213         return SQLITE_OK;
4214       }
4215       sqlite3BtreeMoveToParent(pCur);
4216       pPage = pCur->apPage[pCur->iPage];
4217     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
4218     *pRes = 0;
4219     if( pPage->intKey ){
4220       rc = sqlite3BtreeNext(pCur, pRes);
4221     }else{
4222       rc = SQLITE_OK;
4223     }
4224     return rc;
4225   }
4226   *pRes = 0;
4227   if( pPage->leaf ){
4228     return SQLITE_OK;
4229   }
4230   rc = moveToLeftmost(pCur);
4231   return rc;
4232 }
4233 
4234 
4235 /*
4236 ** Step the cursor to the back to the previous entry in the database.  If
4237 ** successful then set *pRes=0.  If the cursor
4238 ** was already pointing to the first entry in the database before
4239 ** this routine was called, then set *pRes=1.
4240 */
4241 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
4242   int rc;
4243   MemPage *pPage;
4244 
4245   assert( cursorHoldsMutex(pCur) );
4246   rc = restoreCursorPosition(pCur);
4247   if( rc!=SQLITE_OK ){
4248     return rc;
4249   }
4250   pCur->atLast = 0;
4251   if( CURSOR_INVALID==pCur->eState ){
4252     *pRes = 1;
4253     return SQLITE_OK;
4254   }
4255   if( pCur->skip<0 ){
4256     pCur->skip = 0;
4257     *pRes = 0;
4258     return SQLITE_OK;
4259   }
4260   pCur->skip = 0;
4261 
4262   pPage = pCur->apPage[pCur->iPage];
4263   assert( pPage->isInit );
4264   if( !pPage->leaf ){
4265     int idx = pCur->aiIdx[pCur->iPage];
4266     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
4267     if( rc ){
4268       return rc;
4269     }
4270     rc = moveToRightmost(pCur);
4271   }else{
4272     while( pCur->aiIdx[pCur->iPage]==0 ){
4273       if( pCur->iPage==0 ){
4274         pCur->eState = CURSOR_INVALID;
4275         *pRes = 1;
4276         return SQLITE_OK;
4277       }
4278       sqlite3BtreeMoveToParent(pCur);
4279     }
4280     pCur->info.nSize = 0;
4281     pCur->validNKey = 0;
4282 
4283     pCur->aiIdx[pCur->iPage]--;
4284     pPage = pCur->apPage[pCur->iPage];
4285     if( pPage->intKey && !pPage->leaf ){
4286       rc = sqlite3BtreePrevious(pCur, pRes);
4287     }else{
4288       rc = SQLITE_OK;
4289     }
4290   }
4291   *pRes = 0;
4292   return rc;
4293 }
4294 
4295 /*
4296 ** Allocate a new page from the database file.
4297 **
4298 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
4299 ** has already been called on the new page.)  The new page has also
4300 ** been referenced and the calling routine is responsible for calling
4301 ** sqlite3PagerUnref() on the new page when it is done.
4302 **
4303 ** SQLITE_OK is returned on success.  Any other return value indicates
4304 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
4305 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
4306 **
4307 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4308 ** locate a page close to the page number "nearby".  This can be used in an
4309 ** attempt to keep related pages close to each other in the database file,
4310 ** which in turn can make database access faster.
4311 **
4312 ** If the "exact" parameter is not 0, and the page-number nearby exists
4313 ** anywhere on the free-list, then it is guaranteed to be returned. This
4314 ** is only used by auto-vacuum databases when allocating a new table.
4315 */
4316 static int allocateBtreePage(
4317   BtShared *pBt,
4318   MemPage **ppPage,
4319   Pgno *pPgno,
4320   Pgno nearby,
4321   u8 exact
4322 ){
4323   MemPage *pPage1;
4324   int rc;
4325   int n;     /* Number of pages on the freelist */
4326   int k;     /* Number of leaves on the trunk of the freelist */
4327   MemPage *pTrunk = 0;
4328   MemPage *pPrevTrunk = 0;
4329 
4330   assert( sqlite3_mutex_held(pBt->mutex) );
4331   pPage1 = pBt->pPage1;
4332   n = get4byte(&pPage1->aData[36]);
4333   if( n>0 ){
4334     /* There are pages on the freelist.  Reuse one of those pages. */
4335     Pgno iTrunk;
4336     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4337 
4338     /* If the 'exact' parameter was true and a query of the pointer-map
4339     ** shows that the page 'nearby' is somewhere on the free-list, then
4340     ** the entire-list will be searched for that page.
4341     */
4342 #ifndef SQLITE_OMIT_AUTOVACUUM
4343     if( exact && nearby<=pagerPagecount(pBt) ){
4344       u8 eType;
4345       assert( nearby>0 );
4346       assert( pBt->autoVacuum );
4347       rc = ptrmapGet(pBt, nearby, &eType, 0);
4348       if( rc ) return rc;
4349       if( eType==PTRMAP_FREEPAGE ){
4350         searchList = 1;
4351       }
4352       *pPgno = nearby;
4353     }
4354 #endif
4355 
4356     /* Decrement the free-list count by 1. Set iTrunk to the index of the
4357     ** first free-list trunk page. iPrevTrunk is initially 1.
4358     */
4359     rc = sqlite3PagerWrite(pPage1->pDbPage);
4360     if( rc ) return rc;
4361     put4byte(&pPage1->aData[36], n-1);
4362 
4363     /* The code within this loop is run only once if the 'searchList' variable
4364     ** is not true. Otherwise, it runs once for each trunk-page on the
4365     ** free-list until the page 'nearby' is located.
4366     */
4367     do {
4368       pPrevTrunk = pTrunk;
4369       if( pPrevTrunk ){
4370         iTrunk = get4byte(&pPrevTrunk->aData[0]);
4371       }else{
4372         iTrunk = get4byte(&pPage1->aData[32]);
4373       }
4374       rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
4375       if( rc ){
4376         pTrunk = 0;
4377         goto end_allocate_page;
4378       }
4379 
4380       k = get4byte(&pTrunk->aData[4]);
4381       if( k==0 && !searchList ){
4382         /* The trunk has no leaves and the list is not being searched.
4383         ** So extract the trunk page itself and use it as the newly
4384         ** allocated page */
4385         assert( pPrevTrunk==0 );
4386         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4387         if( rc ){
4388           goto end_allocate_page;
4389         }
4390         *pPgno = iTrunk;
4391         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4392         *ppPage = pTrunk;
4393         pTrunk = 0;
4394         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4395       }else if( k>pBt->usableSize/4 - 2 ){
4396         /* Value of k is out of range.  Database corruption */
4397         rc = SQLITE_CORRUPT_BKPT;
4398         goto end_allocate_page;
4399 #ifndef SQLITE_OMIT_AUTOVACUUM
4400       }else if( searchList && nearby==iTrunk ){
4401         /* The list is being searched and this trunk page is the page
4402         ** to allocate, regardless of whether it has leaves.
4403         */
4404         assert( *pPgno==iTrunk );
4405         *ppPage = pTrunk;
4406         searchList = 0;
4407         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4408         if( rc ){
4409           goto end_allocate_page;
4410         }
4411         if( k==0 ){
4412           if( !pPrevTrunk ){
4413             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4414           }else{
4415             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4416           }
4417         }else{
4418           /* The trunk page is required by the caller but it contains
4419           ** pointers to free-list leaves. The first leaf becomes a trunk
4420           ** page in this case.
4421           */
4422           MemPage *pNewTrunk;
4423           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
4424           rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
4425           if( rc!=SQLITE_OK ){
4426             goto end_allocate_page;
4427           }
4428           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
4429           if( rc!=SQLITE_OK ){
4430             releasePage(pNewTrunk);
4431             goto end_allocate_page;
4432           }
4433           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4434           put4byte(&pNewTrunk->aData[4], k-1);
4435           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
4436           releasePage(pNewTrunk);
4437           if( !pPrevTrunk ){
4438             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
4439             put4byte(&pPage1->aData[32], iNewTrunk);
4440           }else{
4441             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4442             if( rc ){
4443               goto end_allocate_page;
4444             }
4445             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4446           }
4447         }
4448         pTrunk = 0;
4449         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4450 #endif
4451       }else{
4452         /* Extract a leaf from the trunk */
4453         int closest;
4454         Pgno iPage;
4455         unsigned char *aData = pTrunk->aData;
4456         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4457         if( rc ){
4458           goto end_allocate_page;
4459         }
4460         if( nearby>0 ){
4461           int i, dist;
4462           closest = 0;
4463           dist = get4byte(&aData[8]) - nearby;
4464           if( dist<0 ) dist = -dist;
4465           for(i=1; i<k; i++){
4466             int d2 = get4byte(&aData[8+i*4]) - nearby;
4467             if( d2<0 ) d2 = -d2;
4468             if( d2<dist ){
4469               closest = i;
4470               dist = d2;
4471             }
4472           }
4473         }else{
4474           closest = 0;
4475         }
4476 
4477         iPage = get4byte(&aData[8+closest*4]);
4478         if( !searchList || iPage==nearby ){
4479           int noContent;
4480           Pgno nPage;
4481           *pPgno = iPage;
4482           nPage = pagerPagecount(pBt);
4483           if( *pPgno>nPage ){
4484             /* Free page off the end of the file */
4485             rc = SQLITE_CORRUPT_BKPT;
4486             goto end_allocate_page;
4487           }
4488           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4489                  ": %d more free pages\n",
4490                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
4491           if( closest<k-1 ){
4492             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4493           }
4494           put4byte(&aData[4], k-1);
4495           assert( sqlite3PagerIswriteable(pTrunk->pDbPage) );
4496           noContent = !btreeGetHasContent(pBt, *pPgno);
4497           rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, noContent);
4498           if( rc==SQLITE_OK ){
4499             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4500             if( rc!=SQLITE_OK ){
4501               releasePage(*ppPage);
4502             }
4503           }
4504           searchList = 0;
4505         }
4506       }
4507       releasePage(pPrevTrunk);
4508       pPrevTrunk = 0;
4509     }while( searchList );
4510   }else{
4511     /* There are no pages on the freelist, so create a new page at the
4512     ** end of the file */
4513     int nPage = pagerPagecount(pBt);
4514     *pPgno = nPage + 1;
4515 
4516     if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4517       (*pPgno)++;
4518     }
4519 
4520 #ifndef SQLITE_OMIT_AUTOVACUUM
4521     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
4522       /* If *pPgno refers to a pointer-map page, allocate two new pages
4523       ** at the end of the file instead of one. The first allocated page
4524       ** becomes a new pointer-map page, the second is used by the caller.
4525       */
4526       MemPage *pPg = 0;
4527       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
4528       assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4529       rc = sqlite3BtreeGetPage(pBt, *pPgno, &pPg, 0);
4530       if( rc==SQLITE_OK ){
4531         rc = sqlite3PagerWrite(pPg->pDbPage);
4532         releasePage(pPg);
4533       }
4534       if( rc ) return rc;
4535       (*pPgno)++;
4536       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
4537     }
4538 #endif
4539 
4540     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4541     rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
4542     if( rc ) return rc;
4543     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4544     if( rc!=SQLITE_OK ){
4545       releasePage(*ppPage);
4546     }
4547     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
4548   }
4549 
4550   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4551 
4552 end_allocate_page:
4553   releasePage(pTrunk);
4554   releasePage(pPrevTrunk);
4555   if( rc==SQLITE_OK ){
4556     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
4557       releasePage(*ppPage);
4558       return SQLITE_CORRUPT_BKPT;
4559     }
4560     (*ppPage)->isInit = 0;
4561   }
4562   return rc;
4563 }
4564 
4565 /*
4566 ** This function is used to add page iPage to the database file free-list.
4567 ** It is assumed that the page is not already a part of the free-list.
4568 **
4569 ** The value passed as the second argument to this function is optional.
4570 ** If the caller happens to have a pointer to the MemPage object
4571 ** corresponding to page iPage handy, it may pass it as the second value.
4572 ** Otherwise, it may pass NULL.
4573 **
4574 ** If a pointer to a MemPage object is passed as the second argument,
4575 ** its reference count is not altered by this function.
4576 */
4577 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
4578   MemPage *pTrunk = 0;                /* Free-list trunk page */
4579   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
4580   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
4581   MemPage *pPage;                     /* Page being freed. May be NULL. */
4582   int rc;                             /* Return Code */
4583   int nFree;                          /* Initial number of pages on free-list */
4584 
4585   assert( sqlite3_mutex_held(pBt->mutex) );
4586   assert( iPage>1 );
4587   assert( !pMemPage || pMemPage->pgno==iPage );
4588 
4589   if( pMemPage ){
4590     pPage = pMemPage;
4591     sqlite3PagerRef(pPage->pDbPage);
4592   }else{
4593     pPage = btreePageLookup(pBt, iPage);
4594   }
4595 
4596   /* Increment the free page count on pPage1 */
4597   rc = sqlite3PagerWrite(pPage1->pDbPage);
4598   if( rc ) goto freepage_out;
4599   nFree = get4byte(&pPage1->aData[36]);
4600   put4byte(&pPage1->aData[36], nFree+1);
4601 
4602 #ifdef SQLITE_SECURE_DELETE
4603   /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4604   ** always fully overwrite deleted information with zeros.
4605   */
4606   if( (!pPage && (rc = sqlite3BtreeGetPage(pBt, iPage, &pPage, 0)))
4607    ||            (rc = sqlite3PagerWrite(pPage->pDbPage))
4608   ){
4609     goto freepage_out;
4610   }
4611   memset(pPage->aData, 0, pPage->pBt->pageSize);
4612 #endif
4613 
4614   /* If the database supports auto-vacuum, write an entry in the pointer-map
4615   ** to indicate that the page is free.
4616   */
4617   if( ISAUTOVACUUM ){
4618     rc = ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0);
4619     if( rc ) goto freepage_out;
4620   }
4621 
4622   /* Now manipulate the actual database free-list structure. There are two
4623   ** possibilities. If the free-list is currently empty, or if the first
4624   ** trunk page in the free-list is full, then this page will become a
4625   ** new free-list trunk page. Otherwise, it will become a leaf of the
4626   ** first trunk page in the current free-list. This block tests if it
4627   ** is possible to add the page as a new free-list leaf.
4628   */
4629   if( nFree!=0 ){
4630     int nLeaf;                /* Initial number of leaf cells on trunk page */
4631 
4632     iTrunk = get4byte(&pPage1->aData[32]);
4633     rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
4634     if( rc!=SQLITE_OK ){
4635       goto freepage_out;
4636     }
4637 
4638     nLeaf = get4byte(&pTrunk->aData[4]);
4639     if( nLeaf<0 ){
4640       rc = SQLITE_CORRUPT_BKPT;
4641       goto freepage_out;
4642     }
4643     if( nLeaf<pBt->usableSize/4 - 8 ){
4644       /* In this case there is room on the trunk page to insert the page
4645       ** being freed as a new leaf.
4646       **
4647       ** Note that the trunk page is not really full until it contains
4648       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
4649       ** coded.  But due to a coding error in versions of SQLite prior to
4650       ** 3.6.0, databases with freelist trunk pages holding more than
4651       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
4652       ** to maintain backwards compatibility with older versions of SQLite,
4653       ** we will contain to restrict the number of entries to usableSize/4 - 8
4654       ** for now.  At some point in the future (once everyone has upgraded
4655       ** to 3.6.0 or later) we should consider fixing the conditional above
4656       ** to read "usableSize/4-2" instead of "usableSize/4-8".
4657       */
4658       rc = sqlite3PagerWrite(pTrunk->pDbPage);
4659       if( rc==SQLITE_OK ){
4660         put4byte(&pTrunk->aData[4], nLeaf+1);
4661         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
4662 #ifndef SQLITE_SECURE_DELETE
4663         if( pPage ){
4664           sqlite3PagerDontWrite(pPage->pDbPage);
4665         }
4666 #endif
4667         rc = btreeSetHasContent(pBt, iPage);
4668       }
4669       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
4670       goto freepage_out;
4671     }
4672   }
4673 
4674   /* If control flows to this point, then it was not possible to add the
4675   ** the page being freed as a leaf page of the first trunk in the free-list.
4676   ** Possibly because the free-list is empty, or possibly because the
4677   ** first trunk in the free-list is full. Either way, the page being freed
4678   ** will become the new first trunk page in the free-list.
4679   */
4680   if(   ((!pPage) && (0 != (rc = sqlite3BtreeGetPage(pBt, iPage, &pPage, 0))))
4681      || (0 != (rc = sqlite3PagerWrite(pPage->pDbPage)))
4682   ){
4683     goto freepage_out;
4684   }
4685   put4byte(pPage->aData, iTrunk);
4686   put4byte(&pPage->aData[4], 0);
4687   put4byte(&pPage1->aData[32], iPage);
4688   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
4689 
4690 freepage_out:
4691   if( pPage ){
4692     pPage->isInit = 0;
4693   }
4694   releasePage(pPage);
4695   releasePage(pTrunk);
4696   return rc;
4697 }
4698 static int freePage(MemPage *pPage){
4699   return freePage2(pPage->pBt, pPage, pPage->pgno);
4700 }
4701 
4702 /*
4703 ** Free any overflow pages associated with the given Cell.
4704 */
4705 static int clearCell(MemPage *pPage, unsigned char *pCell){
4706   BtShared *pBt = pPage->pBt;
4707   CellInfo info;
4708   Pgno ovflPgno;
4709   int rc;
4710   int nOvfl;
4711   u16 ovflPageSize;
4712 
4713   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4714   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4715   if( info.iOverflow==0 ){
4716     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
4717   }
4718   ovflPgno = get4byte(&pCell[info.iOverflow]);
4719   assert( pBt->usableSize > 4 );
4720   ovflPageSize = pBt->usableSize - 4;
4721   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
4722   assert( ovflPgno==0 || nOvfl>0 );
4723   while( nOvfl-- ){
4724     Pgno iNext = 0;
4725     MemPage *pOvfl = 0;
4726     if( ovflPgno<2 || ovflPgno>pagerPagecount(pBt) ){
4727       /* 0 is not a legal page number and page 1 cannot be an
4728       ** overflow page. Therefore if ovflPgno<2 or past the end of the
4729       ** file the database must be corrupt. */
4730       return SQLITE_CORRUPT_BKPT;
4731     }
4732     if( nOvfl ){
4733       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
4734       if( rc ) return rc;
4735     }
4736     rc = freePage2(pBt, pOvfl, ovflPgno);
4737     if( pOvfl ){
4738       sqlite3PagerUnref(pOvfl->pDbPage);
4739     }
4740     if( rc ) return rc;
4741     ovflPgno = iNext;
4742   }
4743   return SQLITE_OK;
4744 }
4745 
4746 /*
4747 ** Create the byte sequence used to represent a cell on page pPage
4748 ** and write that byte sequence into pCell[].  Overflow pages are
4749 ** allocated and filled in as necessary.  The calling procedure
4750 ** is responsible for making sure sufficient space has been allocated
4751 ** for pCell[].
4752 **
4753 ** Note that pCell does not necessary need to point to the pPage->aData
4754 ** area.  pCell might point to some temporary storage.  The cell will
4755 ** be constructed in this temporary area then copied into pPage->aData
4756 ** later.
4757 */
4758 static int fillInCell(
4759   MemPage *pPage,                /* The page that contains the cell */
4760   unsigned char *pCell,          /* Complete text of the cell */
4761   const void *pKey, i64 nKey,    /* The key */
4762   const void *pData,int nData,   /* The data */
4763   int nZero,                     /* Extra zero bytes to append to pData */
4764   int *pnSize                    /* Write cell size here */
4765 ){
4766   int nPayload;
4767   const u8 *pSrc;
4768   int nSrc, n, rc;
4769   int spaceLeft;
4770   MemPage *pOvfl = 0;
4771   MemPage *pToRelease = 0;
4772   unsigned char *pPrior;
4773   unsigned char *pPayload;
4774   BtShared *pBt = pPage->pBt;
4775   Pgno pgnoOvfl = 0;
4776   int nHeader;
4777   CellInfo info;
4778 
4779   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4780 
4781   /* pPage is not necessarily writeable since pCell might be auxiliary
4782   ** buffer space that is separate from the pPage buffer area */
4783   assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
4784             || sqlite3PagerIswriteable(pPage->pDbPage) );
4785 
4786   /* Fill in the header. */
4787   nHeader = 0;
4788   if( !pPage->leaf ){
4789     nHeader += 4;
4790   }
4791   if( pPage->hasData ){
4792     nHeader += putVarint(&pCell[nHeader], nData+nZero);
4793   }else{
4794     nData = nZero = 0;
4795   }
4796   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
4797   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4798   assert( info.nHeader==nHeader );
4799   assert( info.nKey==nKey );
4800   assert( info.nData==(u32)(nData+nZero) );
4801 
4802   /* Fill in the payload */
4803   nPayload = nData + nZero;
4804   if( pPage->intKey ){
4805     pSrc = pData;
4806     nSrc = nData;
4807     nData = 0;
4808   }else{
4809     if( nKey>0x7fffffff || pKey==0 ){
4810       return SQLITE_CORRUPT;
4811     }
4812     nPayload += (int)nKey;
4813     pSrc = pKey;
4814     nSrc = (int)nKey;
4815   }
4816   *pnSize = info.nSize;
4817   spaceLeft = info.nLocal;
4818   pPayload = &pCell[nHeader];
4819   pPrior = &pCell[info.iOverflow];
4820 
4821   while( nPayload>0 ){
4822     if( spaceLeft==0 ){
4823 #ifndef SQLITE_OMIT_AUTOVACUUM
4824       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
4825       if( pBt->autoVacuum ){
4826         do{
4827           pgnoOvfl++;
4828         } while(
4829           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
4830         );
4831       }
4832 #endif
4833       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
4834 #ifndef SQLITE_OMIT_AUTOVACUUM
4835       /* If the database supports auto-vacuum, and the second or subsequent
4836       ** overflow page is being allocated, add an entry to the pointer-map
4837       ** for that page now.
4838       **
4839       ** If this is the first overflow page, then write a partial entry
4840       ** to the pointer-map. If we write nothing to this pointer-map slot,
4841       ** then the optimistic overflow chain processing in clearCell()
4842       ** may misinterpret the uninitialised values and delete the
4843       ** wrong pages from the database.
4844       */
4845       if( pBt->autoVacuum && rc==SQLITE_OK ){
4846         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
4847         rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
4848         if( rc ){
4849           releasePage(pOvfl);
4850         }
4851       }
4852 #endif
4853       if( rc ){
4854         releasePage(pToRelease);
4855         return rc;
4856       }
4857 
4858       /* If pToRelease is not zero than pPrior points into the data area
4859       ** of pToRelease.  Make sure pToRelease is still writeable. */
4860       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
4861 
4862       /* If pPrior is part of the data area of pPage, then make sure pPage
4863       ** is still writeable */
4864       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
4865             || sqlite3PagerIswriteable(pPage->pDbPage) );
4866 
4867       put4byte(pPrior, pgnoOvfl);
4868       releasePage(pToRelease);
4869       pToRelease = pOvfl;
4870       pPrior = pOvfl->aData;
4871       put4byte(pPrior, 0);
4872       pPayload = &pOvfl->aData[4];
4873       spaceLeft = pBt->usableSize - 4;
4874     }
4875     n = nPayload;
4876     if( n>spaceLeft ) n = spaceLeft;
4877 
4878     /* If pToRelease is not zero than pPayload points into the data area
4879     ** of pToRelease.  Make sure pToRelease is still writeable. */
4880     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
4881 
4882     /* If pPayload is part of the data area of pPage, then make sure pPage
4883     ** is still writeable */
4884     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
4885             || sqlite3PagerIswriteable(pPage->pDbPage) );
4886 
4887     if( nSrc>0 ){
4888       if( n>nSrc ) n = nSrc;
4889       assert( pSrc );
4890       memcpy(pPayload, pSrc, n);
4891     }else{
4892       memset(pPayload, 0, n);
4893     }
4894     nPayload -= n;
4895     pPayload += n;
4896     pSrc += n;
4897     nSrc -= n;
4898     spaceLeft -= n;
4899     if( nSrc==0 ){
4900       nSrc = nData;
4901       pSrc = pData;
4902     }
4903   }
4904   releasePage(pToRelease);
4905   return SQLITE_OK;
4906 }
4907 
4908 /*
4909 ** Remove the i-th cell from pPage.  This routine effects pPage only.
4910 ** The cell content is not freed or deallocated.  It is assumed that
4911 ** the cell content has been copied someplace else.  This routine just
4912 ** removes the reference to the cell from pPage.
4913 **
4914 ** "sz" must be the number of bytes in the cell.
4915 */
4916 static int dropCell(MemPage *pPage, int idx, int sz){
4917   int i;          /* Loop counter */
4918   int pc;         /* Offset to cell content of cell being deleted */
4919   u8 *data;       /* pPage->aData */
4920   u8 *ptr;        /* Used to move bytes around within data[] */
4921   int rc;         /* The return code */
4922 
4923   assert( idx>=0 && idx<pPage->nCell );
4924   assert( sz==cellSize(pPage, idx) );
4925   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4926   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4927   data = pPage->aData;
4928   ptr = &data[pPage->cellOffset + 2*idx];
4929   pc = get2byte(ptr);
4930   if( (pc<pPage->hdrOffset+6+(pPage->leaf?0:4))
4931      || (pc+sz>pPage->pBt->usableSize) ){
4932     return SQLITE_CORRUPT_BKPT;
4933   }
4934   rc = freeSpace(pPage, pc, sz);
4935   if( rc!=SQLITE_OK ){
4936     return rc;
4937   }
4938   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4939     ptr[0] = ptr[2];
4940     ptr[1] = ptr[3];
4941   }
4942   pPage->nCell--;
4943   put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4944   pPage->nFree += 2;
4945   return SQLITE_OK;
4946 }
4947 
4948 /*
4949 ** Insert a new cell on pPage at cell index "i".  pCell points to the
4950 ** content of the cell.
4951 **
4952 ** If the cell content will fit on the page, then put it there.  If it
4953 ** will not fit, then make a copy of the cell content into pTemp if
4954 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
4955 ** in pPage->aOvfl[] and make it point to the cell content (either
4956 ** in pTemp or the original pCell) and also record its index.
4957 ** Allocating a new entry in pPage->aCell[] implies that
4958 ** pPage->nOverflow is incremented.
4959 **
4960 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4961 ** cell. The caller will overwrite them after this function returns. If
4962 ** nSkip is non-zero, then pCell may not point to an invalid memory location
4963 ** (but pCell+nSkip is always valid).
4964 */
4965 static int insertCell(
4966   MemPage *pPage,   /* Page into which we are copying */
4967   int i,            /* New cell becomes the i-th cell of the page */
4968   u8 *pCell,        /* Content of the new cell */
4969   int sz,           /* Bytes of content in pCell */
4970   u8 *pTemp,        /* Temp storage space for pCell, if needed */
4971   u8 nSkip          /* Do not write the first nSkip bytes of the cell */
4972 ){
4973   int idx;          /* Where to write new cell content in data[] */
4974   int j;            /* Loop counter */
4975   int top;          /* First byte of content for any cell in data[] */
4976   int end;          /* First byte past the last cell pointer in data[] */
4977   int ins;          /* Index in data[] where new cell pointer is inserted */
4978   int hdr;          /* Offset into data[] of the page header */
4979   int cellOffset;   /* Address of first cell pointer in data[] */
4980   u8 *data;         /* The content of the whole page */
4981   u8 *ptr;          /* Used for moving information around in data[] */
4982 
4983   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
4984   assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 );
4985   assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) );
4986   assert( sz==cellSizePtr(pPage, pCell) );
4987   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4988   if( pPage->nOverflow || sz+2>pPage->nFree ){
4989     if( pTemp ){
4990       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
4991       pCell = pTemp;
4992     }
4993     j = pPage->nOverflow++;
4994     assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) );
4995     pPage->aOvfl[j].pCell = pCell;
4996     pPage->aOvfl[j].idx = (u16)i;
4997     pPage->nFree = 0;
4998   }else{
4999     int rc = sqlite3PagerWrite(pPage->pDbPage);
5000     if( rc!=SQLITE_OK ){
5001       return rc;
5002     }
5003     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5004     data = pPage->aData;
5005     hdr = pPage->hdrOffset;
5006     top = get2byte(&data[hdr+5]);
5007     cellOffset = pPage->cellOffset;
5008     end = cellOffset + 2*pPage->nCell + 2;
5009     ins = cellOffset + 2*i;
5010     if( end > top - sz ){
5011       rc = defragmentPage(pPage);
5012       if( rc!=SQLITE_OK ){
5013         return rc;
5014       }
5015       top = get2byte(&data[hdr+5]);
5016       assert( end + sz <= top );
5017     }
5018     idx = allocateSpace(pPage, sz);
5019     assert( idx>0 );
5020     assert( end <= get2byte(&data[hdr+5]) );
5021     if (idx+sz > pPage->pBt->usableSize) {
5022       return SQLITE_CORRUPT_BKPT;
5023     }
5024     pPage->nCell++;
5025     pPage->nFree -= 2;
5026     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
5027     for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
5028       ptr[0] = ptr[-2];
5029       ptr[1] = ptr[-1];
5030     }
5031     put2byte(&data[ins], idx);
5032     put2byte(&data[hdr+3], pPage->nCell);
5033 #ifndef SQLITE_OMIT_AUTOVACUUM
5034     if( pPage->pBt->autoVacuum ){
5035       /* The cell may contain a pointer to an overflow page. If so, write
5036       ** the entry for the overflow page into the pointer map.
5037       */
5038       CellInfo info;
5039       sqlite3BtreeParseCellPtr(pPage, pCell, &info);
5040       assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
5041       if( info.iOverflow ){
5042         Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
5043         rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
5044         if( rc!=SQLITE_OK ) return rc;
5045       }
5046     }
5047 #endif
5048   }
5049 
5050   return SQLITE_OK;
5051 }
5052 
5053 /*
5054 ** Add a list of cells to a page.  The page should be initially empty.
5055 ** The cells are guaranteed to fit on the page.
5056 */
5057 static void assemblePage(
5058   MemPage *pPage,   /* The page to be assemblied */
5059   int nCell,        /* The number of cells to add to this page */
5060   u8 **apCell,      /* Pointers to cell bodies */
5061   u16 *aSize        /* Sizes of the cells */
5062 ){
5063   int i;            /* Loop counter */
5064   u8 *pCellptr;     /* Address of next cell pointer */
5065   int cellbody;     /* Address of next cell body */
5066   u8 * const data = pPage->aData;             /* Pointer to data for pPage */
5067   const int hdr = pPage->hdrOffset;           /* Offset of header on pPage */
5068   const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
5069 
5070   assert( pPage->nOverflow==0 );
5071   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5072   assert( nCell>=0 && nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 );
5073   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5074 
5075   /* Check that the page has just been zeroed by zeroPage() */
5076   assert( pPage->nCell==0 );
5077   assert( get2byte(&data[hdr+5])==nUsable );
5078 
5079   pCellptr = &data[pPage->cellOffset + nCell*2];
5080   cellbody = nUsable;
5081   for(i=nCell-1; i>=0; i--){
5082     pCellptr -= 2;
5083     cellbody -= aSize[i];
5084     put2byte(pCellptr, cellbody);
5085     memcpy(&data[cellbody], apCell[i], aSize[i]);
5086   }
5087   put2byte(&data[hdr+3], nCell);
5088   put2byte(&data[hdr+5], cellbody);
5089   pPage->nFree -= (nCell*2 + nUsable - cellbody);
5090   pPage->nCell = (u16)nCell;
5091 }
5092 
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The current value of NN (1) appears to give the best results overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (NN*2+1)      /* Total pages involved in the balance */

/* Forward reference.  balance_quick() below calls balance(), so it must
** be declared here. */
static int balance(BtCursor*, int);
5110 
5111 #ifndef SQLITE_OMIT_QUICKBALANCE
5112 /*
5113 ** This version of balance() handles the common special case where
5114 ** a new entry is being inserted on the extreme right-end of the
5115 ** tree, in other words, when the new entry will become the largest
5116 ** entry in the tree.
5117 **
5118 ** Instead of trying balance the 3 right-most leaf pages, just add
5119 ** a new page to the right-hand side and put the one new entry in
5120 ** that page.  This leaves the right side of the tree somewhat
5121 ** unbalanced.  But odds are that we will be inserting new entries
5122 ** at the end soon afterwards so the nearly empty page will quickly
5123 ** fill up.  On average.
5124 **
5125 ** pPage is the leaf page which is the right-most page in the tree.
5126 ** pParent is its parent.  pPage must have a single overflow entry
5127 ** which is also the right-most entry on the page.
5128 */
5129 static int balance_quick(BtCursor *pCur){
5130   int rc;
5131   MemPage *pNew = 0;
5132   Pgno pgnoNew;
5133   u8 *pCell;
5134   u16 szCell;
5135   CellInfo info;
5136   MemPage *pPage = pCur->apPage[pCur->iPage];
5137   MemPage *pParent = pCur->apPage[pCur->iPage-1];
5138   BtShared *pBt = pPage->pBt;
5139   int parentIdx = pParent->nCell;   /* pParent new divider cell index */
5140   int parentSize;                   /* Size of new divider cell */
5141   u8 parentCell[64];                /* Space for the new divider cell */
5142 
5143   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5144 
5145   /* Allocate a new page. Insert the overflow cell from pPage
5146   ** into it. Then remove the overflow cell from pPage.
5147   */
5148   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
5149   if( rc==SQLITE_OK ){
5150     pCell = pPage->aOvfl[0].pCell;
5151     szCell = cellSizePtr(pPage, pCell);
5152     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
5153     zeroPage(pNew, pPage->aData[0]);
5154     assemblePage(pNew, 1, &pCell, &szCell);
5155     pPage->nOverflow = 0;
5156 
5157     /* pPage is currently the right-child of pParent. Change this
5158     ** so that the right-child is the new page allocated above and
5159     ** pPage is the next-to-right child.
5160     **
5161     ** Ignore the return value of the call to fillInCell(). fillInCell()
5162     ** may only return other than SQLITE_OK if it is required to allocate
5163     ** one or more overflow pages. Since an internal table B-Tree cell
5164     ** may never spill over onto an overflow page (it is a maximum of
5165     ** 13 bytes in size), it is not neccessary to check the return code.
5166     **
5167     ** Similarly, the insertCell() function cannot fail if the page
5168     ** being inserted into is already writable and the cell does not
5169     ** contain an overflow pointer. So ignore this return code too.
5170     */
5171     assert( pPage->nCell>0 );
5172     pCell = findCell(pPage, pPage->nCell-1);
5173     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
5174     fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
5175     assert( parentSize<64 );
5176     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5177     insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
5178     put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
5179     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
5180 
5181     /* If this is an auto-vacuum database, update the pointer map
5182     ** with entries for the new page, and any pointer from the
5183     ** cell on the page to an overflow page.
5184     */
5185     if( ISAUTOVACUUM ){
5186       rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
5187       if( rc==SQLITE_OK ){
5188         rc = ptrmapPutOvfl(pNew, 0);
5189       }
5190     }
5191 
5192     /* Release the reference to the new page. */
5193     releasePage(pNew);
5194   }
5195 
5196   /* At this point the pPage->nFree variable is not set correctly with
5197   ** respect to the content of the page (because it was set to 0 by
5198   ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
5199   ** correct.
5200   **
5201   ** This has to be done even if an error will be returned. Normally, if
5202   ** an error occurs during tree balancing, the contents of MemPage are
5203   ** not important, as they will be recalculated when the page is rolled
5204   ** back. But here, in balance_quick(), it is possible that pPage has
5205   ** not yet been marked dirty or written into the journal file. Therefore
5206   ** it will not be rolled back and so it is important to make sure that
5207   ** the page data and contents of MemPage are consistent.
5208   */
5209   pPage->isInit = 0;
5210   sqlite3BtreeInitPage(pPage);
5211   assert( pPage->nOverflow==0 );
5212 
5213   /* If everything else succeeded, balance the parent page, in
5214   ** case the divider cell inserted caused it to become overfull.
5215   */
5216   if( rc==SQLITE_OK ){
5217     releasePage(pPage);
5218     pCur->iPage--;
5219     rc = balance(pCur, 0);
5220   }
5221   return rc;
5222 }
5223 #endif /* SQLITE_OMIT_QUICKBALANCE */
5224 
/*
** This routine redistributes Cells on pPage and up to NN*2 siblings
** of pPage so that all pages have about the same amount of free space.
** Usually NN siblings on either side of pPage are used in the balancing,
** though more siblings might come from one side if pPage is the first
** or last child of its parent.  If pPage has fewer than 2*NN siblings
** (something which can only happen if pPage is the root page or a
** child of root) then all available siblings participate in the balancing.
**
** The number of siblings of pPage might be increased or decreased by one or
** two in an effort to keep pages nearly full but not over full. The root page
** is special and is allowed to be nearly empty. If pPage is
** the root page, then the depth of the tree might be increased
** or decreased by one, as necessary, to keep the root page from being
** overfull or completely empty.
**
** Note that when this routine is called, some of the Cells on pPage
** might not actually be stored in pPage->aData[].  This can happen
** if the page is overfull.  Part of the job of this routine is to
** make sure all Cells for pPage once again fit in pPage->aData[].
**
** In the course of balancing the siblings of pPage, the parent of pPage
** might become overfull or underfull.  If that happens, then this routine
** is called recursively on the parent.
**
** If this routine fails for any reason, it might leave the database
** in a corrupted state.  So if this routine fails, the database should
** be rolled back.
**
** Parameters:
**
**   pCur   Cursor identifying the page to balance.  The page is
**          pCur->apPage[pCur->iPage] and its parent is
**          pCur->apPage[pCur->iPage-1], so pCur->iPage must be greater
**          than zero on entry.
**
** Return value:
**
**   SQLITE_OK on success, SQLITE_NOMEM if the scratch buffer or the
**   aSpace2 page buffer cannot be allocated, or the error code of the
**   first subroutine that fails.  When the quick-balance conditions hold
**   the result of balance_quick() is returned directly.  On the normal
**   successful path the reference to pPage is released, pCur->iPage is
**   decremented and the result of balance() on the parent is returned.
*/
static int balance_nonroot(BtCursor *pCur){
  MemPage *pPage;              /* The over or underfull page to balance */
  MemPage *pParent;            /* The parent of pPage */
  BtShared *pBt;               /* The whole database */
  int nCell = 0;               /* Number of cells in apCell[] */
  int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
  int nOld = 0;                /* Number of pages in apOld[] */
  int nNew = 0;                /* Number of pages in apNew[] */
  int nDiv;                    /* Number of cells in apDiv[] */
  int i, j, k;                 /* Loop counters */
  int idx;                     /* Index of pPage in pParent->aCell[] */
  int nxDiv;                   /* Next divider slot in pParent->aCell[] */
  int rc;                      /* The return code */
  int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
  int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
  int usableSpace;             /* Bytes in pPage beyond the header */
  int pageFlags;               /* Value of pPage->aData[0] */
  int subtotal;                /* Subtotal of bytes in cells on one page */
  int iSpace1 = 0;             /* First unused byte of aSpace1[] */
  int iSpace2 = 0;             /* First unused byte of aSpace2[] */
  int szScratch;               /* Size of scratch memory requested */
  MemPage *apOld[NB];          /* pPage and up to two siblings */
  Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
  MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
  MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
  u8 *apDiv[NB];               /* Divider cells in pParent */
  int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
  int szNew[NB+2];             /* Combined size of cells placed on i-th page */
  u8 **apCell = 0;             /* All cells being balanced */
  u16 *szCell;                 /* Local size of all cells in apCell[] */
  u8 *aCopy[NB];         /* Space for holding data of apCopy[] */
  u8 *aSpace1;           /* Space for copies of divider cells before balance */
  u8 *aSpace2 = 0;       /* Space for overflow divider cells after balance */
  u8 *aFrom = 0;         /* Autovacuum only: for each cell in apCell[], the
                         ** index in apCopy[] of the page it came from, or
                         ** 0xFF for overflow cells and divider cells */

  pPage = pCur->apPage[pCur->iPage];
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  VVA_ONLY( pCur->pagesShuffled = 1 );

  /*
  ** Find the parent page.
  */
  assert( pCur->iPage>0 );
  assert( pPage->isInit );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
  pBt = pPage->pBt;
  pParent = pCur->apPage[pCur->iPage-1];
  assert( pParent );
  if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
    goto balance_cleanup;
  }

  TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));

#ifndef SQLITE_OMIT_QUICKBALANCE
  /*
  ** A special case:  If a new entry has just been inserted into a
  ** table (that is, a btree with integer keys and all data at the leaves)
  ** and the new entry is the right-most entry in the tree (it has the
  ** largest key) then use the special balance_quick() routine for
  ** balancing.  balance_quick() is much faster and results in a tighter
  ** packing of data in the common case.
  **
  ** Nothing has been allocated at this point, so returning directly
  ** (without passing through balance_cleanup) leaks nothing.
  */
  if( pPage->leaf &&
      pPage->intKey &&
      pPage->nOverflow==1 &&
      pPage->aOvfl[0].idx==pPage->nCell &&
      pParent->pgno!=1 &&
      get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
  ){
    assert( pPage->intKey );
    /*
    ** TODO: Check the siblings to the left of pPage. It may be that
    ** they are not full and no new page is required.
    */
    return balance_quick(pCur);
  }
#endif

  if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
    goto balance_cleanup;
  }

  /*
  ** Find the cell in the parent page whose left child points back
  ** to pPage.  The "idx" variable is the index of that cell.  If pPage
  ** is the rightmost child of pParent then set idx to pParent->nCell
  */
  idx = pCur->aiIdx[pCur->iPage-1];
  assertParentIndex(pParent, idx, pPage->pgno);

  /*
  ** Find sibling pages to pPage and the cells in pParent that divide
  ** the siblings.  An attempt is made to find NN siblings on either
  ** side of pPage.  More siblings are taken from one side, however, if
  ** there are fewer than NN siblings on the other side.  If pParent
  ** has NB or fewer children then all children of pParent are taken.
  **
  ** nxDiv starts NN slots to the left of idx.  It is then moved left if
  ** a window of NB children would extend past the right-most child, and
  ** finally clamped so that it is never negative.
  */
  nxDiv = idx - NN;
  if( nxDiv + NB > pParent->nCell ){
    nxDiv = pParent->nCell - NB + 1;
  }
  if( nxDiv<0 ){
    nxDiv = 0;
  }
  nDiv = 0;
  for(i=0, k=nxDiv; i<NB; i++, k++){
    if( k<pParent->nCell ){
      /* Child k is the left-child pointer stored at the start of
      ** divider cell k of the parent. */
      apDiv[i] = findCell(pParent, k);
      nDiv++;
      assert( !pParent->leaf );
      pgnoOld[i] = get4byte(apDiv[i]);
    }else if( k==pParent->nCell ){
      /* The last child is the right-child pointer in the parent's
      ** page header; it has no divider cell of its own. */
      pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
    }else{
      break;
    }
    rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
    if( rc ) goto balance_cleanup;
    /* apOld[i]->idxParent = k; */
    apCopy[i] = 0;
    assert( i==nOld );
    nOld++;
    /* One slot per cell on the sibling (including overflow cells) plus
    ** one more for a possible divider cell. */
    nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
  }

  /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
  ** alignment */
  nMaxCells = (nMaxCells + 3)&~3;

  /*
  ** Allocate space for memory structures.  A single scratch allocation
  ** is carved up, in this order, into: apCell[], szCell[], NB slots each
  ** holding a MemPage header followed by one page of data (aCopy[]),
  ** aSpace1[], and (autovacuum only) aFrom[].  aSpace2 is a separate
  ** page-sized allocation.
  */
  szScratch =
       nMaxCells*sizeof(u8*)                       /* apCell */
     + nMaxCells*sizeof(u16)                       /* szCell */
     + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB  /* aCopy */
     + pBt->pageSize                               /* aSpace1 */
     + (ISAUTOVACUUM ? nMaxCells : 0);             /* aFrom */
  apCell = sqlite3ScratchMalloc( szScratch );
  if( apCell==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }
  szCell = (u16*)&apCell[nMaxCells];
  aCopy[0] = (u8*)&szCell[nMaxCells];
  assert( EIGHT_BYTE_ALIGNMENT(aCopy[0]) );
  for(i=1; i<NB; i++){
    aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
    assert( ((aCopy[i] - (u8*)0) & 7)==0 ); /* 8-byte alignment required */
  }
  aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
  assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
  if( ISAUTOVACUUM ){
    aFrom = &aSpace1[pBt->pageSize];
  }
  aSpace2 = sqlite3PageMalloc(pBt->pageSize);
  if( aSpace2==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }

  /*
  ** Make copies of the content of pPage and its siblings into apCopy[].
  ** The rest of this function will use data from the copies rather
  ** than the original pages since the original pages will be in the
  ** process of being overwritten.
  **
  ** Each copy is a MemPage structure immediately followed by the page
  ** image, so p->aData is redirected to the bytes just past the struct.
  */
  for(i=0; i<nOld; i++){
    MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
    memcpy(p, apOld[i], sizeof(MemPage));
    p->aData = (void*)&p[1];
    memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
  }

  /*
  ** Load pointers to all cells on sibling pages and the divider cells
  ** into the local apCell[] array.  Make copies of the divider cells
  ** into space obtained from aSpace1[] and remove the divider Cells
  ** from pParent.
  **
  ** If the siblings are on leaf pages, then the child pointers of the
  ** divider cells are stripped from the cells before they are copied
  ** into aSpace1[].  In this way, all cells in apCell[] are without
  ** child pointers.  If siblings are not leaves, then all cell in
  ** apCell[] include child pointers.  Either way, all cells in apCell[]
  ** are alike.
  **
  ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
  **       leafData:  1 if pPage holds key+data and pParent holds only keys.
  */
  nCell = 0;
  leafCorrection = pPage->leaf*4;
  leafData = pPage->hasData;
  for(i=0; i<nOld; i++){
    MemPage *pOld = apCopy[i];
    int limit = pOld->nCell+pOld->nOverflow;
    for(j=0; j<limit; j++){
      assert( nCell<nMaxCells );
      apCell[nCell] = findOverflowCell(pOld, j);
      szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
      if( ISAUTOVACUUM ){
        /* Record which sibling the cell came from.  A cell that is one
        ** of the page's overflow cells is tagged 0xFF instead. */
        int a;
        aFrom[nCell] = (u8)i;   assert( i>=0 && i<6 );
        for(a=0; a<pOld->nOverflow; a++){
          if( pOld->aOvfl[a].pCell==apCell[nCell] ){
            aFrom[nCell] = 0xFF;
            break;
          }
        }
      }
      nCell++;
    }
    if( i<nOld-1 ){
      /* Every dropCell() below uses index nxDiv: removing a divider
      ** shifts the following dividers down into that same slot. */
      u16 sz = cellSizePtr(pParent, apDiv[i]);
      if( leafData ){
        /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
        ** are duplicates of keys on the child pages.  We need to remove
        ** the divider cells from pParent, but the dividers cells are not
        ** added to apCell[] because they are duplicates of child cells.
        */
        dropCell(pParent, nxDiv, sz);
      }else{
        u8 *pTemp;
        assert( nCell<nMaxCells );
        szCell[nCell] = sz;
        pTemp = &aSpace1[iSpace1];
        iSpace1 += sz;
        assert( sz<=pBt->pageSize/4 );
        assert( iSpace1<=pBt->pageSize );
        memcpy(pTemp, apDiv[i], sz);
        apCell[nCell] = pTemp+leafCorrection;
        if( ISAUTOVACUUM ){
          aFrom[nCell] = 0xFF;
        }
        dropCell(pParent, nxDiv, sz);
        assert( leafCorrection==0 || leafCorrection==4 );
        szCell[nCell] -= (u16)leafCorrection;
        assert( get4byte(pTemp)==pgnoOld[i] );
        if( !pOld->leaf ){
          assert( leafCorrection==0 );
          /* The right pointer of the child page pOld becomes the left
          ** pointer of the divider cell */
          memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
        }else{
          assert( leafCorrection==4 );
          if( szCell[nCell]<4 ){
            /* Do not allow any cells smaller than 4 bytes. */
            szCell[nCell] = 4;
          }
        }
        nCell++;
      }
    }
  }

  /*
  ** Figure out the number of pages needed to hold all nCell cells.
  ** Store this number in "k".  Also compute szNew[] which is the total
  ** size of all cells on the i-th page and cntNew[] which is the index
  ** in apCell[] of the cell that divides page i from page i+1.
  ** cntNew[k] should equal nCell.
  **
  ** Values computed by this block:
  **
  **           k: The total number of sibling pages
  **    szNew[i]: Space used on the i-th sibling page.
  **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
  **              the right of the i-th sibling page.
  ** usableSpace: Number of bytes of space available on each sibling.
  **
  */
  usableSpace = pBt->usableSize - 12 + leafCorrection;
  for(subtotal=k=i=0; i<nCell; i++){
    assert( i<nMaxCells );
    subtotal += szCell[i] + 2;
    if( subtotal > usableSpace ){
      szNew[k] = subtotal - szCell[i];
      cntNew[k] = i;
      /* In a leaf-data tree no cell of apCell[] is used up as a divider,
      ** so step back and let cell i be counted again as the first cell
      ** of the next page. */
      if( leafData ){ i--; }
      subtotal = 0;
      k++;
    }
  }
  szNew[k] = subtotal;
  cntNew[k] = nCell;
  k++;

  /*
  ** The packing computed by the previous block is biased toward the siblings
  ** on the left side.  The left siblings are always nearly full, while the
  ** right-most sibling might be nearly empty.  This block of code attempts
  ** to adjust the packing of siblings to get a better balance.
  **
  ** This adjustment is more than an optimization.  The packing above might
  ** be so out of balance as to be illegal.  For example, the right-most
  ** sibling might be completely empty.  This adjustment is not optional.
  */
  for(i=k-1; i>0; i--){
    int szRight = szNew[i];  /* Size of sibling on the right */
    int szLeft = szNew[i-1]; /* Size of sibling on the left */
    int r;              /* Index of right-most cell in left sibling */
    int d;              /* Index of first cell to the left of right sibling */

    r = cntNew[i-1] - 1;
    d = r + 1 - leafData;
    assert( d<nMaxCells );
    assert( r<nMaxCells );
    /* Move the boundary one cell to the left for as long as the right
    ** page is empty or would still be no larger than the left page. */
    while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
      szRight += szCell[d] + 2;
      szLeft -= szCell[r] + 2;
      cntNew[i-1]--;
      r = cntNew[i-1] - 1;
      d = r + 1 - leafData;
    }
    szNew[i] = szRight;
    szNew[i-1] = szLeft;
  }

  /* Either we found one or more cells (cntNew[0]>0) or we are a
  ** virtual root page.  A virtual root page is when the real root
  ** page is page 1 and we are the only child of that page.
  */
  assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );

  /*
  ** Allocate k new pages.  Reuse old pages where possible.
  **
  ** A reused page is moved from apOld[] to apNew[] and its apOld[] slot
  ** is set to 0.  nNew is incremented before the error check so that the
  ** cleanup code releases the moved reference through apNew[].
  */
  assert( pPage->pgno>1 );
  pageFlags = pPage->aData[0];
  for(i=0; i<k; i++){
    MemPage *pNew;
    if( i<nOld ){
      pNew = apNew[i] = apOld[i];
      pgnoNew[i] = pgnoOld[i];
      apOld[i] = 0;
      rc = sqlite3PagerWrite(pNew->pDbPage);
      nNew++;
      if( rc ) goto balance_cleanup;
    }else{
      assert( i>0 );
      rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
      if( rc ) goto balance_cleanup;
      apNew[i] = pNew;
      nNew++;
    }
  }

  /* Free any old pages that were not reused as new pages.
  ** (i equals k on entry to this loop.)
  */
  while( i<nOld ){
    rc = freePage(apOld[i]);
    if( rc ) goto balance_cleanup;
    releasePage(apOld[i]);
    apOld[i] = 0;
    i++;
  }

  /*
  ** Put the new pages in ascending order.  This helps to
  ** keep entries in the disk file in order so that a scan
  ** of the table is a linear scan through the file.  That
  ** in turn helps the operating system to deliver pages
  ** from the disk more rapidly.
  **
  ** An O(n^2) selection-style sort is used, but since n is
  ** small (apNew[] holds at most NB+2 pages), that should
  ** not be a problem.
  **
  ** When NB==3, this one optimization makes the database
  ** about 25% faster for large insertions and deletions.
  */
  for(i=0; i<k-1; i++){
    int minV = pgnoNew[i];
    int minI = i;
    for(j=i+1; j<k; j++){
      if( pgnoNew[j]<(unsigned)minV ){
        minI = j;
        minV = pgnoNew[j];
      }
    }
    if( minI>i ){
      int t;
      MemPage *pT;
      t = pgnoNew[i];
      pT = apNew[i];
      pgnoNew[i] = pgnoNew[minI];
      apNew[i] = apNew[minI];
      pgnoNew[minI] = t;
      apNew[minI] = pT;
    }
  }
  TRACE(("BALANCE: old: %d %d %d  new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
    pgnoOld[0],
    nOld>=2 ? pgnoOld[1] : 0,
    nOld>=3 ? pgnoOld[2] : 0,
    pgnoNew[0], szNew[0],
    nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
    nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
    nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
    nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));

  /*
  ** Evenly distribute the data in apCell[] across the new pages.
  ** Insert divider cells into pParent as necessary.
  **
  ** j is the index in apCell[] of the next cell to be placed.
  */
  j = 0;
  for(i=0; i<nNew; i++){
    /* Assemble the new sibling page. */
    MemPage *pNew = apNew[i];
    assert( j<nMaxCells );
    assert( pNew->pgno==pgnoNew[i] );
    zeroPage(pNew, pageFlags);
    assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
    assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
    assert( pNew->nOverflow==0 );

    /* If this is an auto-vacuum database, update the pointer map entries
    ** that point to the siblings that were rearranged. These can be: left
    ** children of cells, the right-child of the page, or overflow pages
    ** pointed to by cells.
    **
    ** Cells that ended up on the same page number they came from are
    ** skipped.
    */
    if( ISAUTOVACUUM ){
      for(k=j; k<cntNew[i]; k++){
        assert( k<nMaxCells );
        if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
          rc = ptrmapPutOvfl(pNew, k-j);
          if( rc==SQLITE_OK && leafCorrection==0 ){
            rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
          }
          if( rc!=SQLITE_OK ){
            goto balance_cleanup;
          }
        }
      }
    }

    j = cntNew[i];

    /* If the sibling page assembled above was not the right-most sibling,
    ** insert a divider cell into the parent page.
    */
    if( i<nNew-1 && j<nCell ){
      u8 *pCell;
      u8 *pTemp;
      int sz;

      assert( j<nMaxCells );
      pCell = apCell[j];
      sz = szCell[j] + leafCorrection;
      pTemp = &aSpace2[iSpace2];
      if( !pNew->leaf ){
        /* Interior page: the divider's left-child pointer becomes the
        ** right-child pointer of the sibling just assembled. */
        memcpy(&pNew->aData[8], pCell, 4);
        if( ISAUTOVACUUM
         && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
        ){
          rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
          if( rc!=SQLITE_OK ){
            goto balance_cleanup;
          }
        }
      }else if( leafData ){
        /* If the tree is a leaf-data tree, and the siblings are leaves,
        ** then there is no divider cell in apCell[]. Instead, the divider
        ** cell consists of the integer key for the right-most cell of
        ** the sibling-page assembled above only.
        **
        ** The cell is built directly in aSpace2 (pCell==pTemp), and pTemp
        ** is then cleared before being passed to insertCell().
        */
        CellInfo info;
        j--;
        sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
        pCell = pTemp;
        rc = fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
        if( rc!=SQLITE_OK ){
          goto balance_cleanup;
        }
        pTemp = 0;
      }else{
        /* Back up over the 4 bytes reserved for the child pointer that
        ** was stripped (by leafCorrection) when the cell was loaded. */
        pCell -= 4;
        /* Obscure case for non-leaf-data trees: If the cell at pCell was
        ** previously stored on a leaf node, and its reported size was 4
        ** bytes, then it may actually be smaller than this
        ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
        ** any cell). But it is important to pass the correct size to
        ** insertCell(), so reparse the cell now.
        **
        ** Note that this can never happen in an SQLite data file, as all
        ** cells are at least 4 bytes. It only happens in b-trees used
        ** to evaluate "IN (SELECT ...)" and similar clauses.
        */
        if( szCell[j]==4 ){
          assert(leafCorrection==4);
          sz = cellSizePtr(pParent, pCell);
        }
      }
      iSpace2 += sz;
      assert( sz<=pBt->pageSize/4 );
      assert( iSpace2<=pBt->pageSize );
      rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
      if( rc!=SQLITE_OK ) goto balance_cleanup;
      assert( sqlite3PagerIswriteable(pParent->pDbPage) );
      put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);

      /* If this is an auto-vacuum database, and not a leaf-data tree,
      ** then update the pointer map with an entry for the overflow page
      ** that the cell just inserted points to (if any).
      */
      if( ISAUTOVACUUM && !leafData ){
        rc = ptrmapPutOvfl(pParent, nxDiv);
        if( rc!=SQLITE_OK ){
          goto balance_cleanup;
        }
      }
      j++;
      nxDiv++;
    }

    /* Set the pointer-map entry for the new sibling page. */
    if( ISAUTOVACUUM ){
      rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
      if( rc!=SQLITE_OK ){
        goto balance_cleanup;
      }
    }
  }
  assert( j==nCell );
  assert( nOld>0 );
  assert( nNew>0 );
  if( (pageFlags & PTF_LEAF)==0 ){
    /* Interior siblings: the right-child pointer of the last old sibling
    ** becomes the right-child pointer of the last new sibling. */
    u8 *zChild = &apCopy[nOld-1]->aData[8];
    memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
    if( ISAUTOVACUUM ){
      rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
      if( rc!=SQLITE_OK ){
        goto balance_cleanup;
      }
    }
  }
  assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  if( nxDiv==pParent->nCell+pParent->nOverflow ){
    /* Right-most sibling is the right-most child of pParent */
    put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
  }else{
    /* Right-most sibling is the left child of the first entry in pParent
    ** past the right-most divider entry */
    put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
  }

  /*
  ** Balance the parent page.  Note that the current page (pPage) might
  ** have been added to the freelist so it might no longer be initialized.
  ** But the parent page will always be initialized.
  **
  ** The scratch buffer is freed and apCell cleared before the call to
  ** balance(), so the cleanup code below sees a null apCell.
  */
  assert( pParent->isInit );
  sqlite3ScratchFree(apCell);
  apCell = 0;
  TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
          pPage->pgno, nOld, nNew, nCell));
  pPage->nOverflow = 0;
  releasePage(pPage);
  pCur->iPage--;
  rc = balance(pCur, 0);

  /*
  ** Cleanup before returning.  This code is reached on both the success
  ** path and every error path.  apOld[] entries that were moved into
  ** apNew[] or freed above have already been set to 0.
  ** NOTE(review): this relies on releasePage(), sqlite3PageFree() and
  ** sqlite3ScratchFree() tolerating a null argument - confirm.
  */
balance_cleanup:
  sqlite3PageFree(aSpace2);
  sqlite3ScratchFree(apCell);
  for(i=0; i<nOld; i++){
    releasePage(apOld[i]);
  }
  for(i=0; i<nNew; i++){
    releasePage(apNew[i]);
  }
  /* Clear the overflow count of the page at the cursor's current level. */
  pCur->apPage[pCur->iPage]->nOverflow = 0;

  return rc;
}
5833 
5834 /*
5835 ** This routine is called for the root page of a btree when the root
5836 ** page contains no cells.  This is an opportunity to make the tree
5837 ** shallower by one level.
5838 */
5839 static int balance_shallower(BtCursor *pCur){
5840   MemPage *pPage;              /* Root page of B-Tree */
5841   MemPage *pChild;             /* The only child page of pPage */
5842   Pgno pgnoChild;              /* Page number for pChild */
5843   int rc = SQLITE_OK;          /* Return code from subprocedures */
5844   BtShared *pBt;                  /* The main BTree structure */
5845   int mxCellPerPage;           /* Maximum number of cells per page */
5846   u8 **apCell;                 /* All cells from pages being balanced */
5847   u16 *szCell;                 /* Local size of all cells */
5848 
5849   assert( pCur->iPage==0 );
5850   pPage = pCur->apPage[0];
5851 
5852   assert( pPage->nCell==0 );
5853   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5854   pBt = pPage->pBt;
5855   mxCellPerPage = MX_CELL(pBt);
5856   apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
5857   if( apCell==0 ) return SQLITE_NOMEM;
5858   szCell = (u16*)&apCell[mxCellPerPage];
5859   if( pPage->leaf ){
5860     /* The table is completely empty */
5861     TRACE(("BALANCE: empty table %d\n", pPage->pgno));
5862   }else{
5863     /* The root page is empty but has one child.  Transfer the
5864     ** information from that one child into the root page if it
5865     ** will fit.  This reduces the depth of the tree by one.
5866     **
5867     ** If the root page is page 1, it has less space available than
5868     ** its child (due to the 100 byte header that occurs at the beginning
5869     ** of the database fle), so it might not be able to hold all of the
5870     ** information currently contained in the child.  If this is the
5871     ** case, then do not do the transfer.  Leave page 1 empty except
5872     ** for the right-pointer to the child page.  The child page becomes
5873     ** the virtual root of the tree.
5874     */
5875     VVA_ONLY( pCur->pagesShuffled = 1 );
5876     pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5877     assert( pgnoChild>0 );
5878     assert( pgnoChild<=pagerPagecount(pPage->pBt) );
5879     rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
5880     if( rc ) goto end_shallow_balance;
5881     if( pPage->pgno==1 ){
5882       rc = sqlite3BtreeInitPage(pChild);
5883       if( rc ) goto end_shallow_balance;
5884       assert( pChild->nOverflow==0 );
5885       if( pChild->nFree>=100 ){
5886         /* The child information will fit on the root page, so do the
5887         ** copy */
5888         int i;
5889         zeroPage(pPage, pChild->aData[0]);
5890         for(i=0; i<pChild->nCell; i++){
5891           apCell[i] = findCell(pChild,i);
5892           szCell[i] = cellSizePtr(pChild, apCell[i]);
5893         }
5894         assemblePage(pPage, pChild->nCell, apCell, szCell);
5895         /* Copy the right-pointer of the child to the parent. */
5896         assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5897         put4byte(&pPage->aData[pPage->hdrOffset+8],
5898             get4byte(&pChild->aData[pChild->hdrOffset+8]));
5899         rc = freePage(pChild);
5900         TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
5901       }else{
5902         /* The child has more information that will fit on the root.
5903         ** The tree is already balanced.  Do nothing. */
5904         TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
5905       }
5906     }else{
5907       memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
5908       pPage->isInit = 0;
5909       rc = sqlite3BtreeInitPage(pPage);
5910       assert( rc==SQLITE_OK );
5911       freePage(pChild);
5912       TRACE(("BALANCE: transfer child %d into root %d\n",
5913               pChild->pgno, pPage->pgno));
5914     }
5915     assert( pPage->nOverflow==0 );
5916 #ifndef SQLITE_OMIT_AUTOVACUUM
5917     if( ISAUTOVACUUM && rc==SQLITE_OK ){
5918       rc = setChildPtrmaps(pPage);
5919     }
5920 #endif
5921     releasePage(pChild);
5922   }
5923 end_shallow_balance:
5924   sqlite3_free(apCell);
5925   return rc;
5926 }
5927 
5928 
5929 /*
5930 ** The root page is overfull
5931 **
5932 ** When this happens, Create a new child page and copy the
5933 ** contents of the root into the child.  Then make the root
5934 ** page an empty page with rightChild pointing to the new
5935 ** child.   Finally, call balance_internal() on the new child
5936 ** to cause it to split.
5937 */
5938 static int balance_deeper(BtCursor *pCur){
5939   int rc;             /* Return value from subprocedures */
5940   MemPage *pPage;     /* Pointer to the root page */
5941   MemPage *pChild;    /* Pointer to a new child page */
5942   Pgno pgnoChild;     /* Page number of the new child page */
5943   BtShared *pBt;         /* The BTree */
5944   int usableSize;     /* Total usable size of a page */
5945   u8 *data;           /* Content of the parent page */
5946   u8 *cdata;          /* Content of the child page */
5947   int hdr;            /* Offset to page header in parent */
5948   int cbrk;           /* Offset to content of first cell in parent */
5949 
5950   assert( pCur->iPage==0 );
5951   assert( pCur->apPage[0]->nOverflow>0 );
5952 
5953   VVA_ONLY( pCur->pagesShuffled = 1 );
5954   pPage = pCur->apPage[0];
5955   pBt = pPage->pBt;
5956   assert( sqlite3_mutex_held(pBt->mutex) );
5957   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5958   rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
5959   if( rc ) return rc;
5960   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
5961   usableSize = pBt->usableSize;
5962   data = pPage->aData;
5963   hdr = pPage->hdrOffset;
5964   cbrk = get2byte(&data[hdr+5]);
5965   cdata = pChild->aData;
5966   memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
5967   memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
5968 
5969   assert( pChild->isInit==0 );
5970   rc = sqlite3BtreeInitPage(pChild);
5971   if( rc==SQLITE_OK ){
5972     int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
5973     memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
5974     pChild->nOverflow = pPage->nOverflow;
5975     if( pChild->nOverflow ){
5976       pChild->nFree = 0;
5977     }
5978     assert( pChild->nCell==pPage->nCell );
5979     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5980     zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
5981     put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
5982     TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
5983     if( ISAUTOVACUUM ){
5984       rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
5985 #ifndef SQLITE_OMIT_AUTOVACUUM
5986       if( rc==SQLITE_OK ){
5987         rc = setChildPtrmaps(pChild);
5988       }
5989       if( rc ){
5990         pChild->nOverflow = 0;
5991       }
5992 #endif
5993     }
5994   }
5995 
5996   if( rc==SQLITE_OK ){
5997     pCur->iPage++;
5998     pCur->apPage[1] = pChild;
5999     pCur->aiIdx[0] = 0;
6000     rc = balance_nonroot(pCur);
6001   }else{
6002     releasePage(pChild);
6003   }
6004 
6005   return rc;
6006 }
6007 
6008 /*
6009 ** The page that pCur currently points to has just been modified in
6010 ** some way. This function figures out if this modification means the
6011 ** tree needs to be balanced, and if so calls the appropriate balancing
6012 ** routine.
6013 **
6014 ** Parameter isInsert is true if a new cell was just inserted into the
6015 ** page, or false otherwise.
6016 */
6017 static int balance(BtCursor *pCur, int isInsert){
6018   int rc = SQLITE_OK;
6019   MemPage *pPage = pCur->apPage[pCur->iPage];
6020 
6021   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6022   if( pCur->iPage==0 ){
6023     rc = sqlite3PagerWrite(pPage->pDbPage);
6024     if( rc==SQLITE_OK && pPage->nOverflow>0 ){
6025       rc = balance_deeper(pCur);
6026       assert( pCur->apPage[0]==pPage );
6027       assert( pPage->nOverflow==0 || rc!=SQLITE_OK );
6028     }
6029     if( rc==SQLITE_OK && pPage->nCell==0 ){
6030       rc = balance_shallower(pCur);
6031       assert( pCur->apPage[0]==pPage );
6032       assert( pPage->nOverflow==0 || rc!=SQLITE_OK );
6033     }
6034   }else{
6035     if( pPage->nOverflow>0 ||
6036         (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
6037       rc = balance_nonroot(pCur);
6038     }
6039   }
6040   return rc;
6041 }
6042 
6043 /*
6044 ** This routine checks all cursors that point to table pgnoRoot.
6045 ** If any of those cursors were opened with wrFlag==0 in a different
6046 ** database connection (a database connection that shares the pager
6047 ** cache with the current connection) and that other connection
6048 ** is not in the ReadUncommmitted state, then this routine returns
6049 ** SQLITE_LOCKED.
6050 **
6051 ** As well as cursors with wrFlag==0, cursors with
6052 ** isIncrblobHandle==1 are also considered 'read' cursors because
6053 ** incremental blob cursors are used for both reading and writing.
6054 **
6055 ** When pgnoRoot is the root page of an intkey table, this function is also
6056 ** responsible for invalidating incremental blob cursors when the table row
6057 ** on which they are opened is deleted or modified. Cursors are invalidated
6058 ** according to the following rules:
6059 **
6060 **   1) When BtreeClearTable() is called to completely delete the contents
6061 **      of a B-Tree table, pExclude is set to zero and parameter iRow is
6062 **      set to non-zero. In this case all incremental blob cursors open
6063 **      on the table rooted at pgnoRoot are invalidated.
6064 **
6065 **   2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to
6066 **      modify a table row via an SQL statement, pExclude is set to the
6067 **      write cursor used to do the modification and parameter iRow is set
6068 **      to the integer row id of the B-Tree entry being modified. Unless
6069 **      pExclude is itself an incremental blob cursor, then all incremental
6070 **      blob cursors open on row iRow of the B-Tree are invalidated.
6071 **
6072 **   3) If both pExclude and iRow are set to zero, no incremental blob
6073 **      cursors are invalidated.
6074 */
6075 static int checkForReadConflicts(
6076   Btree *pBtree,          /* The database file to check */
6077   Pgno pgnoRoot,          /* Look for read cursors on this btree */
6078   BtCursor *pExclude,     /* Ignore this cursor */
6079   i64 iRow                /* The rowid that might be changing */
6080 ){
6081   BtCursor *p;
6082   BtShared *pBt = pBtree->pBt;
6083   sqlite3 *db = pBtree->db;
6084   assert( sqlite3BtreeHoldsMutex(pBtree) );
6085   for(p=pBt->pCursor; p; p=p->pNext){
6086     if( p==pExclude ) continue;
6087     if( p->pgnoRoot!=pgnoRoot ) continue;
6088 #ifndef SQLITE_OMIT_INCRBLOB
6089     if( p->isIncrblobHandle && (
6090          (!pExclude && iRow)
6091       || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
6092     )){
6093       p->eState = CURSOR_INVALID;
6094     }
6095 #endif
6096     if( p->eState!=CURSOR_VALID ) continue;
6097     if( p->wrFlag==0
6098 #ifndef SQLITE_OMIT_INCRBLOB
6099      || p->isIncrblobHandle
6100 #endif
6101     ){
6102       sqlite3 *dbOther = p->pBtree->db;
6103       assert(dbOther);
6104       if( dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0 ){
6105         sqlite3ConnectionBlocked(db, dbOther);
6106         return SQLITE_LOCKED_SHAREDCACHE;
6107       }
6108     }
6109   }
6110   return SQLITE_OK;
6111 }
6112 
6113 /*
6114 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
6115 ** and the data is given by (pData,nData).  The cursor is used only to
6116 ** define what table the record should be inserted into.  The cursor
6117 ** is left pointing at a random location.
6118 **
6119 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
6120 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
6121 */
6122 int sqlite3BtreeInsert(
6123   BtCursor *pCur,                /* Insert data into the table of this cursor */
6124   const void *pKey, i64 nKey,    /* The key of the new record */
6125   const void *pData, int nData,  /* The data of the new record */
6126   int nZero,                     /* Number of extra 0 bytes to append to data */
6127   int appendBias                 /* True if this is likely an append */
6128 ){
6129   int rc;
6130   int loc;
6131   int szNew;
6132   int idx;
6133   MemPage *pPage;
6134   Btree *p = pCur->pBtree;
6135   BtShared *pBt = p->pBt;
6136   unsigned char *oldCell;
6137   unsigned char *newCell = 0;
6138 
6139   assert( cursorHoldsMutex(pCur) );
6140   assert( pBt->inTransaction==TRANS_WRITE );
6141   assert( !pBt->readOnly );
6142   assert( pCur->wrFlag );
6143   rc = checkForReadConflicts(pCur->pBtree, pCur->pgnoRoot, pCur, nKey);
6144   if( rc ){
6145     /* The table pCur points to has a read lock */
6146     assert( rc==SQLITE_LOCKED_SHAREDCACHE );
6147     return rc;
6148   }
6149   if( pCur->eState==CURSOR_FAULT ){
6150     return pCur->skip;
6151   }
6152 
6153   /* Save the positions of any other cursors open on this table */
6154   sqlite3BtreeClearCursor(pCur);
6155   if(
6156     SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
6157     SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
6158   ){
6159     return rc;
6160   }
6161 
6162   pPage = pCur->apPage[pCur->iPage];
6163   assert( pPage->intKey || nKey>=0 );
6164   assert( pPage->leaf || !pPage->intKey );
6165   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
6166           pCur->pgnoRoot, nKey, nData, pPage->pgno,
6167           loc==0 ? "overwrite" : "new entry"));
6168   assert( pPage->isInit );
6169   allocateTempSpace(pBt);
6170   newCell = pBt->pTmpSpace;
6171   if( newCell==0 ) return SQLITE_NOMEM;
6172   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
6173   if( rc ) goto end_insert;
6174   assert( szNew==cellSizePtr(pPage, newCell) );
6175   assert( szNew<=MX_CELL_SIZE(pBt) );
6176   idx = pCur->aiIdx[pCur->iPage];
6177   if( loc==0 && CURSOR_VALID==pCur->eState ){
6178     u16 szOld;
6179     assert( idx<pPage->nCell );
6180     rc = sqlite3PagerWrite(pPage->pDbPage);
6181     if( rc ){
6182       goto end_insert;
6183     }
6184     oldCell = findCell(pPage, idx);
6185     if( !pPage->leaf ){
6186       memcpy(newCell, oldCell, 4);
6187     }
6188     szOld = cellSizePtr(pPage, oldCell);
6189     rc = clearCell(pPage, oldCell);
6190     if( rc ) goto end_insert;
6191     rc = dropCell(pPage, idx, szOld);
6192     if( rc!=SQLITE_OK ) {
6193       goto end_insert;
6194     }
6195   }else if( loc<0 && pPage->nCell>0 ){
6196     assert( pPage->leaf );
6197     idx = ++pCur->aiIdx[pCur->iPage];
6198     pCur->info.nSize = 0;
6199     pCur->validNKey = 0;
6200   }else{
6201     assert( pPage->leaf );
6202   }
6203   rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
6204   if( rc==SQLITE_OK ){
6205     rc = balance(pCur, 1);
6206   }
6207 
6208   /* Must make sure nOverflow is reset to zero even if the balance()
6209   ** fails.  Internal data structure corruption will result otherwise. */
6210   pCur->apPage[pCur->iPage]->nOverflow = 0;
6211 
6212   if( rc==SQLITE_OK ){
6213     moveToRoot(pCur);
6214   }
6215 end_insert:
6216   return rc;
6217 }
6218 
6219 /*
6220 ** Delete the entry that the cursor is pointing to.  The cursor
6221 ** is left pointing at a arbitrary location.
6222 */
6223 int sqlite3BtreeDelete(BtCursor *pCur){
6224   MemPage *pPage = pCur->apPage[pCur->iPage];
6225   int idx;
6226   unsigned char *pCell;
6227   int rc;
6228   Pgno pgnoChild = 0;
6229   Btree *p = pCur->pBtree;
6230   BtShared *pBt = p->pBt;
6231 
6232   assert( cursorHoldsMutex(pCur) );
6233   assert( pPage->isInit );
6234   assert( pBt->inTransaction==TRANS_WRITE );
6235   assert( !pBt->readOnly );
6236   if( pCur->eState==CURSOR_FAULT ){
6237     return pCur->skip;
6238   }
6239   if( NEVER(pCur->aiIdx[pCur->iPage]>=pPage->nCell) ){
6240     return SQLITE_ERROR;  /* The cursor is not pointing to anything */
6241   }
6242   assert( pCur->wrFlag );
6243   rc = checkForReadConflicts(p, pCur->pgnoRoot, pCur, pCur->info.nKey);
6244   if( rc!=SQLITE_OK ){
6245     /* The table pCur points to has a read lock */
6246     assert( rc==SQLITE_LOCKED_SHAREDCACHE );
6247     return rc;
6248   }
6249 
6250   /* Restore the current cursor position (a no-op if the cursor is not in
6251   ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
6252   ** open on the same table. Then call sqlite3PagerWrite() on the page
6253   ** that the entry will be deleted from.
6254   */
6255   if(
6256     (rc = restoreCursorPosition(pCur))!=0 ||
6257     (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
6258     (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
6259   ){
6260     return rc;
6261   }
6262 
6263   /* Locate the cell within its page and leave pCell pointing to the
6264   ** data. The clearCell() call frees any overflow pages associated with the
6265   ** cell. The cell itself is still intact.
6266   */
6267   idx = pCur->aiIdx[pCur->iPage];
6268   pCell = findCell(pPage, idx);
6269   if( !pPage->leaf ){
6270     pgnoChild = get4byte(pCell);
6271   }
6272   rc = clearCell(pPage, pCell);
6273   if( rc ){
6274     return rc;
6275   }
6276 
6277   if( !pPage->leaf ){
6278     /*
6279     ** The entry we are about to delete is not a leaf so if we do not
6280     ** do something we will leave a hole on an internal page.
6281     ** We have to fill the hole by moving in a cell from a leaf.  The
6282     ** next Cell after the one to be deleted is guaranteed to exist and
6283     ** to be a leaf so we can use it.
6284     */
6285     BtCursor leafCur;
6286     MemPage *pLeafPage = 0;
6287 
6288     unsigned char *pNext;
6289     int notUsed;
6290     unsigned char *tempCell = 0;
6291     assert( !pPage->intKey );
6292     sqlite3BtreeGetTempCursor(pCur, &leafCur);
6293     rc = sqlite3BtreeNext(&leafCur, &notUsed);
6294     if( rc==SQLITE_OK ){
6295       assert( leafCur.aiIdx[leafCur.iPage]==0 );
6296       pLeafPage = leafCur.apPage[leafCur.iPage];
6297       rc = sqlite3PagerWrite(pLeafPage->pDbPage);
6298     }
6299     if( rc==SQLITE_OK ){
6300       int leafCursorInvalid = 0;
6301       u16 szNext;
6302       TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
6303          pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
6304       dropCell(pPage, idx, cellSizePtr(pPage, pCell));
6305       pNext = findCell(pLeafPage, 0);
6306       szNext = cellSizePtr(pLeafPage, pNext);
6307       assert( MX_CELL_SIZE(pBt)>=szNext+4 );
6308       allocateTempSpace(pBt);
6309       tempCell = pBt->pTmpSpace;
6310       if( tempCell==0 ){
6311         rc = SQLITE_NOMEM;
6312       }
6313       if( rc==SQLITE_OK ){
6314         rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
6315       }
6316 
6317 
6318       /* The "if" statement in the next code block is critical.  The
6319       ** slightest error in that statement would allow SQLite to operate
6320       ** correctly most of the time but produce very rare failures.  To
6321       ** guard against this, the following macros help to verify that
6322       ** the "if" statement is well tested.
6323       */
6324       testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3
6325                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
6326       testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3
6327                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
6328       testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1
6329                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
6330       testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
6331                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
6332       testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
6333                  && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
6334 
6335 
6336       if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
6337           (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
6338       ){
6339         /* This branch is taken if the internal node is now either overflowing
6340         ** or underfull and the leaf node will be underfull after the just cell
6341         ** copied to the internal node is deleted from it. This is a special
6342         ** case because the call to balance() to correct the internal node
6343         ** may change the tree structure and invalidate the contents of
6344         ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
6345         ** used by the balance() required to correct the underfull leaf
6346         ** node.
6347         **
6348         ** The formula used in the expression above are based on facets of
6349         ** the SQLite file-format that do not change over time.
6350         */
6351         testcase( pPage->nFree==pBt->usableSize*2/3+1 );
6352         testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
6353         leafCursorInvalid = 1;
6354       }
6355 
6356       if( rc==SQLITE_OK ){
6357         assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6358         put4byte(findOverflowCell(pPage, idx), pgnoChild);
6359         VVA_ONLY( pCur->pagesShuffled = 0 );
6360         rc = balance(pCur, 0);
6361       }
6362 
6363       if( rc==SQLITE_OK && leafCursorInvalid ){
6364         /* The leaf-node is now underfull and so the tree needs to be
6365         ** rebalanced. However, the balance() operation on the internal
6366         ** node above may have modified the structure of the B-Tree and
6367         ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
6368         ** may not be trusted.
6369         **
6370         ** It is not possible to copy the ancestry from pCur, as the same
6371         ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
6372         ** arrays.
6373         **
6374         ** The call to saveCursorPosition() below internally saves the
6375         ** key that leafCur is currently pointing to. Currently, there
6376         ** are two copies of that key in the tree - one here on the leaf
6377         ** page and one on some internal node in the tree. The copy on
6378         ** the leaf node is always the next key in tree-order after the
6379         ** copy on the internal node. So, the call to sqlite3BtreeNext()
6380         ** calls restoreCursorPosition() to point the cursor to the copy
6381         ** stored on the internal node, then advances to the next entry,
6382         ** which happens to be the copy of the key on the internal node.
6383         ** Net effect: leafCur is pointing back to the duplicate cell
6384         ** that needs to be removed, and the leafCur.apPage[] and
6385         ** leafCur.aiIdx[] arrays are correct.
6386         */
6387         VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
6388         rc = saveCursorPosition(&leafCur);
6389         if( rc==SQLITE_OK ){
6390           rc = sqlite3BtreeNext(&leafCur, &notUsed);
6391         }
6392         pLeafPage = leafCur.apPage[leafCur.iPage];
6393         assert( rc!=SQLITE_OK || pLeafPage->pgno==leafPgno );
6394         assert( rc!=SQLITE_OK || leafCur.aiIdx[leafCur.iPage]==0 );
6395       }
6396 
6397       if( SQLITE_OK==rc
6398        && SQLITE_OK==(rc = sqlite3PagerWrite(pLeafPage->pDbPage))
6399       ){
6400         dropCell(pLeafPage, 0, szNext);
6401         VVA_ONLY( leafCur.pagesShuffled = 0 );
6402         rc = balance(&leafCur, 0);
6403         assert( leafCursorInvalid || !leafCur.pagesShuffled
6404                                    || !pCur->pagesShuffled );
6405       }
6406     }
6407     sqlite3BtreeReleaseTempCursor(&leafCur);
6408   }else{
6409     TRACE(("DELETE: table=%d delete from leaf %d\n",
6410        pCur->pgnoRoot, pPage->pgno));
6411     rc = dropCell(pPage, idx, cellSizePtr(pPage, pCell));
6412     if( rc==SQLITE_OK ){
6413       rc = balance(pCur, 0);
6414     }
6415   }
6416   if( rc==SQLITE_OK ){
6417     moveToRoot(pCur);
6418   }
6419   return rc;
6420 }
6421 
6422 /*
6423 ** Create a new BTree table.  Write into *piTable the page
6424 ** number for the root page of the new table.
6425 **
6426 ** The type of type is determined by the flags parameter.  Only the
6427 ** following values of flags are currently in use.  Other values for
6428 ** flags might not work:
6429 **
6430 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
6431 **     BTREE_ZERODATA                  Used for SQL indices
6432 */
6433 static int btreeCreateTable(Btree *p, int *piTable, int flags){
6434   BtShared *pBt = p->pBt;
6435   MemPage *pRoot;
6436   Pgno pgnoRoot;
6437   int rc;
6438 
6439   assert( sqlite3BtreeHoldsMutex(p) );
6440   assert( pBt->inTransaction==TRANS_WRITE );
6441   assert( !pBt->readOnly );
6442 
6443 #ifdef SQLITE_OMIT_AUTOVACUUM
6444   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6445   if( rc ){
6446     return rc;
6447   }
6448 #else
6449   if( pBt->autoVacuum ){
6450     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
6451     MemPage *pPageMove; /* The page to move to. */
6452 
6453     /* Creating a new table may probably require moving an existing database
6454     ** to make room for the new tables root page. In case this page turns
6455     ** out to be an overflow page, delete all overflow page-map caches
6456     ** held by open cursors.
6457     */
6458     invalidateAllOverflowCache(pBt);
6459 
6460     /* Read the value of meta[3] from the database to determine where the
6461     ** root page of the new table should go. meta[3] is the largest root-page
6462     ** created so far, so the new root-page is (meta[3]+1).
6463     */
6464     rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
6465     if( rc!=SQLITE_OK ){
6466       return rc;
6467     }
6468     pgnoRoot++;
6469 
6470     /* The new root-page may not be allocated on a pointer-map page, or the
6471     ** PENDING_BYTE page.
6472     */
6473     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
6474         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
6475       pgnoRoot++;
6476     }
6477     assert( pgnoRoot>=3 );
6478 
6479     /* Allocate a page. The page that currently resides at pgnoRoot will
6480     ** be moved to the allocated page (unless the allocated page happens
6481     ** to reside at pgnoRoot).
6482     */
6483     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
6484     if( rc!=SQLITE_OK ){
6485       return rc;
6486     }
6487 
6488     if( pgnoMove!=pgnoRoot ){
6489       /* pgnoRoot is the page that will be used for the root-page of
6490       ** the new table (assuming an error did not occur). But we were
6491       ** allocated pgnoMove. If required (i.e. if it was not allocated
6492       ** by extending the file), the current page at position pgnoMove
6493       ** is already journaled.
6494       */
6495       u8 eType;
6496       Pgno iPtrPage;
6497 
6498       releasePage(pPageMove);
6499 
6500       /* Move the page currently at pgnoRoot to pgnoMove. */
6501       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6502       if( rc!=SQLITE_OK ){
6503         return rc;
6504       }
6505       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
6506       if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
6507         releasePage(pRoot);
6508         return rc;
6509       }
6510       assert( eType!=PTRMAP_ROOTPAGE );
6511       assert( eType!=PTRMAP_FREEPAGE );
6512       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
6513       releasePage(pRoot);
6514 
6515       /* Obtain the page at pgnoRoot */
6516       if( rc!=SQLITE_OK ){
6517         return rc;
6518       }
6519       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6520       if( rc!=SQLITE_OK ){
6521         return rc;
6522       }
6523       rc = sqlite3PagerWrite(pRoot->pDbPage);
6524       if( rc!=SQLITE_OK ){
6525         releasePage(pRoot);
6526         return rc;
6527       }
6528     }else{
6529       pRoot = pPageMove;
6530     }
6531 
6532     /* Update the pointer-map and meta-data with the new root-page number. */
6533     rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
6534     if( rc ){
6535       releasePage(pRoot);
6536       return rc;
6537     }
6538     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
6539     if( rc ){
6540       releasePage(pRoot);
6541       return rc;
6542     }
6543 
6544   }else{
6545     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6546     if( rc ) return rc;
6547   }
6548 #endif
6549   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6550   zeroPage(pRoot, flags | PTF_LEAF);
6551   sqlite3PagerUnref(pRoot->pDbPage);
6552   *piTable = (int)pgnoRoot;
6553   return SQLITE_OK;
6554 }
6555 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
6556   int rc;
6557   sqlite3BtreeEnter(p);
6558   rc = btreeCreateTable(p, piTable, flags);
6559   sqlite3BtreeLeave(p);
6560   return rc;
6561 }
6562 
6563 /*
6564 ** Erase the given database page and all its children.  Return
6565 ** the page to the freelist.
6566 */
6567 static int clearDatabasePage(
6568   BtShared *pBt,           /* The BTree that contains the table */
6569   Pgno pgno,            /* Page number to clear */
6570   int freePageFlag,     /* Deallocate page if true */
6571   int *pnChange
6572 ){
6573   MemPage *pPage = 0;
6574   int rc;
6575   unsigned char *pCell;
6576   int i;
6577 
6578   assert( sqlite3_mutex_held(pBt->mutex) );
6579   if( pgno>pagerPagecount(pBt) ){
6580     return SQLITE_CORRUPT_BKPT;
6581   }
6582 
6583   rc = getAndInitPage(pBt, pgno, &pPage);
6584   if( rc ) goto cleardatabasepage_out;
6585   for(i=0; i<pPage->nCell; i++){
6586     pCell = findCell(pPage, i);
6587     if( !pPage->leaf ){
6588       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
6589       if( rc ) goto cleardatabasepage_out;
6590     }
6591     rc = clearCell(pPage, pCell);
6592     if( rc ) goto cleardatabasepage_out;
6593   }
6594   if( !pPage->leaf ){
6595     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange);
6596     if( rc ) goto cleardatabasepage_out;
6597   }else if( pnChange ){
6598     assert( pPage->intKey );
6599     *pnChange += pPage->nCell;
6600   }
6601   if( freePageFlag ){
6602     rc = freePage(pPage);
6603   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
6604     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
6605   }
6606 
6607 cleardatabasepage_out:
6608   releasePage(pPage);
6609   return rc;
6610 }
6611 
6612 /*
6613 ** Delete all information from a single table in the database.  iTable is
6614 ** the page number of the root of the table.  After this routine returns,
6615 ** the root page is empty, but still exists.
6616 **
6617 ** This routine will fail with SQLITE_LOCKED if there are any open
6618 ** read cursors on the table.  Open write cursors are moved to the
6619 ** root of the table.
6620 **
6621 ** If pnChange is not NULL, then table iTable must be an intkey table. The
6622 ** integer value pointed to by pnChange is incremented by the number of
6623 ** entries in the table.
6624 */
6625 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
6626   int rc;
6627   BtShared *pBt = p->pBt;
6628   sqlite3BtreeEnter(p);
6629   assert( p->inTrans==TRANS_WRITE );
6630   if( (rc = checkForReadConflicts(p, iTable, 0, 1))!=SQLITE_OK ){
6631     /* nothing to do */
6632   }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
6633     /* nothing to do */
6634   }else{
6635     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
6636   }
6637   sqlite3BtreeLeave(p);
6638   return rc;
6639 }
6640 
6641 /*
6642 ** Erase all information in a table and add the root of the table to
6643 ** the freelist.  Except, the root of the principle table (the one on
6644 ** page 1) is never added to the freelist.
6645 **
6646 ** This routine will fail with SQLITE_LOCKED if there are any open
6647 ** cursors on the table.
6648 **
6649 ** If AUTOVACUUM is enabled and the page at iTable is not the last
6650 ** root page in the database file, then the last root page
6651 ** in the database file is moved into the slot formerly occupied by
6652 ** iTable and that last slot formerly occupied by the last root page
6653 ** is added to the freelist instead of iTable.  In this say, all
6654 ** root pages are kept at the beginning of the database file, which
6655 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
6656 ** page number that used to be the last root page in the file before
6657 ** the move.  If no page gets moved, *piMoved is set to 0.
6658 ** The last root page is recorded in meta[3] and the value of
6659 ** meta[3] is updated by this procedure.
6660 */
6661 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
6662   int rc;
6663   MemPage *pPage = 0;
6664   BtShared *pBt = p->pBt;
6665 
6666   assert( sqlite3BtreeHoldsMutex(p) );
6667   assert( p->inTrans==TRANS_WRITE );
6668 
6669   /* It is illegal to drop a table if any cursors are open on the
6670   ** database. This is because in auto-vacuum mode the backend may
6671   ** need to move another root-page to fill a gap left by the deleted
6672   ** root page. If an open cursor was using this page a problem would
6673   ** occur.
6674   */
6675   if( pBt->pCursor ){
6676     sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
6677     return SQLITE_LOCKED_SHAREDCACHE;
6678   }
6679 
6680   rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
6681   if( rc ) return rc;
6682   rc = sqlite3BtreeClearTable(p, iTable, 0);
6683   if( rc ){
6684     releasePage(pPage);
6685     return rc;
6686   }
6687 
6688   *piMoved = 0;
6689 
6690   if( iTable>1 ){
6691 #ifdef SQLITE_OMIT_AUTOVACUUM
6692     rc = freePage(pPage);
6693     releasePage(pPage);
6694 #else
6695     if( pBt->autoVacuum ){
6696       Pgno maxRootPgno;
6697       rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
6698       if( rc!=SQLITE_OK ){
6699         releasePage(pPage);
6700         return rc;
6701       }
6702 
6703       if( iTable==maxRootPgno ){
6704         /* If the table being dropped is the table with the largest root-page
6705         ** number in the database, put the root page on the free list.
6706         */
6707         rc = freePage(pPage);
6708         releasePage(pPage);
6709         if( rc!=SQLITE_OK ){
6710           return rc;
6711         }
6712       }else{
6713         /* The table being dropped does not have the largest root-page
6714         ** number in the database. So move the page that does into the
6715         ** gap left by the deleted root-page.
6716         */
6717         MemPage *pMove;
6718         releasePage(pPage);
6719         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6720         if( rc!=SQLITE_OK ){
6721           return rc;
6722         }
6723         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
6724         releasePage(pMove);
6725         if( rc!=SQLITE_OK ){
6726           return rc;
6727         }
6728         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6729         if( rc!=SQLITE_OK ){
6730           return rc;
6731         }
6732         rc = freePage(pMove);
6733         releasePage(pMove);
6734         if( rc!=SQLITE_OK ){
6735           return rc;
6736         }
6737         *piMoved = maxRootPgno;
6738       }
6739 
6740       /* Set the new 'max-root-page' value in the database header. This
6741       ** is the old value less one, less one more if that happens to
6742       ** be a root-page number, less one again if that is the
6743       ** PENDING_BYTE_PAGE.
6744       */
6745       maxRootPgno--;
6746       if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
6747         maxRootPgno--;
6748       }
6749       if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
6750         maxRootPgno--;
6751       }
6752       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
6753 
6754       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
6755     }else{
6756       rc = freePage(pPage);
6757       releasePage(pPage);
6758     }
6759 #endif
6760   }else{
6761     /* If sqlite3BtreeDropTable was called on page 1. */
6762     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
6763     releasePage(pPage);
6764   }
6765   return rc;
6766 }
6767 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
6768   int rc;
6769   sqlite3BtreeEnter(p);
6770   rc = btreeDropTable(p, iTable, piMoved);
6771   sqlite3BtreeLeave(p);
6772   return rc;
6773 }
6774 
6775 
6776 /*
6777 ** Read the meta-information out of a database file.  Meta[0]
6778 ** is the number of free pages currently in the database.  Meta[1]
6779 ** through meta[15] are available for use by higher layers.  Meta[0]
6780 ** is read-only, the others are read/write.
6781 **
6782 ** The schema layer numbers meta values differently.  At the schema
6783 ** layer (and the SetCookie and ReadCookie opcodes) the number of
6784 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
6785 */
6786 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
6787   DbPage *pDbPage = 0;
6788   int rc;
6789   unsigned char *pP1;
6790   BtShared *pBt = p->pBt;
6791 
6792   sqlite3BtreeEnter(p);
6793 
6794   /* Reading a meta-data value requires a read-lock on page 1 (and hence
6795   ** the sqlite_master table. We grab this lock regardless of whether or
6796   ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
6797   ** 1 is treated as a special case by querySharedCacheTableLock()
6798   ** and setSharedCacheTableLock()).
6799   */
6800   rc = querySharedCacheTableLock(p, 1, READ_LOCK);
6801   if( rc!=SQLITE_OK ){
6802     sqlite3BtreeLeave(p);
6803     return rc;
6804   }
6805 
6806   assert( idx>=0 && idx<=15 );
6807   if( pBt->pPage1 ){
6808     /* The b-tree is already holding a reference to page 1 of the database
6809     ** file. In this case the required meta-data value can be read directly
6810     ** from the page data of this reference. This is slightly faster than
6811     ** requesting a new reference from the pager layer.
6812     */
6813     pP1 = (unsigned char *)pBt->pPage1->aData;
6814   }else{
6815     /* The b-tree does not have a reference to page 1 of the database file.
6816     ** Obtain one from the pager layer.
6817     */
6818     rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
6819     if( rc ){
6820       sqlite3BtreeLeave(p);
6821       return rc;
6822     }
6823     pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
6824   }
6825   *pMeta = get4byte(&pP1[36 + idx*4]);
6826 
6827   /* If the b-tree is not holding a reference to page 1, then one was
6828   ** requested from the pager layer in the above block. Release it now.
6829   */
6830   if( !pBt->pPage1 ){
6831     sqlite3PagerUnref(pDbPage);
6832   }
6833 
6834   /* If autovacuumed is disabled in this build but we are trying to
6835   ** access an autovacuumed database, then make the database readonly.
6836   */
6837 #ifdef SQLITE_OMIT_AUTOVACUUM
6838   if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
6839 #endif
6840 
6841   /* If there is currently an open transaction, grab a read-lock
6842   ** on page 1 of the database file. This is done to make sure that
6843   ** no other connection can modify the meta value just read from
6844   ** the database until the transaction is concluded.
6845   */
6846   if( p->inTrans>0 ){
6847     rc = setSharedCacheTableLock(p, 1, READ_LOCK);
6848   }
6849   sqlite3BtreeLeave(p);
6850   return rc;
6851 }
6852 
6853 /*
6854 ** Write meta-information back into the database.  Meta[0] is
6855 ** read-only and may not be written.
6856 */
6857 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
6858   BtShared *pBt = p->pBt;
6859   unsigned char *pP1;
6860   int rc;
6861   assert( idx>=1 && idx<=15 );
6862   sqlite3BtreeEnter(p);
6863   assert( p->inTrans==TRANS_WRITE );
6864   assert( pBt->pPage1!=0 );
6865   pP1 = pBt->pPage1->aData;
6866   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6867   if( rc==SQLITE_OK ){
6868     put4byte(&pP1[36 + idx*4], iMeta);
6869 #ifndef SQLITE_OMIT_AUTOVACUUM
6870     if( idx==7 ){
6871       assert( pBt->autoVacuum || iMeta==0 );
6872       assert( iMeta==0 || iMeta==1 );
6873       pBt->incrVacuum = (u8)iMeta;
6874     }
6875 #endif
6876   }
6877   sqlite3BtreeLeave(p);
6878   return rc;
6879 }
6880 
6881 /*
6882 ** Return the flag byte at the beginning of the page that the cursor
6883 ** is currently pointing to.
6884 */
6885 int sqlite3BtreeFlags(BtCursor *pCur){
6886   /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
6887   ** restoreCursorPosition() here.
6888   */
6889   MemPage *pPage;
6890   restoreCursorPosition(pCur);
6891   pPage = pCur->apPage[pCur->iPage];
6892   assert( cursorHoldsMutex(pCur) );
6893   assert( pPage!=0 );
6894   assert( pPage->pBt==pCur->pBt );
6895   return pPage->aData[pPage->hdrOffset];
6896 }
6897 
6898 #ifndef SQLITE_OMIT_BTREECOUNT
6899 /*
6900 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
6901 ** number of entries in the b-tree and write the result to *pnEntry.
6902 **
6903 ** SQLITE_OK is returned if the operation is successfully executed.
6904 ** Otherwise, if an error is encountered (i.e. an IO error or database
6905 ** corruption) an SQLite error code is returned.
6906 */
6907 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
6908   i64 nEntry = 0;                      /* Value to return in *pnEntry */
6909   int rc;                              /* Return code */
6910   rc = moveToRoot(pCur);
6911 
6912   /* Unless an error occurs, the following loop runs one iteration for each
6913   ** page in the B-Tree structure (not including overflow pages).
6914   */
6915   while( rc==SQLITE_OK ){
6916     int iIdx;                          /* Index of child node in parent */
6917     MemPage *pPage;                    /* Current page of the b-tree */
6918 
6919     /* If this is a leaf page or the tree is not an int-key tree, then
6920     ** this page contains countable entries. Increment the entry counter
6921     ** accordingly.
6922     */
6923     pPage = pCur->apPage[pCur->iPage];
6924     if( pPage->leaf || !pPage->intKey ){
6925       nEntry += pPage->nCell;
6926     }
6927 
6928     /* pPage is a leaf node. This loop navigates the cursor so that it
6929     ** points to the first interior cell that it points to the parent of
6930     ** the next page in the tree that has not yet been visited. The
6931     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
6932     ** of the page, or to the number of cells in the page if the next page
6933     ** to visit is the right-child of its parent.
6934     **
6935     ** If all pages in the tree have been visited, return SQLITE_OK to the
6936     ** caller.
6937     */
6938     if( pPage->leaf ){
6939       do {
6940         if( pCur->iPage==0 ){
6941           /* All pages of the b-tree have been visited. Return successfully. */
6942           *pnEntry = nEntry;
6943           return SQLITE_OK;
6944         }
6945         sqlite3BtreeMoveToParent(pCur);
6946       }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
6947 
6948       pCur->aiIdx[pCur->iPage]++;
6949       pPage = pCur->apPage[pCur->iPage];
6950     }
6951 
6952     /* Descend to the child node of the cell that the cursor currently
6953     ** points at. This is the right-child if (iIdx==pPage->nCell).
6954     */
6955     iIdx = pCur->aiIdx[pCur->iPage];
6956     if( iIdx==pPage->nCell ){
6957       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
6958     }else{
6959       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
6960     }
6961   }
6962 
6963   /* An error has occurred. Return an error code. */
6964   return rc;
6965 }
6966 #endif
6967 
6968 /*
6969 ** Return the pager associated with a BTree.  This routine is used for
6970 ** testing and debugging only.
6971 */
6972 Pager *sqlite3BtreePager(Btree *p){
6973   return p->pBt->pPager;
6974 }
6975 
6976 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6977 /*
6978 ** Append a message to the error message string.
6979 */
6980 static void checkAppendMsg(
6981   IntegrityCk *pCheck,
6982   char *zMsg1,
6983   const char *zFormat,
6984   ...
6985 ){
6986   va_list ap;
6987   if( !pCheck->mxErr ) return;
6988   pCheck->mxErr--;
6989   pCheck->nErr++;
6990   va_start(ap, zFormat);
6991   if( pCheck->errMsg.nChar ){
6992     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
6993   }
6994   if( zMsg1 ){
6995     sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
6996   }
6997   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
6998   va_end(ap);
6999   if( pCheck->errMsg.mallocFailed ){
7000     pCheck->mallocFailed = 1;
7001   }
7002 }
7003 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7004 
7005 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7006 /*
7007 ** Add 1 to the reference count for page iPage.  If this is the second
7008 ** reference to the page, add an error message to pCheck->zErrMsg.
7009 ** Return 1 if there are 2 ore more references to the page and 0 if
7010 ** if this is the first reference to the page.
7011 **
7012 ** Also check that the page number is in bounds.
7013 */
7014 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
7015   if( iPage==0 ) return 1;
7016   if( iPage>pCheck->nPage ){
7017     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
7018     return 1;
7019   }
7020   if( pCheck->anRef[iPage]==1 ){
7021     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
7022     return 1;
7023   }
7024   return  (pCheck->anRef[iPage]++)>1;
7025 }
7026 
7027 #ifndef SQLITE_OMIT_AUTOVACUUM
7028 /*
7029 ** Check that the entry in the pointer-map for page iChild maps to
7030 ** page iParent, pointer type ptrType. If not, append an error message
7031 ** to pCheck.
7032 */
7033 static void checkPtrmap(
7034   IntegrityCk *pCheck,   /* Integrity check context */
7035   Pgno iChild,           /* Child page number */
7036   u8 eType,              /* Expected pointer map type */
7037   Pgno iParent,          /* Expected pointer map parent page number */
7038   char *zContext         /* Context description (used for error msg) */
7039 ){
7040   int rc;
7041   u8 ePtrmapType;
7042   Pgno iPtrmapParent;
7043 
7044   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
7045   if( rc!=SQLITE_OK ){
7046     if( rc==SQLITE_NOMEM ) pCheck->mallocFailed = 1;
7047     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
7048     return;
7049   }
7050 
7051   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
7052     checkAppendMsg(pCheck, zContext,
7053       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
7054       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
7055   }
7056 }
7057 #endif
7058 
7059 /*
7060 ** Check the integrity of the freelist or of an overflow page list.
7061 ** Verify that the number of pages on the list is N.
7062 */
7063 static void checkList(
7064   IntegrityCk *pCheck,  /* Integrity checking context */
7065   int isFreeList,       /* True for a freelist.  False for overflow page list */
7066   int iPage,            /* Page number for first page in the list */
7067   int N,                /* Expected number of pages in the list */
7068   char *zContext        /* Context for error messages */
7069 ){
7070   int i;
7071   int expected = N;
7072   int iFirst = iPage;
7073   while( N-- > 0 && pCheck->mxErr ){
7074     DbPage *pOvflPage;
7075     unsigned char *pOvflData;
7076     if( iPage<1 ){
7077       checkAppendMsg(pCheck, zContext,
7078          "%d of %d pages missing from overflow list starting at %d",
7079           N+1, expected, iFirst);
7080       break;
7081     }
7082     if( checkRef(pCheck, iPage, zContext) ) break;
7083     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
7084       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
7085       break;
7086     }
7087     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
7088     if( isFreeList ){
7089       int n = get4byte(&pOvflData[4]);
7090 #ifndef SQLITE_OMIT_AUTOVACUUM
7091       if( pCheck->pBt->autoVacuum ){
7092         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
7093       }
7094 #endif
7095       if( n>pCheck->pBt->usableSize/4-2 ){
7096         checkAppendMsg(pCheck, zContext,
7097            "freelist leaf count too big on page %d", iPage);
7098         N--;
7099       }else{
7100         for(i=0; i<n; i++){
7101           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
7102 #ifndef SQLITE_OMIT_AUTOVACUUM
7103           if( pCheck->pBt->autoVacuum ){
7104             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
7105           }
7106 #endif
7107           checkRef(pCheck, iFreePage, zContext);
7108         }
7109         N -= n;
7110       }
7111     }
7112 #ifndef SQLITE_OMIT_AUTOVACUUM
7113     else{
7114       /* If this database supports auto-vacuum and iPage is not the last
7115       ** page in this overflow list, check that the pointer-map entry for
7116       ** the following page matches iPage.
7117       */
7118       if( pCheck->pBt->autoVacuum && N>0 ){
7119         i = get4byte(pOvflData);
7120         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
7121       }
7122     }
7123 #endif
7124     iPage = get4byte(pOvflData);
7125     sqlite3PagerUnref(pOvflPage);
7126   }
7127 }
7128 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7129 
7130 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7131 /*
7132 ** Do various sanity checks on a single page of a tree.  Return
7133 ** the tree depth.  Root pages return 0.  Parents of root pages
7134 ** return 1, and so forth.
7135 **
7136 ** These checks are done:
7137 **
7138 **      1.  Make sure that cells and freeblocks do not overlap
7139 **          but combine to completely cover the page.
7140 **  NO  2.  Make sure cell keys are in order.
7141 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
7142 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
7143 **      5.  Check the integrity of overflow pages.
7144 **      6.  Recursively call checkTreePage on all children.
7145 **      7.  Verify that the depth of all children is the same.
7146 **      8.  Make sure this page is at least 33% full or else it is
7147 **          the root of the tree.
7148 */
7149 static int checkTreePage(
7150   IntegrityCk *pCheck,  /* Context for the sanity check */
7151   int iPage,            /* Page number of the page to check */
7152   char *zParentContext  /* Parent context */
7153 ){
7154   MemPage *pPage;
7155   int i, rc, depth, d2, pgno, cnt;
7156   int hdr, cellStart;
7157   int nCell;
7158   u8 *data;
7159   BtShared *pBt;
7160   int usableSize;
7161   char zContext[100];
7162   char *hit = 0;
7163 
7164   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
7165 
7166   /* Check that the page exists
7167   */
7168   pBt = pCheck->pBt;
7169   usableSize = pBt->usableSize;
7170   if( iPage==0 ) return 0;
7171   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
7172   if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
7173     if( rc==SQLITE_NOMEM ) pCheck->mallocFailed = 1;
7174     checkAppendMsg(pCheck, zContext,
7175        "unable to get the page. error code=%d", rc);
7176     return 0;
7177   }
7178   if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
7179     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
7180     checkAppendMsg(pCheck, zContext,
7181                    "sqlite3BtreeInitPage() returns error code %d", rc);
7182     releasePage(pPage);
7183     return 0;
7184   }
7185 
7186   /* Check out all the cells.
7187   */
7188   depth = 0;
7189   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
7190     u8 *pCell;
7191     u32 sz;
7192     CellInfo info;
7193 
7194     /* Check payload overflow pages
7195     */
7196     sqlite3_snprintf(sizeof(zContext), zContext,
7197              "On tree page %d cell %d: ", iPage, i);
7198     pCell = findCell(pPage,i);
7199     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
7200     sz = info.nData;
7201     if( !pPage->intKey ) sz += (int)info.nKey;
7202     assert( sz==info.nPayload );
7203     if( (sz>info.nLocal)
7204      && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
7205     ){
7206       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
7207       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
7208 #ifndef SQLITE_OMIT_AUTOVACUUM
7209       if( pBt->autoVacuum ){
7210         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
7211       }
7212 #endif
7213       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
7214     }
7215 
7216     /* Check sanity of left child page.
7217     */
7218     if( !pPage->leaf ){
7219       pgno = get4byte(pCell);
7220 #ifndef SQLITE_OMIT_AUTOVACUUM
7221       if( pBt->autoVacuum ){
7222         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
7223       }
7224 #endif
7225       d2 = checkTreePage(pCheck, pgno, zContext);
7226       if( i>0 && d2!=depth ){
7227         checkAppendMsg(pCheck, zContext, "Child page depth differs");
7228       }
7229       depth = d2;
7230     }
7231   }
7232   if( !pPage->leaf ){
7233     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
7234     sqlite3_snprintf(sizeof(zContext), zContext,
7235                      "On page %d at right child: ", iPage);
7236 #ifndef SQLITE_OMIT_AUTOVACUUM
7237     if( pBt->autoVacuum ){
7238       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
7239     }
7240 #endif
7241     checkTreePage(pCheck, pgno, zContext);
7242   }
7243 
7244   /* Check for complete coverage of the page
7245   */
7246   data = pPage->aData;
7247   hdr = pPage->hdrOffset;
7248   hit = sqlite3PageMalloc( pBt->pageSize );
7249   if( hit==0 ){
7250     pCheck->mallocFailed = 1;
7251   }else{
7252     u16 contentOffset = get2byte(&data[hdr+5]);
7253     if (contentOffset > usableSize) {
7254       checkAppendMsg(pCheck, 0,
7255                      "Corruption detected in header on page %d",iPage,0);
7256       goto check_page_abort;
7257     }
7258     memset(hit+contentOffset, 0, usableSize-contentOffset);
7259     memset(hit, 1, contentOffset);
7260     nCell = get2byte(&data[hdr+3]);
7261     cellStart = hdr + 12 - 4*pPage->leaf;
7262     for(i=0; i<nCell; i++){
7263       int pc = get2byte(&data[cellStart+i*2]);
7264       u16 size = 1024;
7265       int j;
7266       if( pc<=usableSize ){
7267         size = cellSizePtr(pPage, &data[pc]);
7268       }
7269       if( (pc+size-1)>=usableSize || pc<0 ){
7270         checkAppendMsg(pCheck, 0,
7271             "Corruption detected in cell %d on page %d",i,iPage,0);
7272       }else{
7273         for(j=pc+size-1; j>=pc; j--) hit[j]++;
7274       }
7275     }
7276     for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
7277            cnt++){
7278       int size = get2byte(&data[i+2]);
7279       int j;
7280       if( (i+size-1)>=usableSize || i<0 ){
7281         checkAppendMsg(pCheck, 0,
7282             "Corruption detected in cell %d on page %d",i,iPage,0);
7283       }else{
7284         for(j=i+size-1; j>=i; j--) hit[j]++;
7285       }
7286       i = get2byte(&data[i]);
7287     }
7288     for(i=cnt=0; i<usableSize; i++){
7289       if( hit[i]==0 ){
7290         cnt++;
7291       }else if( hit[i]>1 ){
7292         checkAppendMsg(pCheck, 0,
7293           "Multiple uses for byte %d of page %d", i, iPage);
7294         break;
7295       }
7296     }
7297     if( cnt!=data[hdr+7] ){
7298       checkAppendMsg(pCheck, 0,
7299           "Fragmented space is %d byte reported as %d on page %d",
7300           cnt, data[hdr+7], iPage);
7301     }
7302   }
7303 check_page_abort:
7304   if (hit) sqlite3PageFree(hit);
7305 
7306   releasePage(pPage);
7307   return depth+1;
7308 }
7309 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7310 
7311 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7312 /*
7313 ** This routine does a complete check of the given BTree file.  aRoot[] is
7314 ** an array of pages numbers were each page number is the root page of
7315 ** a table.  nRoot is the number of entries in aRoot.
7316 **
7317 ** Write the number of error seen in *pnErr.  Except for some memory
7318 ** allocation errors,  an error message held in memory obtained from
7319 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
7320 ** returned.  If a memory allocation error occurs, NULL is returned.
7321 */
7322 char *sqlite3BtreeIntegrityCheck(
7323   Btree *p,     /* The btree to be checked */
7324   int *aRoot,   /* An array of root pages numbers for individual trees */
7325   int nRoot,    /* Number of entries in aRoot[] */
7326   int mxErr,    /* Stop reporting errors after this many */
7327   int *pnErr    /* Write number of errors seen to this variable */
7328 ){
7329   Pgno i;
7330   int nRef;
7331   IntegrityCk sCheck;
7332   BtShared *pBt = p->pBt;
7333   char zErr[100];
7334 
7335   sqlite3BtreeEnter(p);
7336   nRef = sqlite3PagerRefcount(pBt->pPager);
7337   if( lockBtreeWithRetry(p)!=SQLITE_OK ){
7338     *pnErr = 1;
7339     sqlite3BtreeLeave(p);
7340     return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
7341   }
7342   sCheck.pBt = pBt;
7343   sCheck.pPager = pBt->pPager;
7344   sCheck.nPage = pagerPagecount(sCheck.pBt);
7345   sCheck.mxErr = mxErr;
7346   sCheck.nErr = 0;
7347   sCheck.mallocFailed = 0;
7348   *pnErr = 0;
7349   if( sCheck.nPage==0 ){
7350     unlockBtreeIfUnused(pBt);
7351     sqlite3BtreeLeave(p);
7352     return 0;
7353   }
7354   sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
7355   if( !sCheck.anRef ){
7356     unlockBtreeIfUnused(pBt);
7357     *pnErr = 1;
7358     sqlite3BtreeLeave(p);
7359     return 0;
7360   }
7361   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
7362   i = PENDING_BYTE_PAGE(pBt);
7363   if( i<=sCheck.nPage ){
7364     sCheck.anRef[i] = 1;
7365   }
7366   sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
7367 
7368   /* Check the integrity of the freelist
7369   */
7370   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
7371             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
7372 
7373   /* Check all the tables.
7374   */
7375   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
7376     if( aRoot[i]==0 ) continue;
7377 #ifndef SQLITE_OMIT_AUTOVACUUM
7378     if( pBt->autoVacuum && aRoot[i]>1 ){
7379       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
7380     }
7381 #endif
7382     checkTreePage(&sCheck, aRoot[i], "List of tree roots: ");
7383   }
7384 
7385   /* Make sure every page in the file is referenced
7386   */
7387   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
7388 #ifdef SQLITE_OMIT_AUTOVACUUM
7389     if( sCheck.anRef[i]==0 ){
7390       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7391     }
7392 #else
7393     /* If the database supports auto-vacuum, make sure no tables contain
7394     ** references to pointer-map pages.
7395     */
7396     if( sCheck.anRef[i]==0 &&
7397        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
7398       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7399     }
7400     if( sCheck.anRef[i]!=0 &&
7401        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
7402       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
7403     }
7404 #endif
7405   }
7406 
7407   /* Make sure this analysis did not leave any unref() pages.
7408   ** This is an internal consistency check; an integrity check
7409   ** of the integrity check.
7410   */
7411   unlockBtreeIfUnused(pBt);
7412   if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){
7413     checkAppendMsg(&sCheck, 0,
7414       "Outstanding page count goes from %d to %d during this analysis",
7415       nRef, sqlite3PagerRefcount(pBt->pPager)
7416     );
7417   }
7418 
7419   /* Clean  up and report errors.
7420   */
7421   sqlite3BtreeLeave(p);
7422   sqlite3_free(sCheck.anRef);
7423   if( sCheck.mallocFailed ){
7424     sqlite3StrAccumReset(&sCheck.errMsg);
7425     *pnErr = sCheck.nErr+1;
7426     return 0;
7427   }
7428   *pnErr = sCheck.nErr;
7429   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
7430   return sqlite3StrAccumFinish(&sCheck.errMsg);
7431 }
7432 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7433 
7434 /*
7435 ** Return the full pathname of the underlying database file.
7436 **
7437 ** The pager filename is invariant as long as the pager is
7438 ** open so it is safe to access without the BtShared mutex.
7439 */
7440 const char *sqlite3BtreeGetFilename(Btree *p){
7441   assert( p->pBt->pPager!=0 );
7442   return sqlite3PagerFilename(p->pBt->pPager);
7443 }
7444 
7445 /*
7446 ** Return the pathname of the journal file for this database. The return
7447 ** value of this routine is the same regardless of whether the journal file
7448 ** has been created or not.
7449 **
7450 ** The pager journal filename is invariant as long as the pager is
7451 ** open so it is safe to access without the BtShared mutex.
7452 */
7453 const char *sqlite3BtreeGetJournalname(Btree *p){
7454   assert( p->pBt->pPager!=0 );
7455   return sqlite3PagerJournalname(p->pBt->pPager);
7456 }
7457 
7458 /*
7459 ** Return non-zero if a transaction is active.
7460 */
7461 int sqlite3BtreeIsInTrans(Btree *p){
7462   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
7463   return (p && (p->inTrans==TRANS_WRITE));
7464 }
7465 
7466 /*
7467 ** Return non-zero if a read (or write) transaction is active.
7468 */
7469 int sqlite3BtreeIsInReadTrans(Btree *p){
7470   assert( p );
7471   assert( sqlite3_mutex_held(p->db->mutex) );
7472   return p->inTrans!=TRANS_NONE;
7473 }
7474 
7475 int sqlite3BtreeIsInBackup(Btree *p){
7476   assert( p );
7477   assert( sqlite3_mutex_held(p->db->mutex) );
7478   return p->nBackup!=0;
7479 }
7480 
7481 /*
7482 ** This function returns a pointer to a blob of memory associated with
7483 ** a single shared-btree. The memory is used by client code for its own
7484 ** purposes (for example, to store a high-level schema associated with
7485 ** the shared-btree). The btree layer manages reference counting issues.
7486 **
7487 ** The first time this is called on a shared-btree, nBytes bytes of memory
7488 ** are allocated, zeroed, and returned to the caller. For each subsequent
7489 ** call the nBytes parameter is ignored and a pointer to the same blob
7490 ** of memory returned.
7491 **
7492 ** If the nBytes parameter is 0 and the blob of memory has not yet been
7493 ** allocated, a null pointer is returned. If the blob has already been
7494 ** allocated, it is returned as normal.
7495 **
7496 ** Just before the shared-btree is closed, the function passed as the
7497 ** xFree argument when the memory allocation was made is invoked on the
7498 ** blob of allocated memory. This function should not call sqlite3_free()
7499 ** on the memory, the btree layer does that.
7500 */
7501 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7502   BtShared *pBt = p->pBt;
7503   sqlite3BtreeEnter(p);
7504   if( !pBt->pSchema && nBytes ){
7505     pBt->pSchema = sqlite3MallocZero(nBytes);
7506     pBt->xFreeSchema = xFree;
7507   }
7508   sqlite3BtreeLeave(p);
7509   return pBt->pSchema;
7510 }
7511 
7512 /*
7513 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
7514 ** btree as the argument handle holds an exclusive lock on the
7515 ** sqlite_master table. Otherwise SQLITE_OK.
7516 */
7517 int sqlite3BtreeSchemaLocked(Btree *p){
7518   int rc;
7519   assert( sqlite3_mutex_held(p->db->mutex) );
7520   sqlite3BtreeEnter(p);
7521   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
7522   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
7523   sqlite3BtreeLeave(p);
7524   return rc;
7525 }
7526 
7527 
7528 #ifndef SQLITE_OMIT_SHARED_CACHE
7529 /*
7530 ** Obtain a lock on the table whose root page is iTab.  The
7531 ** lock is a write lock if isWritelock is true or a read lock
7532 ** if it is false.
7533 */
7534 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
7535   int rc = SQLITE_OK;
7536   if( p->sharable ){
7537     u8 lockType = READ_LOCK + isWriteLock;
7538     assert( READ_LOCK+1==WRITE_LOCK );
7539     assert( isWriteLock==0 || isWriteLock==1 );
7540     sqlite3BtreeEnter(p);
7541     rc = querySharedCacheTableLock(p, iTab, lockType);
7542     if( rc==SQLITE_OK ){
7543       rc = setSharedCacheTableLock(p, iTab, lockType);
7544     }
7545     sqlite3BtreeLeave(p);
7546   }
7547   return rc;
7548 }
7549 #endif
7550 
7551 #ifndef SQLITE_OMIT_INCRBLOB
7552 /*
7553 ** Argument pCsr must be a cursor opened for writing on an
7554 ** INTKEY table currently pointing at a valid table entry.
7555 ** This function modifies the data stored as part of that entry.
7556 ** Only the data content may only be modified, it is not possible
7557 ** to change the length of the data stored.
7558 */
7559 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
7560   int rc;
7561 
7562   assert( cursorHoldsMutex(pCsr) );
7563   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
7564   assert(pCsr->isIncrblobHandle);
7565 
7566   restoreCursorPosition(pCsr);
7567   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
7568   if( pCsr->eState!=CURSOR_VALID ){
7569     return SQLITE_ABORT;
7570   }
7571 
7572   /* Check some preconditions:
7573   **   (a) the cursor is open for writing,
7574   **   (b) there is no read-lock on the table being modified and
7575   **   (c) the cursor points at a valid row of an intKey table.
7576   */
7577   if( !pCsr->wrFlag ){
7578     return SQLITE_READONLY;
7579   }
7580   assert( !pCsr->pBt->readOnly
7581           && pCsr->pBt->inTransaction==TRANS_WRITE );
7582   rc = checkForReadConflicts(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0);
7583   if( rc!=SQLITE_OK ){
7584     /* The table pCur points to has a read lock */
7585     assert( rc==SQLITE_LOCKED_SHAREDCACHE );
7586     return rc;
7587   }
7588   if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){
7589     return SQLITE_ERROR;
7590   }
7591 
7592   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
7593 }
7594 
7595 /*
7596 ** Set a flag on this cursor to cache the locations of pages from the
7597 ** overflow list for the current row. This is used by cursors opened
7598 ** for incremental blob IO only.
7599 **
7600 ** This function sets a flag only. The actual page location cache
7601 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
7602 ** accessPayload() (the worker function for sqlite3BtreeData() and
7603 ** sqlite3BtreePutData()).
7604 */
7605 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
7606   assert( cursorHoldsMutex(pCur) );
7607   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
7608   assert(!pCur->isIncrblobHandle);
7609   assert(!pCur->aOverflow);
7610   pCur->isIncrblobHandle = 1;
7611 }
7612 #endif
7613