1 /* 2 ** 2004 April 6 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** $Id: btree.c,v 1.602 2009/04/30 13:30:33 drh Exp $ 13 ** 14 ** This file implements a external (disk-based) database using BTrees. 15 ** See the header comment on "btreeInt.h" for additional information. 16 ** Including a description of file format and an overview of operation. 17 */ 18 #include "btreeInt.h" 19 20 /* 21 ** The header string that appears at the beginning of every 22 ** SQLite database. 23 */ 24 static const char zMagicHeader[] = SQLITE_FILE_HEADER; 25 26 /* 27 ** Set this global variable to 1 to enable tracing using the TRACE 28 ** macro. 29 */ 30 #if 0 31 int sqlite3BtreeTrace=0; /* True to enable tracing */ 32 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} 33 #else 34 # define TRACE(X) 35 #endif 36 37 38 39 #ifndef SQLITE_OMIT_SHARED_CACHE 40 /* 41 ** A list of BtShared objects that are eligible for participation 42 ** in shared cache. This variable has file scope during normal builds, 43 ** but the test harness needs to access it so we make it global for 44 ** test builds. 45 ** 46 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER. 47 */ 48 #ifdef SQLITE_TEST 49 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 50 #else 51 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 52 #endif 53 #endif /* SQLITE_OMIT_SHARED_CACHE */ 54 55 #ifndef SQLITE_OMIT_SHARED_CACHE 56 /* 57 ** Enable or disable the shared pager and schema features. 58 ** 59 ** This routine has no effect on existing database connections. 60 ** The shared cache setting effects only future calls to 61 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). 
62 */ 63 int sqlite3_enable_shared_cache(int enable){ 64 sqlite3GlobalConfig.sharedCacheEnabled = enable; 65 return SQLITE_OK; 66 } 67 #endif 68 69 70 /* 71 ** Forward declaration 72 */ 73 static int checkForReadConflicts(Btree*, Pgno, BtCursor*, i64); 74 75 76 #ifdef SQLITE_OMIT_SHARED_CACHE 77 /* 78 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(), 79 ** and clearAllSharedCacheTableLocks() 80 ** manipulate entries in the BtShared.pLock linked list used to store 81 ** shared-cache table level locks. If the library is compiled with the 82 ** shared-cache feature disabled, then there is only ever one user 83 ** of each BtShared structure and so this locking is not necessary. 84 ** So define the lock related functions as no-ops. 85 */ 86 #define querySharedCacheTableLock(a,b,c) SQLITE_OK 87 #define setSharedCacheTableLock(a,b,c) SQLITE_OK 88 #define clearAllSharedCacheTableLocks(a) 89 #endif 90 91 #ifndef SQLITE_OMIT_SHARED_CACHE 92 /* 93 ** Query to see if btree handle p may obtain a lock of type eLock 94 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return 95 ** SQLITE_OK if the lock may be obtained (by calling 96 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not. 97 */ 98 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ 99 BtShared *pBt = p->pBt; 100 BtLock *pIter; 101 102 assert( sqlite3BtreeHoldsMutex(p) ); 103 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 104 assert( p->db!=0 ); 105 106 /* If requesting a write-lock, then the Btree must have an open write 107 ** transaction on this file. And, obviously, for this to be so there 108 ** must be an open write transaction on the file itself. 
109 */ 110 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); 111 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); 112 113 /* This is a no-op if the shared-cache is not enabled */ 114 if( !p->sharable ){ 115 return SQLITE_OK; 116 } 117 118 /* If some other connection is holding an exclusive lock, the 119 ** requested lock may not be obtained. 120 */ 121 if( pBt->pWriter!=p && pBt->isExclusive ){ 122 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db); 123 return SQLITE_LOCKED_SHAREDCACHE; 124 } 125 126 /* This (along with setSharedCacheTableLock()) is where 127 ** the ReadUncommitted flag is dealt with. 128 ** If the caller is querying for a read-lock on any table 129 ** other than the sqlite_master table (table 1) and if the ReadUncommitted 130 ** flag is set, then the lock granted even if there are write-locks 131 ** on the table. If a write-lock is requested, the ReadUncommitted flag 132 ** is not considered. 133 ** 134 ** In function setSharedCacheTableLock(), if a read-lock is demanded and the 135 ** ReadUncommitted flag is set, no entry is added to the locks list 136 ** (BtShared.pLock). 137 ** 138 ** To summarize: If the ReadUncommitted flag is set, then read cursors 139 ** on non-schema tables do not create or respect table locks. The locking 140 ** procedure for a write-cursor does not change. 141 */ 142 if( 143 0==(p->db->flags&SQLITE_ReadUncommitted) || 144 eLock==WRITE_LOCK || 145 iTab==MASTER_ROOT 146 ){ 147 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 148 /* The condition (pIter->eLock!=eLock) in the following if(...) 149 ** statement is a simplification of: 150 ** 151 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) 152 ** 153 ** since we know that if eLock==WRITE_LOCK, then no other connection 154 ** may hold a WRITE_LOCK on any table in this file (since there can 155 ** only be a single writer). 
156 */ 157 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK ); 158 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK); 159 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){ 160 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db); 161 if( eLock==WRITE_LOCK ){ 162 assert( p==pBt->pWriter ); 163 pBt->isPending = 1; 164 } 165 return SQLITE_LOCKED_SHAREDCACHE; 166 } 167 } 168 } 169 return SQLITE_OK; 170 } 171 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 172 173 #ifndef SQLITE_OMIT_SHARED_CACHE 174 /* 175 ** Add a lock on the table with root-page iTable to the shared-btree used 176 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 177 ** WRITE_LOCK. 178 ** 179 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and 180 ** SQLITE_NOMEM may also be returned. 181 */ 182 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ 183 BtShared *pBt = p->pBt; 184 BtLock *pLock = 0; 185 BtLock *pIter; 186 187 assert( sqlite3BtreeHoldsMutex(p) ); 188 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 189 assert( p->db!=0 ); 190 191 /* This is a no-op if the shared-cache is not enabled */ 192 if( !p->sharable ){ 193 return SQLITE_OK; 194 } 195 196 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); 197 198 /* If the read-uncommitted flag is set and a read-lock is requested on 199 ** a non-schema table, then the lock is always granted. Return early 200 ** without adding an entry to the BtShared.pLock list. See 201 ** comment in function querySharedCacheTableLock() for more info 202 ** on handling the ReadUncommitted flag. 203 */ 204 if( 205 (p->db->flags&SQLITE_ReadUncommitted) && 206 (eLock==READ_LOCK) && 207 iTable!=MASTER_ROOT 208 ){ 209 return SQLITE_OK; 210 } 211 212 /* First search the list for an existing lock on this table. 
*/ 213 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 214 if( pIter->iTable==iTable && pIter->pBtree==p ){ 215 pLock = pIter; 216 break; 217 } 218 } 219 220 /* If the above search did not find a BtLock struct associating Btree p 221 ** with table iTable, allocate one and link it into the list. 222 */ 223 if( !pLock ){ 224 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 225 if( !pLock ){ 226 return SQLITE_NOMEM; 227 } 228 pLock->iTable = iTable; 229 pLock->pBtree = p; 230 pLock->pNext = pBt->pLock; 231 pBt->pLock = pLock; 232 } 233 234 /* Set the BtLock.eLock variable to the maximum of the current lock 235 ** and the requested lock. This means if a write-lock was already held 236 ** and a read-lock requested, we don't incorrectly downgrade the lock. 237 */ 238 assert( WRITE_LOCK>READ_LOCK ); 239 if( eLock>pLock->eLock ){ 240 pLock->eLock = eLock; 241 } 242 243 return SQLITE_OK; 244 } 245 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 246 247 #ifndef SQLITE_OMIT_SHARED_CACHE 248 /* 249 ** Release all the table locks (locks obtained via calls to 250 ** the setSharedCacheTableLock() procedure) held by Btree handle p. 251 ** 252 ** This function assumes that handle p has an open read or write 253 ** transaction. If it does not, then the BtShared.isPending variable 254 ** may be incorrectly cleared. 
255 */ 256 static void clearAllSharedCacheTableLocks(Btree *p){ 257 BtShared *pBt = p->pBt; 258 BtLock **ppIter = &pBt->pLock; 259 260 assert( sqlite3BtreeHoldsMutex(p) ); 261 assert( p->sharable || 0==*ppIter ); 262 assert( p->inTrans>0 ); 263 264 while( *ppIter ){ 265 BtLock *pLock = *ppIter; 266 assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree ); 267 assert( pLock->pBtree->inTrans>=pLock->eLock ); 268 if( pLock->pBtree==p ){ 269 *ppIter = pLock->pNext; 270 sqlite3_free(pLock); 271 }else{ 272 ppIter = &pLock->pNext; 273 } 274 } 275 276 assert( pBt->isPending==0 || pBt->pWriter ); 277 if( pBt->pWriter==p ){ 278 pBt->pWriter = 0; 279 pBt->isExclusive = 0; 280 pBt->isPending = 0; 281 }else if( pBt->nTransaction==2 ){ 282 /* This function is called when connection p is concluding its 283 ** transaction. If there currently exists a writer, and p is not 284 ** that writer, then the number of locks held by connections other 285 ** than the writer must be about to drop to zero. In this case 286 ** set the isPending flag to 0. 287 ** 288 ** If there is not currently a writer, then BtShared.isPending must 289 ** be zero already. So this next line is harmless in that case. 290 */ 291 pBt->isPending = 0; 292 } 293 } 294 #endif /* SQLITE_OMIT_SHARED_CACHE */ 295 296 static void releasePage(MemPage *pPage); /* Forward reference */ 297 298 /* 299 ** Verify that the cursor holds a mutex on the BtShared 300 */ 301 #ifndef NDEBUG 302 static int cursorHoldsMutex(BtCursor *p){ 303 return sqlite3_mutex_held(p->pBt->mutex); 304 } 305 #endif 306 307 308 #ifndef SQLITE_OMIT_INCRBLOB 309 /* 310 ** Invalidate the overflow page-list cache for cursor pCur, if any. 311 */ 312 static void invalidateOverflowCache(BtCursor *pCur){ 313 assert( cursorHoldsMutex(pCur) ); 314 sqlite3_free(pCur->aOverflow); 315 pCur->aOverflow = 0; 316 } 317 318 /* 319 ** Invalidate the overflow page-list cache for all cursors opened 320 ** on the shared btree structure pBt. 
321 */ 322 static void invalidateAllOverflowCache(BtShared *pBt){ 323 BtCursor *p; 324 assert( sqlite3_mutex_held(pBt->mutex) ); 325 for(p=pBt->pCursor; p; p=p->pNext){ 326 invalidateOverflowCache(p); 327 } 328 } 329 #else 330 #define invalidateOverflowCache(x) 331 #define invalidateAllOverflowCache(x) 332 #endif 333 334 /* 335 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 336 ** when a page that previously contained data becomes a free-list leaf 337 ** page. 338 ** 339 ** The BtShared.pHasContent bitvec exists to work around an obscure 340 ** bug caused by the interaction of two useful IO optimizations surrounding 341 ** free-list leaf pages: 342 ** 343 ** 1) When all data is deleted from a page and the page becomes 344 ** a free-list leaf page, the page is not written to the database 345 ** (as free-list leaf pages contain no meaningful data). Sometimes 346 ** such a page is not even journalled (as it will not be modified, 347 ** why bother journalling it?). 348 ** 349 ** 2) When a free-list leaf page is reused, its content is not read 350 ** from the database or written to the journal file (why should it 351 ** be, if it is not at all meaningful?). 352 ** 353 ** By themselves, these optimizations work fine and provide a handy 354 ** performance boost to bulk delete or insert operations. However, if 355 ** a page is moved to the free-list and then reused within the same 356 ** transaction, a problem comes up. If the page is not journalled when 357 ** it is moved to the free-list and it is also not journalled when it 358 ** is extracted from the free-list and reused, then the original data 359 ** may be lost. In the event of a rollback, it may not be possible 360 ** to restore the database to its original configuration. 361 ** 362 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 363 ** moved to become a free-list leaf page, the corresponding bit is 364 ** set in the bitvec. 
Whenever a leaf page is extracted from the free-list, 365 ** optimization 2 above is ommitted if the corresponding bit is already 366 ** set in BtShared.pHasContent. The contents of the bitvec are cleared 367 ** at the end of every transaction. 368 */ 369 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ 370 int rc = SQLITE_OK; 371 if( !pBt->pHasContent ){ 372 int nPage; 373 rc = sqlite3PagerPagecount(pBt->pPager, &nPage); 374 if( rc==SQLITE_OK ){ 375 pBt->pHasContent = sqlite3BitvecCreate((u32)nPage); 376 if( !pBt->pHasContent ){ 377 rc = SQLITE_NOMEM; 378 } 379 } 380 } 381 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){ 382 rc = sqlite3BitvecSet(pBt->pHasContent, pgno); 383 } 384 return rc; 385 } 386 387 /* 388 ** Query the BtShared.pHasContent vector. 389 ** 390 ** This function is called when a free-list leaf page is removed from the 391 ** free-list for reuse. It returns false if it is safe to retrieve the 392 ** page from the pager layer with the 'no-content' flag set. True otherwise. 393 */ 394 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ 395 Bitvec *p = pBt->pHasContent; 396 return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno))); 397 } 398 399 /* 400 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be 401 ** invoked at the conclusion of each write-transaction. 402 */ 403 static void btreeClearHasContent(BtShared *pBt){ 404 sqlite3BitvecDestroy(pBt->pHasContent); 405 pBt->pHasContent = 0; 406 } 407 408 /* 409 ** Save the current cursor position in the variables BtCursor.nKey 410 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 
411 */ 412 static int saveCursorPosition(BtCursor *pCur){ 413 int rc; 414 415 assert( CURSOR_VALID==pCur->eState ); 416 assert( 0==pCur->pKey ); 417 assert( cursorHoldsMutex(pCur) ); 418 419 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey); 420 421 /* If this is an intKey table, then the above call to BtreeKeySize() 422 ** stores the integer key in pCur->nKey. In this case this value is 423 ** all that is required. Otherwise, if pCur is not open on an intKey 424 ** table, then malloc space for and store the pCur->nKey bytes of key 425 ** data. 426 */ 427 if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){ 428 void *pKey = sqlite3Malloc( (int)pCur->nKey ); 429 if( pKey ){ 430 rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey); 431 if( rc==SQLITE_OK ){ 432 pCur->pKey = pKey; 433 }else{ 434 sqlite3_free(pKey); 435 } 436 }else{ 437 rc = SQLITE_NOMEM; 438 } 439 } 440 assert( !pCur->apPage[0]->intKey || !pCur->pKey ); 441 442 if( rc==SQLITE_OK ){ 443 int i; 444 for(i=0; i<=pCur->iPage; i++){ 445 releasePage(pCur->apPage[i]); 446 pCur->apPage[i] = 0; 447 } 448 pCur->iPage = -1; 449 pCur->eState = CURSOR_REQUIRESEEK; 450 } 451 452 invalidateOverflowCache(pCur); 453 return rc; 454 } 455 456 /* 457 ** Save the positions of all cursors except pExcept open on the table 458 ** with root-page iRoot. Usually, this is called just before cursor 459 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()). 460 */ 461 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 462 BtCursor *p; 463 assert( sqlite3_mutex_held(pBt->mutex) ); 464 assert( pExcept==0 || pExcept->pBt==pBt ); 465 for(p=pBt->pCursor; p; p=p->pNext){ 466 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 467 p->eState==CURSOR_VALID ){ 468 int rc = saveCursorPosition(p); 469 if( SQLITE_OK!=rc ){ 470 return rc; 471 } 472 } 473 } 474 return SQLITE_OK; 475 } 476 477 /* 478 ** Clear the current cursor position. 
479 */ 480 void sqlite3BtreeClearCursor(BtCursor *pCur){ 481 assert( cursorHoldsMutex(pCur) ); 482 sqlite3_free(pCur->pKey); 483 pCur->pKey = 0; 484 pCur->eState = CURSOR_INVALID; 485 } 486 487 /* 488 ** Restore the cursor to the position it was in (or as close to as possible) 489 ** when saveCursorPosition() was called. Note that this call deletes the 490 ** saved position info stored by saveCursorPosition(), so there can be 491 ** at most one effective restoreCursorPosition() call after each 492 ** saveCursorPosition(). 493 */ 494 int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){ 495 int rc; 496 assert( cursorHoldsMutex(pCur) ); 497 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 498 if( pCur->eState==CURSOR_FAULT ){ 499 return pCur->skip; 500 } 501 pCur->eState = CURSOR_INVALID; 502 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip); 503 if( rc==SQLITE_OK ){ 504 sqlite3_free(pCur->pKey); 505 pCur->pKey = 0; 506 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 507 } 508 return rc; 509 } 510 511 #define restoreCursorPosition(p) \ 512 (p->eState>=CURSOR_REQUIRESEEK ? \ 513 sqlite3BtreeRestoreCursorPosition(p) : \ 514 SQLITE_OK) 515 516 /* 517 ** Determine whether or not a cursor has moved from the position it 518 ** was last placed at. Cursors can move when the row they are pointing 519 ** at is deleted out from under them. 520 ** 521 ** This routine returns an error code if something goes wrong. The 522 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not. 
523 */ 524 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){ 525 int rc; 526 527 rc = restoreCursorPosition(pCur); 528 if( rc ){ 529 *pHasMoved = 1; 530 return rc; 531 } 532 if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){ 533 *pHasMoved = 1; 534 }else{ 535 *pHasMoved = 0; 536 } 537 return SQLITE_OK; 538 } 539 540 #ifndef SQLITE_OMIT_AUTOVACUUM 541 /* 542 ** Given a page number of a regular database page, return the page 543 ** number for the pointer-map page that contains the entry for the 544 ** input page number. 545 */ 546 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 547 int nPagesPerMapPage; 548 Pgno iPtrMap, ret; 549 assert( sqlite3_mutex_held(pBt->mutex) ); 550 nPagesPerMapPage = (pBt->usableSize/5)+1; 551 iPtrMap = (pgno-2)/nPagesPerMapPage; 552 ret = (iPtrMap*nPagesPerMapPage) + 2; 553 if( ret==PENDING_BYTE_PAGE(pBt) ){ 554 ret++; 555 } 556 return ret; 557 } 558 559 /* 560 ** Write an entry into the pointer map. 561 ** 562 ** This routine updates the pointer map entry for page number 'key' 563 ** so that it maps to type 'eType' and parent page number 'pgno'. 564 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 
565 */ 566 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){ 567 DbPage *pDbPage; /* The pointer map page */ 568 u8 *pPtrmap; /* The pointer map data */ 569 Pgno iPtrmap; /* The pointer map page number */ 570 int offset; /* Offset in pointer map page */ 571 int rc; 572 573 assert( sqlite3_mutex_held(pBt->mutex) ); 574 /* The master-journal page number must never be used as a pointer map page */ 575 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 576 577 assert( pBt->autoVacuum ); 578 if( key==0 ){ 579 return SQLITE_CORRUPT_BKPT; 580 } 581 iPtrmap = PTRMAP_PAGENO(pBt, key); 582 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 583 if( rc!=SQLITE_OK ){ 584 return rc; 585 } 586 offset = PTRMAP_PTROFFSET(iPtrmap, key); 587 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 588 589 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 590 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); 591 rc = sqlite3PagerWrite(pDbPage); 592 if( rc==SQLITE_OK ){ 593 pPtrmap[offset] = eType; 594 put4byte(&pPtrmap[offset+1], parent); 595 } 596 } 597 598 sqlite3PagerUnref(pDbPage); 599 return rc; 600 } 601 602 /* 603 ** Read an entry from the pointer map. 604 ** 605 ** This routine retrieves the pointer map entry for page 'key', writing 606 ** the type and parent page number to *pEType and *pPgno respectively. 607 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 
608 */ 609 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 610 DbPage *pDbPage; /* The pointer map page */ 611 int iPtrmap; /* Pointer map page index */ 612 u8 *pPtrmap; /* Pointer map page data */ 613 int offset; /* Offset of entry in pointer map */ 614 int rc; 615 616 assert( sqlite3_mutex_held(pBt->mutex) ); 617 618 iPtrmap = PTRMAP_PAGENO(pBt, key); 619 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 620 if( rc!=0 ){ 621 return rc; 622 } 623 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 624 625 offset = PTRMAP_PTROFFSET(iPtrmap, key); 626 assert( pEType!=0 ); 627 *pEType = pPtrmap[offset]; 628 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 629 630 sqlite3PagerUnref(pDbPage); 631 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; 632 return SQLITE_OK; 633 } 634 635 #else /* if defined SQLITE_OMIT_AUTOVACUUM */ 636 #define ptrmapPut(w,x,y,z) SQLITE_OK 637 #define ptrmapGet(w,x,y,z) SQLITE_OK 638 #define ptrmapPutOvfl(y,z) SQLITE_OK 639 #endif 640 641 /* 642 ** Given a btree page and a cell index (0 means the first cell on 643 ** the page, 1 means the second cell, and so forth) return a pointer 644 ** to the cell content. 645 ** 646 ** This routine works only for pages that do not contain overflow cells. 647 */ 648 #define findCell(P,I) \ 649 ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)]))) 650 651 /* 652 ** This a more complex version of findCell() that works for 653 ** pages that do contain overflow cells. See insert 654 */ 655 static u8 *findOverflowCell(MemPage *pPage, int iCell){ 656 int i; 657 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 658 for(i=pPage->nOverflow-1; i>=0; i--){ 659 int k; 660 struct _OvflCell *pOvfl; 661 pOvfl = &pPage->aOvfl[i]; 662 k = pOvfl->idx; 663 if( k<=iCell ){ 664 if( k==iCell ){ 665 return pOvfl->pCell; 666 } 667 iCell--; 668 } 669 } 670 return findCell(pPage, iCell); 671 } 672 673 /* 674 ** Parse a cell content block and fill in the CellInfo structure. 
There 675 ** are two versions of this function. sqlite3BtreeParseCell() takes a 676 ** cell index as the second argument and sqlite3BtreeParseCellPtr() 677 ** takes a pointer to the body of the cell as its second argument. 678 ** 679 ** Within this file, the parseCell() macro can be called instead of 680 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster. 681 */ 682 void sqlite3BtreeParseCellPtr( 683 MemPage *pPage, /* Page containing the cell */ 684 u8 *pCell, /* Pointer to the cell text. */ 685 CellInfo *pInfo /* Fill in this structure */ 686 ){ 687 u16 n; /* Number bytes in cell content header */ 688 u32 nPayload; /* Number of bytes of cell payload */ 689 690 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 691 692 pInfo->pCell = pCell; 693 assert( pPage->leaf==0 || pPage->leaf==1 ); 694 n = pPage->childPtrSize; 695 assert( n==4-4*pPage->leaf ); 696 if( pPage->intKey ){ 697 if( pPage->hasData ){ 698 n += getVarint32(&pCell[n], nPayload); 699 }else{ 700 nPayload = 0; 701 } 702 n += getVarint(&pCell[n], (u64*)&pInfo->nKey); 703 pInfo->nData = nPayload; 704 }else{ 705 pInfo->nData = 0; 706 n += getVarint32(&pCell[n], nPayload); 707 pInfo->nKey = nPayload; 708 } 709 pInfo->nPayload = nPayload; 710 pInfo->nHeader = n; 711 if( likely(nPayload<=pPage->maxLocal) ){ 712 /* This is the (easy) common case where the entire payload fits 713 ** on the local page. No overflow is required. 714 */ 715 int nSize; /* Total size of cell content in bytes */ 716 nSize = nPayload + n; 717 pInfo->nLocal = (u16)nPayload; 718 pInfo->iOverflow = 0; 719 if( (nSize & ~3)==0 ){ 720 nSize = 4; /* Minimum cell size is 4 */ 721 } 722 pInfo->nSize = (u16)nSize; 723 }else{ 724 /* If the payload will not fit completely on the local page, we have 725 ** to decide how much to store locally and how much to spill onto 726 ** overflow pages. 
The strategy is to minimize the amount of unused 727 ** space on overflow pages while keeping the amount of local storage 728 ** in between minLocal and maxLocal. 729 ** 730 ** Warning: changing the way overflow payload is distributed in any 731 ** way will result in an incompatible file format. 732 */ 733 int minLocal; /* Minimum amount of payload held locally */ 734 int maxLocal; /* Maximum amount of payload held locally */ 735 int surplus; /* Overflow payload available for local storage */ 736 737 minLocal = pPage->minLocal; 738 maxLocal = pPage->maxLocal; 739 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4); 740 if( surplus <= maxLocal ){ 741 pInfo->nLocal = (u16)surplus; 742 }else{ 743 pInfo->nLocal = (u16)minLocal; 744 } 745 pInfo->iOverflow = (u16)(pInfo->nLocal + n); 746 pInfo->nSize = pInfo->iOverflow + 4; 747 } 748 } 749 #define parseCell(pPage, iCell, pInfo) \ 750 sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) 751 void sqlite3BtreeParseCell( 752 MemPage *pPage, /* Page containing the cell */ 753 int iCell, /* The cell index. First cell is 0 */ 754 CellInfo *pInfo /* Fill in this structure */ 755 ){ 756 parseCell(pPage, iCell, pInfo); 757 } 758 759 /* 760 ** Compute the total number of bytes that a Cell needs in the cell 761 ** data area of the btree-page. The return number includes the cell 762 ** data header and the local payload, but not any overflow page or 763 ** the space used by the cell pointer. 764 */ 765 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ 766 u8 *pIter = &pCell[pPage->childPtrSize]; 767 u32 nSize; 768 769 #ifdef SQLITE_DEBUG 770 /* The value returned by this function should always be the same as 771 ** the (CellInfo.nSize) value found by doing a full parse of the 772 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 773 ** this function verifies that this invariant is not violated. 
*/ 774 CellInfo debuginfo; 775 sqlite3BtreeParseCellPtr(pPage, pCell, &debuginfo); 776 #endif 777 778 if( pPage->intKey ){ 779 u8 *pEnd; 780 if( pPage->hasData ){ 781 pIter += getVarint32(pIter, nSize); 782 }else{ 783 nSize = 0; 784 } 785 786 /* pIter now points at the 64-bit integer key value, a variable length 787 ** integer. The following block moves pIter to point at the first byte 788 ** past the end of the key value. */ 789 pEnd = &pIter[9]; 790 while( (*pIter++)&0x80 && pIter<pEnd ); 791 }else{ 792 pIter += getVarint32(pIter, nSize); 793 } 794 795 if( nSize>pPage->maxLocal ){ 796 int minLocal = pPage->minLocal; 797 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 798 if( nSize>pPage->maxLocal ){ 799 nSize = minLocal; 800 } 801 nSize += 4; 802 } 803 nSize += (pIter - pCell); 804 805 /* The minimum size of any cell is 4 bytes. */ 806 if( nSize<4 ){ 807 nSize = 4; 808 } 809 810 assert( nSize==debuginfo.nSize ); 811 return nSize; 812 } 813 #ifndef NDEBUG 814 static u16 cellSize(MemPage *pPage, int iCell){ 815 return cellSizePtr(pPage, findCell(pPage, iCell)); 816 } 817 #endif 818 819 #ifndef SQLITE_OMIT_AUTOVACUUM 820 /* 821 ** If the cell pCell, part of page pPage contains a pointer 822 ** to an overflow page, insert an entry into the pointer-map 823 ** for the overflow page. 824 */ 825 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){ 826 CellInfo info; 827 assert( pCell!=0 ); 828 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 829 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); 830 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ 831 Pgno ovfl = get4byte(&pCell[info.iOverflow]); 832 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno); 833 } 834 return SQLITE_OK; 835 } 836 /* 837 ** If the cell with index iCell on page pPage contains a pointer 838 ** to an overflow page, insert an entry into the pointer-map 839 ** for the overflow page. 
840 */ 841 static int ptrmapPutOvfl(MemPage *pPage, int iCell){ 842 u8 *pCell; 843 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 844 pCell = findOverflowCell(pPage, iCell); 845 return ptrmapPutOvflPtr(pPage, pCell); 846 } 847 #endif 848 849 850 /* 851 ** Defragment the page given. All Cells are moved to the 852 ** end of the page and all free space is collected into one 853 ** big FreeBlk that occurs in between the header and cell 854 ** pointer array and the cell content area. 855 */ 856 static int defragmentPage(MemPage *pPage){ 857 int i; /* Loop counter */ 858 int pc; /* Address of a i-th cell */ 859 int addr; /* Offset of first byte after cell pointer array */ 860 int hdr; /* Offset to the page header */ 861 int size; /* Size of a cell */ 862 int usableSize; /* Number of usable bytes on a page */ 863 int cellOffset; /* Offset to the cell pointer array */ 864 int cbrk; /* Offset to the cell content area */ 865 int nCell; /* Number of cells on the page */ 866 unsigned char *data; /* The page data */ 867 unsigned char *temp; /* Temp area for cell content */ 868 869 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 870 assert( pPage->pBt!=0 ); 871 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); 872 assert( pPage->nOverflow==0 ); 873 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 874 temp = sqlite3PagerTempSpace(pPage->pBt->pPager); 875 data = pPage->aData; 876 hdr = pPage->hdrOffset; 877 cellOffset = pPage->cellOffset; 878 nCell = pPage->nCell; 879 assert( nCell==get2byte(&data[hdr+3]) ); 880 usableSize = pPage->pBt->usableSize; 881 cbrk = get2byte(&data[hdr+5]); 882 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk); 883 cbrk = usableSize; 884 for(i=0; i<nCell; i++){ 885 u8 *pAddr; /* The i-th cell pointer */ 886 pAddr = &data[cellOffset + i*2]; 887 pc = get2byte(pAddr); 888 if( pc>=usableSize ){ 889 return SQLITE_CORRUPT_BKPT; 890 } 891 size = cellSizePtr(pPage, &temp[pc]); 892 cbrk -= size; 893 if( cbrk<cellOffset+2*nCell || pc+size>usableSize 
){ 894 return SQLITE_CORRUPT_BKPT; 895 } 896 assert( cbrk+size<=usableSize && cbrk>=0 ); 897 memcpy(&data[cbrk], &temp[pc], size); 898 put2byte(pAddr, cbrk); 899 } 900 assert( cbrk>=cellOffset+2*nCell ); 901 put2byte(&data[hdr+5], cbrk); 902 data[hdr+1] = 0; 903 data[hdr+2] = 0; 904 data[hdr+7] = 0; 905 addr = cellOffset+2*nCell; 906 memset(&data[addr], 0, cbrk-addr); 907 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 908 if( cbrk-addr!=pPage->nFree ){ 909 return SQLITE_CORRUPT_BKPT; 910 } 911 return SQLITE_OK; 912 } 913 914 /* 915 ** Allocate nByte bytes of space from within the B-Tree page passed 916 ** as the first argument. Return the index into pPage->aData[] of the 917 ** first byte of allocated space. 918 ** 919 ** The caller guarantees that the space between the end of the cell-offset 920 ** array and the start of the cell-content area is at least nByte bytes 921 ** in size. So this routine can never fail. 922 ** 923 ** If there are already 60 or more bytes of fragments within the page, 924 ** the page is defragmented before returning. If this were not done there 925 ** is a chance that the number of fragmented bytes could eventually 926 ** overflow the single-byte field of the page-header in which this value 927 ** is stored. 928 */ 929 static int allocateSpace(MemPage *pPage, int nByte){ 930 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ 931 u8 * const data = pPage->aData; /* Local cache of pPage->aData */ 932 int nFrag; /* Number of fragmented bytes on pPage */ 933 int top; 934 935 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 936 assert( pPage->pBt ); 937 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 938 assert( nByte>=0 ); /* Minimum cell size is 4 */ 939 assert( pPage->nFree>=nByte ); 940 assert( pPage->nOverflow==0 ); 941 942 /* Assert that the space between the cell-offset array and the 943 ** cell-content area is greater than nByte bytes. 
944 */ 945 assert( nByte <= ( 946 get2byte(&data[hdr+5])-(hdr+8+(pPage->leaf?0:4)+2*get2byte(&data[hdr+3])) 947 )); 948 949 pPage->nFree -= (u16)nByte; 950 nFrag = data[hdr+7]; 951 if( nFrag>=60 ){ 952 defragmentPage(pPage); 953 }else{ 954 /* Search the freelist looking for a free slot big enough to satisfy 955 ** the request. The allocation is made from the first free slot in 956 ** the list that is large enough to accomadate it. 957 */ 958 int pc, addr; 959 for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){ 960 int size = get2byte(&data[pc+2]); /* Size of free slot */ 961 if( size>=nByte ){ 962 int x = size - nByte; 963 if( x<4 ){ 964 /* Remove the slot from the free-list. Update the number of 965 ** fragmented bytes within the page. */ 966 memcpy(&data[addr], &data[pc], 2); 967 data[hdr+7] = (u8)(nFrag + x); 968 }else{ 969 /* The slot remains on the free-list. Reduce its size to account 970 ** for the portion used by the new allocation. */ 971 put2byte(&data[pc+2], x); 972 } 973 return pc + x; 974 } 975 } 976 } 977 978 /* Allocate memory from the gap in between the cell pointer array 979 ** and the cell content area. 980 */ 981 top = get2byte(&data[hdr+5]) - nByte; 982 put2byte(&data[hdr+5], top); 983 return top; 984 } 985 986 /* 987 ** Return a section of the pPage->aData to the freelist. 988 ** The first byte of the new free block is pPage->aDisk[start] 989 ** and the size of the block is "size" bytes. 990 ** 991 ** Most of the effort here is involved in coalesing adjacent 992 ** free blocks into a single big free block. 
*/
static int freeSpace(MemPage *pPage, int start, int size){
  int addr, pbegin, hdr;
  unsigned char *data = pPage->aData;

  assert( pPage->pBt!=0 );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
  assert( (start + size)<=pPage->pBt->usableSize );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( size>=0 );   /* Minimum cell size is 4 */

#ifdef SQLITE_SECURE_DELETE
  /* Overwrite deleted information with zeros when the SECURE_DELETE
  ** option is enabled at compile-time */
  memset(&data[start], 0, size);
#endif

  /* Add the space back into the linked list of freeblocks.  Walk the
  ** list (whose head is the 2-byte field at hdr+1) until reaching the
  ** first entry at or beyond "start", or the end of the list. */
  hdr = pPage->hdrOffset;
  addr = hdr + 1;
  while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
    assert( pbegin<=pPage->pBt->usableSize-4 );
    if( pbegin<=addr ) {
      /* List entries must be strictly increasing */
      return SQLITE_CORRUPT_BKPT;
    }
    addr = pbegin;
  }
  if ( pbegin>pPage->pBt->usableSize-4 ) {
    /* The successor block would not fit its 4-byte header on the page */
    return SQLITE_CORRUPT_BKPT;
  }
  assert( pbegin>addr || pbegin==0 );
  /* Link the new block between addr and pbegin and record its size */
  put2byte(&data[addr], start);
  put2byte(&data[start], pbegin);
  put2byte(&data[start+2], size);
  pPage->nFree += (u16)size;

  /* Coalesce adjacent free blocks.  Two blocks are merged when the gap
  ** between them is 3 bytes or less; the gap bytes are subtracted from
  ** the fragment counter at hdr+7. */
  addr = pPage->hdrOffset + 1;
  while( (pbegin = get2byte(&data[addr]))>0 ){
    int pnext, psize, x;
    assert( pbegin>addr );
    assert( pbegin<=pPage->pBt->usableSize-4 );
    pnext = get2byte(&data[pbegin]);
    psize = get2byte(&data[pbegin+2]);
    if( pbegin + psize + 3 >= pnext && pnext>0 ){
      int frag = pnext - (pbegin+psize);
      if( (frag<0) || (frag>(int)data[pPage->hdrOffset+7]) ){
        /* Overlapping blocks, or more gap bytes than recorded fragments */
        return SQLITE_CORRUPT_BKPT;
      }
      data[pPage->hdrOffset+7] -= (u8)frag;
      x = get2byte(&data[pnext]);
      put2byte(&data[pbegin], x);
      x = pnext + get2byte(&data[pnext+2]) - pbegin;
      put2byte(&data[pbegin+2], x);
      /* addr is not advanced: the merged block is examined again */
    }else{
      addr = pbegin;
    }
  }

  /* If the cell content area begins with a freeblock, remove it. */
  if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
    int top;
    pbegin = get2byte(&data[hdr+1]);
    memcpy(&data[hdr+1], &data[pbegin], 2);
    top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
    put2byte(&data[hdr+5], top);
  }
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  return SQLITE_OK;
}

/*
** Decode the flags byte (the first byte of the header) for a page
** and initialize fields of the MemPage structure accordingly.
**
** Only the following combinations are supported.  Anything different
** indicates a corrupt database file:
**
**         PTF_ZERODATA
**         PTF_ZERODATA | PTF_LEAF
**         PTF_LEAFDATA | PTF_INTKEY
**         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
**
** Returns SQLITE_OK for a supported combination and SQLITE_CORRUPT_BKPT
** otherwise.  The fields written are leaf, childPtrSize, intKey, hasData,
** maxLocal and minLocal.
*/
static int decodeFlags(MemPage *pPage, int flagByte){
  BtShared *pBt;     /* A copy of pPage->pBt */

  assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
  flagByte &= ~PTF_LEAF;
  /* 4 bytes of child pointer per cell when leaf==0, none when leaf==1 */
  pPage->childPtrSize = 4-4*pPage->leaf;
  pBt = pPage->pBt;
  if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
    pPage->intKey = 1;
    pPage->hasData = pPage->leaf;
    pPage->maxLocal = pBt->maxLeaf;
    pPage->minLocal = pBt->minLeaf;
  }else if( flagByte==PTF_ZERODATA ){
    pPage->intKey = 0;
    pPage->hasData = 0;
    pPage->maxLocal = pBt->maxLocal;
    pPage->minLocal = pBt->minLocal;
  }else{
    return SQLITE_CORRUPT_BKPT;
  }
  return SQLITE_OK;
}

/*
** Initialize the auxiliary information for a disk block.
**
** Return SQLITE_OK on success.  If we see that the page does
** not contain a well-formed database page, then return
** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
** guarantee that the page is well-formed.
It only shows that 1109 ** we failed to detect any corruption. 1110 */ 1111 int sqlite3BtreeInitPage(MemPage *pPage){ 1112 1113 assert( pPage->pBt!=0 ); 1114 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1115 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 1116 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 1117 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 1118 1119 if( !pPage->isInit ){ 1120 u16 pc; /* Address of a freeblock within pPage->aData[] */ 1121 u8 hdr; /* Offset to beginning of page header */ 1122 u8 *data; /* Equal to pPage->aData */ 1123 BtShared *pBt; /* The main btree structure */ 1124 u16 usableSize; /* Amount of usable space on each page */ 1125 u16 cellOffset; /* Offset from start of page to first cell pointer */ 1126 u16 nFree; /* Number of unused bytes on the page */ 1127 u16 top; /* First byte of the cell content area */ 1128 1129 pBt = pPage->pBt; 1130 1131 hdr = pPage->hdrOffset; 1132 data = pPage->aData; 1133 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT; 1134 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 ); 1135 pPage->maskPage = pBt->pageSize - 1; 1136 pPage->nOverflow = 0; 1137 usableSize = pBt->usableSize; 1138 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; 1139 top = get2byte(&data[hdr+5]); 1140 pPage->nCell = get2byte(&data[hdr+3]); 1141 if( pPage->nCell>MX_CELL(pBt) ){ 1142 /* To many cells for a single page. 
The page must be corrupt */ 1143 return SQLITE_CORRUPT_BKPT; 1144 } 1145 1146 /* Compute the total free space on the page */ 1147 pc = get2byte(&data[hdr+1]); 1148 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell); 1149 while( pc>0 ){ 1150 u16 next, size; 1151 if( pc>usableSize-4 ){ 1152 /* Free block is off the page */ 1153 return SQLITE_CORRUPT_BKPT; 1154 } 1155 next = get2byte(&data[pc]); 1156 size = get2byte(&data[pc+2]); 1157 if( next>0 && next<=pc+size+3 ){ 1158 /* Free blocks must be in accending order */ 1159 return SQLITE_CORRUPT_BKPT; 1160 } 1161 nFree += size; 1162 pc = next; 1163 } 1164 pPage->nFree = (u16)nFree; 1165 if( nFree>=usableSize ){ 1166 /* Free space cannot exceed total page size */ 1167 return SQLITE_CORRUPT_BKPT; 1168 } 1169 1170 #if 0 1171 /* Check that all the offsets in the cell offset array are within range. 1172 ** 1173 ** Omitting this consistency check and using the pPage->maskPage mask 1174 ** to prevent overrunning the page buffer in findCell() results in a 1175 ** 2.5% performance gain. 1176 */ 1177 { 1178 u8 *pOff; /* Iterator used to check all cell offsets are in range */ 1179 u8 *pEnd; /* Pointer to end of cell offset array */ 1180 u8 mask; /* Mask of bits that must be zero in MSB of cell offsets */ 1181 mask = ~(((u8)(pBt->pageSize>>8))-1); 1182 pEnd = &data[cellOffset + pPage->nCell*2]; 1183 for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2); 1184 if( pOff!=pEnd ){ 1185 return SQLITE_CORRUPT_BKPT; 1186 } 1187 } 1188 #endif 1189 1190 pPage->isInit = 1; 1191 } 1192 return SQLITE_OK; 1193 } 1194 1195 /* 1196 ** Set up a raw page so that it looks like a database page holding 1197 ** no entries. 
1198 */ 1199 static void zeroPage(MemPage *pPage, int flags){ 1200 unsigned char *data = pPage->aData; 1201 BtShared *pBt = pPage->pBt; 1202 u8 hdr = pPage->hdrOffset; 1203 u16 first; 1204 1205 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); 1206 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1207 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 1208 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1209 assert( sqlite3_mutex_held(pBt->mutex) ); 1210 /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/ 1211 data[hdr] = (char)flags; 1212 first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0); 1213 memset(&data[hdr+1], 0, 4); 1214 data[hdr+7] = 0; 1215 put2byte(&data[hdr+5], pBt->usableSize); 1216 pPage->nFree = pBt->usableSize - first; 1217 decodeFlags(pPage, flags); 1218 pPage->hdrOffset = hdr; 1219 pPage->cellOffset = first; 1220 pPage->nOverflow = 0; 1221 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 ); 1222 pPage->maskPage = pBt->pageSize - 1; 1223 pPage->nCell = 0; 1224 pPage->isInit = 1; 1225 } 1226 1227 1228 /* 1229 ** Convert a DbPage obtained from the pager into a MemPage used by 1230 ** the btree layer. 1231 */ 1232 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ 1233 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 1234 pPage->aData = sqlite3PagerGetData(pDbPage); 1235 pPage->pDbPage = pDbPage; 1236 pPage->pBt = pBt; 1237 pPage->pgno = pgno; 1238 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; 1239 return pPage; 1240 } 1241 1242 /* 1243 ** Get a page from the pager. Initialize the MemPage.pBt and 1244 ** MemPage.aData elements if needed. 1245 ** 1246 ** If the noContent flag is set, it means that we do not care about 1247 ** the content of the page at this time. So do not go to the disk 1248 ** to fetch the content. Just fill in the content with zeros for now. 
1249 ** If in the future we call sqlite3PagerWrite() on this page, that 1250 ** means we have started to be concerned about content and the disk 1251 ** read should occur at that point. 1252 */ 1253 int sqlite3BtreeGetPage( 1254 BtShared *pBt, /* The btree */ 1255 Pgno pgno, /* Number of the page to fetch */ 1256 MemPage **ppPage, /* Return the page in this parameter */ 1257 int noContent /* Do not load page content if true */ 1258 ){ 1259 int rc; 1260 DbPage *pDbPage; 1261 1262 assert( sqlite3_mutex_held(pBt->mutex) ); 1263 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent); 1264 if( rc ) return rc; 1265 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); 1266 return SQLITE_OK; 1267 } 1268 1269 /* 1270 ** Retrieve a page from the pager cache. If the requested page is not 1271 ** already in the pager cache return NULL. Initialize the MemPage.pBt and 1272 ** MemPage.aData elements if needed. 1273 */ 1274 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ 1275 DbPage *pDbPage; 1276 assert( sqlite3_mutex_held(pBt->mutex) ); 1277 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 1278 if( pDbPage ){ 1279 return btreePageFromDbPage(pDbPage, pgno, pBt); 1280 } 1281 return 0; 1282 } 1283 1284 /* 1285 ** Return the size of the database file in pages. If there is any kind of 1286 ** error, return ((unsigned int)-1). 1287 */ 1288 static Pgno pagerPagecount(BtShared *pBt){ 1289 int nPage = -1; 1290 int rc; 1291 assert( pBt->pPage1 ); 1292 rc = sqlite3PagerPagecount(pBt->pPager, &nPage); 1293 assert( rc==SQLITE_OK || nPage==-1 ); 1294 return (Pgno)nPage; 1295 } 1296 1297 /* 1298 ** Get a page from the pager and initialize it. This routine 1299 ** is just a convenience wrapper around separate calls to 1300 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage(). 
1301 */ 1302 static int getAndInitPage( 1303 BtShared *pBt, /* The database file */ 1304 Pgno pgno, /* Number of the page to get */ 1305 MemPage **ppPage /* Write the page pointer here */ 1306 ){ 1307 int rc; 1308 MemPage *pPage; 1309 1310 assert( sqlite3_mutex_held(pBt->mutex) ); 1311 if( pgno==0 ){ 1312 return SQLITE_CORRUPT_BKPT; 1313 } 1314 1315 /* It is often the case that the page we want is already in cache. 1316 ** If so, get it directly. This saves us from having to call 1317 ** pagerPagecount() to make sure pgno is within limits, which results 1318 ** in a measureable performance improvements. 1319 */ 1320 *ppPage = pPage = btreePageLookup(pBt, pgno); 1321 if( pPage ){ 1322 /* Page is already in cache */ 1323 rc = SQLITE_OK; 1324 }else{ 1325 /* Page not in cache. Acquire it. */ 1326 if( pgno>pagerPagecount(pBt) ){ 1327 return SQLITE_CORRUPT_BKPT; 1328 } 1329 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0); 1330 if( rc ) return rc; 1331 pPage = *ppPage; 1332 } 1333 if( !pPage->isInit ){ 1334 rc = sqlite3BtreeInitPage(pPage); 1335 } 1336 if( rc!=SQLITE_OK ){ 1337 releasePage(pPage); 1338 *ppPage = 0; 1339 } 1340 return rc; 1341 } 1342 1343 /* 1344 ** Release a MemPage. This should be called once for each prior 1345 ** call to sqlite3BtreeGetPage. 1346 */ 1347 static void releasePage(MemPage *pPage){ 1348 if( pPage ){ 1349 assert( pPage->nOverflow==0 || sqlite3PagerPageRefcount(pPage->pDbPage)>1 ); 1350 assert( pPage->aData ); 1351 assert( pPage->pBt ); 1352 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1353 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 1354 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1355 sqlite3PagerUnref(pPage->pDbPage); 1356 } 1357 } 1358 1359 /* 1360 ** During a rollback, when the pager reloads information into the cache 1361 ** so that the cache is restored to its original state at the start of 1362 ** the transaction, for each page restored this routine is called. 
1363 ** 1364 ** This routine needs to reset the extra data section at the end of the 1365 ** page to agree with the restored data. 1366 */ 1367 static void pageReinit(DbPage *pData){ 1368 MemPage *pPage; 1369 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 1370 assert( sqlite3PagerPageRefcount(pData)>0 ); 1371 if( pPage->isInit ){ 1372 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1373 pPage->isInit = 0; 1374 if( sqlite3PagerPageRefcount(pData)>1 ){ 1375 /* pPage might not be a btree page; it might be an overflow page 1376 ** or ptrmap page or a free page. In those cases, the following 1377 ** call to sqlite3BtreeInitPage() will likely return SQLITE_CORRUPT. 1378 ** But no harm is done by this. And it is very important that 1379 ** sqlite3BtreeInitPage() be called on every btree page so we make 1380 ** the call for every page that comes in for re-initing. */ 1381 sqlite3BtreeInitPage(pPage); 1382 } 1383 } 1384 } 1385 1386 /* 1387 ** Invoke the busy handler for a btree. 1388 */ 1389 static int btreeInvokeBusyHandler(void *pArg){ 1390 BtShared *pBt = (BtShared*)pArg; 1391 assert( pBt->db ); 1392 assert( sqlite3_mutex_held(pBt->db->mutex) ); 1393 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); 1394 } 1395 1396 /* 1397 ** Open a database file. 1398 ** 1399 ** zFilename is the name of the database file. If zFilename is NULL 1400 ** a new database with a random name is created. This randomly named 1401 ** database file will be deleted when sqlite3BtreeClose() is called. 1402 ** If zFilename is ":memory:" then an in-memory database is created 1403 ** that is automatically destroyed when it is closed. 1404 ** 1405 ** If the database is already opened in the same database connection 1406 ** and we are in shared cache mode, then the open will fail with an 1407 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared 1408 ** objects in the same database connection since doing so will lead 1409 ** to problems with locking. 
*/
int sqlite3BtreeOpen(
  const char *zFilename,  /* Name of the file containing the BTree database */
  sqlite3 *db,            /* Associated database handle */
  Btree **ppBtree,        /* Pointer to new Btree object written here */
  int flags,              /* Options */
  int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
){
  sqlite3_vfs *pVfs;             /* The VFS to use for this btree */
  BtShared *pBt = 0;             /* Shared part of btree structure */
  Btree *p;                      /* Handle to return */
  sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
  int rc = SQLITE_OK;            /* Result code from this function */
  u8 nReserve;                   /* Byte of unused space on each page */
  unsigned char zDbHeader[100];  /* Database header content */

  /* Set the variable isMemdb to true for an in-memory database, or
  ** false for a file-based database. This symbol is only required if
  ** either of the shared-data or autovacuum features are compiled
  ** into the library.
  */
#if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
  #ifdef SQLITE_OMIT_MEMORYDB
    const int isMemdb = 0;
  #else
    const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
  #endif
#endif

  assert( db!=0 );
  assert( sqlite3_mutex_held(db->mutex) );

  pVfs = db->pVfs;
  p = sqlite3MallocZero(sizeof(Btree));
  if( !p ){
    return SQLITE_NOMEM;
  }
  p->inTrans = TRANS_NONE;
  p->db = db;

#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  /*
  ** If this Btree is a candidate for shared cache, try to find an
  ** existing BtShared object that we can share with
  */
  if( isMemdb==0 && zFilename && zFilename[0] ){
    if( sqlite3GlobalConfig.sharedCacheEnabled ){
      int nFullPathname = pVfs->mxPathname+1;
      char *zFullPathname = sqlite3Malloc(nFullPathname);
      sqlite3_mutex *mutexShared;
      p->sharable = 1;
      db->flags |= SQLITE_SharedCache;
      if( !zFullPathname ){
        sqlite3_free(p);
        return SQLITE_NOMEM;
      }
      sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
      /* mutexOpen is entered here and stays held until the common exit
      ** at btree_open_out (or the SQLITE_CONSTRAINT return below);
      ** mutexShared is held only while the shared-cache list is walked. */
      mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
      sqlite3_mutex_enter(mutexOpen);
      mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
      sqlite3_mutex_enter(mutexShared);
      for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
        assert( pBt->nRef>0 );
        if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
                 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
          int iDb;
          /* Refuse to attach the same BtShared twice to one connection */
          for(iDb=db->nDb-1; iDb>=0; iDb--){
            Btree *pExisting = db->aDb[iDb].pBt;
            if( pExisting && pExisting->pBt==pBt ){
              sqlite3_mutex_leave(mutexShared);
              sqlite3_mutex_leave(mutexOpen);
              sqlite3_free(zFullPathname);
              sqlite3_free(p);
              return SQLITE_CONSTRAINT;
            }
          }
          p->pBt = pBt;
          pBt->nRef++;
          break;
        }
      }
      sqlite3_mutex_leave(mutexShared);
      sqlite3_free(zFullPathname);
    }
#ifdef SQLITE_DEBUG
    else{
      /* In debug mode, we mark all persistent databases as sharable
      ** even when they are not.  This exercises the locking code and
      ** gives more opportunity for asserts(sqlite3_mutex_held())
      ** statements to find locking problems.
      */
      p->sharable = 1;
    }
#endif
  }
#endif
  if( pBt==0 ){
    /*
    ** No existing BtShared was found, so create a new one.
    **
    ** The following asserts make sure that structures used by the btree are
    ** the right size.  This is to guard against size changes that result
    ** when compiling on a different architecture.
    */
    assert( sizeof(i64)==8 || sizeof(i64)==4 );
    assert( sizeof(u64)==8 || sizeof(u64)==4 );
    assert( sizeof(u32)==4 );
    assert( sizeof(u16)==2 );
    assert( sizeof(Pgno)==4 );

    pBt = sqlite3MallocZero( sizeof(*pBt) );
    if( pBt==0 ){
      rc = SQLITE_NOMEM;
      goto btree_open_out;
    }
    rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
                          EXTRA_SIZE, flags, vfsFlags);
    if( rc==SQLITE_OK ){
      rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
    }
    if( rc!=SQLITE_OK ){
      goto btree_open_out;
    }
    pBt->db = db;
    sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
    p->pBt = pBt;

    sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
    pBt->pCursor = 0;
    pBt->pPage1 = 0;
    pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
    /* Page size is the 2-byte field at offset 16 of the header just read.
    ** It is accepted only if it is a power of two in the range
    ** 512..SQLITE_MAX_PAGE_SIZE; otherwise it is reset to 0 here. */
    pBt->pageSize = get2byte(&zDbHeader[16]);
    if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
         || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
      pBt->pageSize = 0;
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the magic name ":memory:" will create an in-memory database, then
      ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
      ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
      ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
      ** regular file-name. In this case the auto-vacuum applies as per normal.
      */
      if( zFilename && !isMemdb ){
        pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
        pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
      }
#endif
      nReserve = 0;
    }else{
      /* Header carries a usable page size: take the reserved-byte count
      ** from offset 20 and mark the page size as fixed. */
      nReserve = zDbHeader[20];
      pBt->pageSizeFixed = 1;
#ifndef SQLITE_OMIT_AUTOVACUUM
      pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
      pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
#endif
    }
    rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
    if( rc ) goto btree_open_out;
    pBt->usableSize = pBt->pageSize - nReserve;
    assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */

#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
    /* Add the new BtShared object to the linked list of sharable BtShareds.
    */
    if( p->sharable ){
      sqlite3_mutex *mutexShared;
      pBt->nRef = 1;
      mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
      if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
        pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
        if( pBt->mutex==0 ){
          rc = SQLITE_NOMEM;
          db->mallocFailed = 0;
          goto btree_open_out;
        }
      }
      sqlite3_mutex_enter(mutexShared);
      pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
      GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
      sqlite3_mutex_leave(mutexShared);
    }
#endif
  }

#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  /* If the new Btree uses a sharable pBtShared, then link the new
  ** Btree into the list of all sharable Btrees for the same connection.
  ** The list is kept in ascending order by pBt address.
  */
  if( p->sharable ){
    int i;
    Btree *pSib;
    for(i=0; i<db->nDb; i++){
      if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
        /* Rewind to the head of the sibling list, then insert in order */
        while( pSib->pPrev ){ pSib = pSib->pPrev; }
        if( p->pBt<pSib->pBt ){
          p->pNext = pSib;
          p->pPrev = 0;
          pSib->pPrev = p;
        }else{
          while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
            pSib = pSib->pNext;
          }
          p->pNext = pSib->pNext;
          p->pPrev = pSib;
          if( p->pNext ){
            p->pNext->pPrev = p;
          }
          pSib->pNext = p;
        }
        break;
      }
    }
  }
#endif
  *ppBtree = p;

  /* Common exit.  On failure, close the pager (if one was opened), free
  ** both objects and clear *ppBtree.  In all cases release mutexOpen if
  ** it was taken above. */
btree_open_out:
  if( rc!=SQLITE_OK ){
    if( pBt && pBt->pPager ){
      sqlite3PagerClose(pBt->pPager);
    }
    sqlite3_free(pBt);
    sqlite3_free(p);
    *ppBtree = 0;
  }
  if( mutexOpen ){
    assert( sqlite3_mutex_held(mutexOpen) );
    sqlite3_mutex_leave(mutexOpen);
  }
  return rc;
}

/*
** Decrement the BtShared.nRef counter.  When it reaches zero,
** remove the BtShared structure from the sharing list.  Return
** true if the BtShared.nRef counter reaches zero and return
** false if it is still positive.
1646 */ 1647 static int removeFromSharingList(BtShared *pBt){ 1648 #ifndef SQLITE_OMIT_SHARED_CACHE 1649 sqlite3_mutex *pMaster; 1650 BtShared *pList; 1651 int removed = 0; 1652 1653 assert( sqlite3_mutex_notheld(pBt->mutex) ); 1654 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1655 sqlite3_mutex_enter(pMaster); 1656 pBt->nRef--; 1657 if( pBt->nRef<=0 ){ 1658 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){ 1659 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; 1660 }else{ 1661 pList = GLOBAL(BtShared*,sqlite3SharedCacheList); 1662 while( ALWAYS(pList) && pList->pNext!=pBt ){ 1663 pList=pList->pNext; 1664 } 1665 if( ALWAYS(pList) ){ 1666 pList->pNext = pBt->pNext; 1667 } 1668 } 1669 if( SQLITE_THREADSAFE ){ 1670 sqlite3_mutex_free(pBt->mutex); 1671 } 1672 removed = 1; 1673 } 1674 sqlite3_mutex_leave(pMaster); 1675 return removed; 1676 #else 1677 return 1; 1678 #endif 1679 } 1680 1681 /* 1682 ** Make sure pBt->pTmpSpace points to an allocation of 1683 ** MX_CELL_SIZE(pBt) bytes. 1684 */ 1685 static void allocateTempSpace(BtShared *pBt){ 1686 if( !pBt->pTmpSpace ){ 1687 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); 1688 } 1689 } 1690 1691 /* 1692 ** Free the pBt->pTmpSpace allocation 1693 */ 1694 static void freeTempSpace(BtShared *pBt){ 1695 sqlite3PageFree( pBt->pTmpSpace); 1696 pBt->pTmpSpace = 0; 1697 } 1698 1699 /* 1700 ** Close an open database and invalidate all cursors. 1701 */ 1702 int sqlite3BtreeClose(Btree *p){ 1703 BtShared *pBt = p->pBt; 1704 BtCursor *pCur; 1705 1706 /* Close all cursors opened via this handle. */ 1707 assert( sqlite3_mutex_held(p->db->mutex) ); 1708 sqlite3BtreeEnter(p); 1709 pCur = pBt->pCursor; 1710 while( pCur ){ 1711 BtCursor *pTmp = pCur; 1712 pCur = pCur->pNext; 1713 if( pTmp->pBtree==p ){ 1714 sqlite3BtreeCloseCursor(pTmp); 1715 } 1716 } 1717 1718 /* Rollback any active transaction and free the handle structure. 
1719 ** The call to sqlite3BtreeRollback() drops any table-locks held by 1720 ** this handle. 1721 */ 1722 sqlite3BtreeRollback(p); 1723 sqlite3BtreeLeave(p); 1724 1725 /* If there are still other outstanding references to the shared-btree 1726 ** structure, return now. The remainder of this procedure cleans 1727 ** up the shared-btree. 1728 */ 1729 assert( p->wantToLock==0 && p->locked==0 ); 1730 if( !p->sharable || removeFromSharingList(pBt) ){ 1731 /* The pBt is no longer on the sharing list, so we can access 1732 ** it without having to hold the mutex. 1733 ** 1734 ** Clean out and delete the BtShared object. 1735 */ 1736 assert( !pBt->pCursor ); 1737 sqlite3PagerClose(pBt->pPager); 1738 if( pBt->xFreeSchema && pBt->pSchema ){ 1739 pBt->xFreeSchema(pBt->pSchema); 1740 } 1741 sqlite3_free(pBt->pSchema); 1742 freeTempSpace(pBt); 1743 sqlite3_free(pBt); 1744 } 1745 1746 #ifndef SQLITE_OMIT_SHARED_CACHE 1747 assert( p->wantToLock==0 ); 1748 assert( p->locked==0 ); 1749 if( p->pPrev ) p->pPrev->pNext = p->pNext; 1750 if( p->pNext ) p->pNext->pPrev = p->pPrev; 1751 #endif 1752 1753 sqlite3_free(p); 1754 return SQLITE_OK; 1755 } 1756 1757 /* 1758 ** Change the limit on the number of pages allowed in the cache. 1759 ** 1760 ** The maximum number of cache pages is set to the absolute 1761 ** value of mxPage. If mxPage is negative, the pager will 1762 ** operate asynchronously - it will not stop to do fsync()s 1763 ** to insure data is written to the disk surface before 1764 ** continuing. Transactions still work if synchronous is off, 1765 ** and the database cannot be corrupted if this program 1766 ** crashes. But if the operating system crashes or there is 1767 ** an abrupt power failure when synchronous is off, the database 1768 ** could be left in an inconsistent and unrecoverable state. 1769 ** Synchronous is on by default so database corruption is not 1770 ** normally a worry. 
1771 */ 1772 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 1773 BtShared *pBt = p->pBt; 1774 assert( sqlite3_mutex_held(p->db->mutex) ); 1775 sqlite3BtreeEnter(p); 1776 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 1777 sqlite3BtreeLeave(p); 1778 return SQLITE_OK; 1779 } 1780 1781 /* 1782 ** Change the way data is synced to disk in order to increase or decrease 1783 ** how well the database resists damage due to OS crashes and power 1784 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 1785 ** there is a high probability of damage) Level 2 is the default. There 1786 ** is a very low but non-zero probability of damage. Level 3 reduces the 1787 ** probability of damage to near zero but with a write performance reduction. 1788 */ 1789 #ifndef SQLITE_OMIT_PAGER_PRAGMAS 1790 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){ 1791 BtShared *pBt = p->pBt; 1792 assert( sqlite3_mutex_held(p->db->mutex) ); 1793 sqlite3BtreeEnter(p); 1794 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync); 1795 sqlite3BtreeLeave(p); 1796 return SQLITE_OK; 1797 } 1798 #endif 1799 1800 /* 1801 ** Return TRUE if the given btree is set to safety level 1. In other 1802 ** words, return TRUE if no sync() occurs on the disk files. 1803 */ 1804 int sqlite3BtreeSyncDisabled(Btree *p){ 1805 BtShared *pBt = p->pBt; 1806 int rc; 1807 assert( sqlite3_mutex_held(p->db->mutex) ); 1808 sqlite3BtreeEnter(p); 1809 assert( pBt && pBt->pPager ); 1810 rc = sqlite3PagerNosync(pBt->pPager); 1811 sqlite3BtreeLeave(p); 1812 return rc; 1813 } 1814 1815 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) 1816 /* 1817 ** Change the default pages size and the number of reserved bytes per page. 1818 ** Or, if the page size has already been fixed, return SQLITE_READONLY 1819 ** without changing anything. 1820 ** 1821 ** The page size must be a power of 2 between 512 and 65536. 
If the page 1822 ** size supplied does not meet this constraint then the page size is not 1823 ** changed. 1824 ** 1825 ** Page sizes are constrained to be a power of two so that the region 1826 ** of the database file used for locking (beginning at PENDING_BYTE, 1827 ** the first byte past the 1GB boundary, 0x40000000) needs to occur 1828 ** at the beginning of a page. 1829 ** 1830 ** If parameter nReserve is less than zero, then the number of reserved 1831 ** bytes per page is left unchanged. 1832 ** 1833 ** If the iFix!=0 then the pageSizeFixed flag is set so that the page size 1834 ** and autovacuum mode can no longer be changed. 1835 */ 1836 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ 1837 int rc = SQLITE_OK; 1838 BtShared *pBt = p->pBt; 1839 assert( nReserve>=-1 && nReserve<=255 ); 1840 sqlite3BtreeEnter(p); 1841 if( pBt->pageSizeFixed ){ 1842 sqlite3BtreeLeave(p); 1843 return SQLITE_READONLY; 1844 } 1845 if( nReserve<0 ){ 1846 nReserve = pBt->pageSize - pBt->usableSize; 1847 } 1848 assert( nReserve>=0 && nReserve<=255 ); 1849 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 1850 ((pageSize-1)&pageSize)==0 ){ 1851 assert( (pageSize & 7)==0 ); 1852 assert( !pBt->pPage1 && !pBt->pCursor ); 1853 pBt->pageSize = (u16)pageSize; 1854 freeTempSpace(pBt); 1855 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 1856 } 1857 pBt->usableSize = pBt->pageSize - (u16)nReserve; 1858 if( iFix ) pBt->pageSizeFixed = 1; 1859 sqlite3BtreeLeave(p); 1860 return rc; 1861 } 1862 1863 /* 1864 ** Return the currently defined page size 1865 */ 1866 int sqlite3BtreeGetPageSize(Btree *p){ 1867 return p->pBt->pageSize; 1868 } 1869 1870 /* 1871 ** Return the number of bytes of space at the end of every page that 1872 ** are intentually left unused. This is the "reserved" space that is 1873 ** sometimes used by extensions. 
1874 */ 1875 int sqlite3BtreeGetReserve(Btree *p){ 1876 int n; 1877 sqlite3BtreeEnter(p); 1878 n = p->pBt->pageSize - p->pBt->usableSize; 1879 sqlite3BtreeLeave(p); 1880 return n; 1881 } 1882 1883 /* 1884 ** Set the maximum page count for a database if mxPage is positive. 1885 ** No changes are made if mxPage is 0 or negative. 1886 ** Regardless of the value of mxPage, return the maximum page count. 1887 */ 1888 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){ 1889 int n; 1890 sqlite3BtreeEnter(p); 1891 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 1892 sqlite3BtreeLeave(p); 1893 return n; 1894 } 1895 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */ 1896 1897 /* 1898 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 1899 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 1900 ** is disabled. The default value for the auto-vacuum property is 1901 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 1902 */ 1903 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 1904 #ifdef SQLITE_OMIT_AUTOVACUUM 1905 return SQLITE_READONLY; 1906 #else 1907 BtShared *pBt = p->pBt; 1908 int rc = SQLITE_OK; 1909 u8 av = (u8)autoVacuum; 1910 1911 sqlite3BtreeEnter(p); 1912 if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){ 1913 rc = SQLITE_READONLY; 1914 }else{ 1915 pBt->autoVacuum = av ?1:0; 1916 pBt->incrVacuum = av==2 ?1:0; 1917 } 1918 sqlite3BtreeLeave(p); 1919 return rc; 1920 #endif 1921 } 1922 1923 /* 1924 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 1925 ** enabled 1 is returned. Otherwise 0. 
1926 */ 1927 int sqlite3BtreeGetAutoVacuum(Btree *p){ 1928 #ifdef SQLITE_OMIT_AUTOVACUUM 1929 return BTREE_AUTOVACUUM_NONE; 1930 #else 1931 int rc; 1932 sqlite3BtreeEnter(p); 1933 rc = ( 1934 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 1935 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 1936 BTREE_AUTOVACUUM_INCR 1937 ); 1938 sqlite3BtreeLeave(p); 1939 return rc; 1940 #endif 1941 } 1942 1943 1944 /* 1945 ** Get a reference to pPage1 of the database file. This will 1946 ** also acquire a readlock on that file. 1947 ** 1948 ** SQLITE_OK is returned on success. If the file is not a 1949 ** well-formed database file, then SQLITE_CORRUPT is returned. 1950 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 1951 ** is returned if we run out of memory. 1952 */ 1953 static int lockBtree(BtShared *pBt){ 1954 int rc; 1955 MemPage *pPage1; 1956 int nPage; 1957 1958 assert( sqlite3_mutex_held(pBt->mutex) ); 1959 assert( pBt->pPage1==0 ); 1960 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0); 1961 if( rc!=SQLITE_OK ) return rc; 1962 1963 /* Do some checking to help insure the file we opened really is 1964 ** a valid database file. 1965 */ 1966 rc = sqlite3PagerPagecount(pBt->pPager, &nPage); 1967 if( rc!=SQLITE_OK ){ 1968 goto page1_init_failed; 1969 }else if( nPage>0 ){ 1970 int pageSize; 1971 int usableSize; 1972 u8 *page1 = pPage1->aData; 1973 rc = SQLITE_NOTADB; 1974 if( memcmp(page1, zMagicHeader, 16)!=0 ){ 1975 goto page1_init_failed; 1976 } 1977 if( page1[18]>1 ){ 1978 pBt->readOnly = 1; 1979 } 1980 if( page1[19]>1 ){ 1981 goto page1_init_failed; 1982 } 1983 1984 /* The maximum embedded fraction must be exactly 25%. And the minimum 1985 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data. 1986 ** The original design allowed these amounts to vary, but as of 1987 ** version 3.6.0, we require them to be fixed. 
1988 */ 1989 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ 1990 goto page1_init_failed; 1991 } 1992 pageSize = get2byte(&page1[16]); 1993 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 || 1994 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE) 1995 ){ 1996 goto page1_init_failed; 1997 } 1998 assert( (pageSize & 7)==0 ); 1999 usableSize = pageSize - page1[20]; 2000 if( pageSize!=pBt->pageSize ){ 2001 /* After reading the first page of the database assuming a page size 2002 ** of BtShared.pageSize, we have discovered that the page-size is 2003 ** actually pageSize. Unlock the database, leave pBt->pPage1 at 2004 ** zero and return SQLITE_OK. The caller will call this function 2005 ** again with the correct page-size. 2006 */ 2007 releasePage(pPage1); 2008 pBt->usableSize = (u16)usableSize; 2009 pBt->pageSize = (u16)pageSize; 2010 freeTempSpace(pBt); 2011 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 2012 if( rc ) goto page1_init_failed; 2013 return SQLITE_OK; 2014 } 2015 if( usableSize<500 ){ 2016 goto page1_init_failed; 2017 } 2018 pBt->pageSize = (u16)pageSize; 2019 pBt->usableSize = (u16)usableSize; 2020 #ifndef SQLITE_OMIT_AUTOVACUUM 2021 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); 2022 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); 2023 #endif 2024 } 2025 2026 /* maxLocal is the maximum amount of payload to store locally for 2027 ** a cell. Make sure it is small enough so that at least minFanout 2028 ** cells can will fit on one page. We assume a 10-byte page header. 2029 ** Besides the payload, the cell must store: 2030 ** 2-byte pointer to the cell 2031 ** 4-byte child pointer 2032 ** 9-byte nKey value 2033 ** 4-byte nData value 2034 ** 4-byte overflow page pointer 2035 ** So a cell consists of a 2-byte poiner, a header which is as much as 2036 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow 2037 ** page pointer. 
2038 */ 2039 pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23; 2040 pBt->minLocal = (pBt->usableSize-12)*32/255 - 23; 2041 pBt->maxLeaf = pBt->usableSize - 35; 2042 pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23; 2043 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); 2044 pBt->pPage1 = pPage1; 2045 return SQLITE_OK; 2046 2047 page1_init_failed: 2048 releasePage(pPage1); 2049 pBt->pPage1 = 0; 2050 return rc; 2051 } 2052 2053 /* 2054 ** This routine works like lockBtree() except that it also invokes the 2055 ** busy callback if there is lock contention. 2056 */ 2057 static int lockBtreeWithRetry(Btree *pRef){ 2058 int rc = SQLITE_OK; 2059 2060 assert( sqlite3BtreeHoldsMutex(pRef) ); 2061 if( pRef->inTrans==TRANS_NONE ){ 2062 u8 inTransaction = pRef->pBt->inTransaction; 2063 btreeIntegrity(pRef); 2064 rc = sqlite3BtreeBeginTrans(pRef, 0); 2065 pRef->pBt->inTransaction = inTransaction; 2066 pRef->inTrans = TRANS_NONE; 2067 if( rc==SQLITE_OK ){ 2068 pRef->pBt->nTransaction--; 2069 } 2070 btreeIntegrity(pRef); 2071 } 2072 return rc; 2073 } 2074 2075 2076 /* 2077 ** If there are no outstanding cursors and we are not in the middle 2078 ** of a transaction but there is a read lock on the database, then 2079 ** this routine unrefs the first page of the database file which 2080 ** has the effect of releasing the read lock. 2081 ** 2082 ** If there are any outstanding cursors, this routine is a no-op. 2083 ** 2084 ** If there is a transaction in progress, this routine is a no-op. 2085 */ 2086 static void unlockBtreeIfUnused(BtShared *pBt){ 2087 assert( sqlite3_mutex_held(pBt->mutex) ); 2088 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){ 2089 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){ 2090 assert( pBt->pPage1->aData ); 2091 releasePage(pBt->pPage1); 2092 } 2093 pBt->pPage1 = 0; 2094 } 2095 } 2096 2097 /* 2098 ** Create a new database by initializing the first page of the 2099 ** file. 
2100 */ 2101 static int newDatabase(BtShared *pBt){ 2102 MemPage *pP1; 2103 unsigned char *data; 2104 int rc; 2105 int nPage; 2106 2107 assert( sqlite3_mutex_held(pBt->mutex) ); 2108 rc = sqlite3PagerPagecount(pBt->pPager, &nPage); 2109 if( rc!=SQLITE_OK || nPage>0 ){ 2110 return rc; 2111 } 2112 pP1 = pBt->pPage1; 2113 assert( pP1!=0 ); 2114 data = pP1->aData; 2115 rc = sqlite3PagerWrite(pP1->pDbPage); 2116 if( rc ) return rc; 2117 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 2118 assert( sizeof(zMagicHeader)==16 ); 2119 put2byte(&data[16], pBt->pageSize); 2120 data[18] = 1; 2121 data[19] = 1; 2122 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize); 2123 data[20] = (u8)(pBt->pageSize - pBt->usableSize); 2124 data[21] = 64; 2125 data[22] = 32; 2126 data[23] = 32; 2127 memset(&data[24], 0, 100-24); 2128 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 2129 pBt->pageSizeFixed = 1; 2130 #ifndef SQLITE_OMIT_AUTOVACUUM 2131 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 2132 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 2133 put4byte(&data[36 + 4*4], pBt->autoVacuum); 2134 put4byte(&data[36 + 7*4], pBt->incrVacuum); 2135 #endif 2136 return SQLITE_OK; 2137 } 2138 2139 /* 2140 ** Attempt to start a new transaction. A write-transaction 2141 ** is started if the second argument is nonzero, otherwise a read- 2142 ** transaction. If the second argument is 2 or more and exclusive 2143 ** transaction is started, meaning that no other process is allowed 2144 ** to access the database. A preexisting transaction may not be 2145 ** upgraded to exclusive by calling this routine a second time - the 2146 ** exclusivity flag only works for a new transaction. 2147 ** 2148 ** A write-transaction must be started before attempting any 2149 ** changes to the database. 
None of the following routines 2150 ** will work unless a transaction is started first: 2151 ** 2152 ** sqlite3BtreeCreateTable() 2153 ** sqlite3BtreeCreateIndex() 2154 ** sqlite3BtreeClearTable() 2155 ** sqlite3BtreeDropTable() 2156 ** sqlite3BtreeInsert() 2157 ** sqlite3BtreeDelete() 2158 ** sqlite3BtreeUpdateMeta() 2159 ** 2160 ** If an initial attempt to acquire the lock fails because of lock contention 2161 ** and the database was previously unlocked, then invoke the busy handler 2162 ** if there is one. But if there was previously a read-lock, do not 2163 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is 2164 ** returned when there is already a read-lock in order to avoid a deadlock. 2165 ** 2166 ** Suppose there are two processes A and B. A has a read lock and B has 2167 ** a reserved lock. B tries to promote to exclusive but is blocked because 2168 ** of A's read lock. A tries to promote to reserved but is blocked by B. 2169 ** One or the other of the two processes must give way or there can be 2170 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback 2171 ** when A already has a read lock, we encourage A to give up and let B 2172 ** proceed. 2173 */ 2174 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ 2175 sqlite3 *pBlock = 0; 2176 BtShared *pBt = p->pBt; 2177 int rc = SQLITE_OK; 2178 2179 sqlite3BtreeEnter(p); 2180 btreeIntegrity(p); 2181 2182 /* If the btree is already in a write-transaction, or it 2183 ** is already in a read-transaction and a read-transaction 2184 ** is requested, this is a no-op. 
2185 */ 2186 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 2187 goto trans_begun; 2188 } 2189 2190 /* Write transactions are not possible on a read-only database */ 2191 if( pBt->readOnly && wrflag ){ 2192 rc = SQLITE_READONLY; 2193 goto trans_begun; 2194 } 2195 2196 #ifndef SQLITE_OMIT_SHARED_CACHE 2197 /* If another database handle has already opened a write transaction 2198 ** on this shared-btree structure and a second write transaction is 2199 ** requested, return SQLITE_LOCKED. 2200 */ 2201 if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){ 2202 pBlock = pBt->pWriter->db; 2203 }else if( wrflag>1 ){ 2204 BtLock *pIter; 2205 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 2206 if( pIter->pBtree!=p ){ 2207 pBlock = pIter->pBtree->db; 2208 break; 2209 } 2210 } 2211 } 2212 if( pBlock ){ 2213 sqlite3ConnectionBlocked(p->db, pBlock); 2214 rc = SQLITE_LOCKED_SHAREDCACHE; 2215 goto trans_begun; 2216 } 2217 #endif 2218 2219 do { 2220 /* Call lockBtree() until either pBt->pPage1 is populated or 2221 ** lockBtree() returns something other than SQLITE_OK. lockBtree() 2222 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after 2223 ** reading page 1 it discovers that the page-size of the database 2224 ** file is not pBt->pageSize. In this case lockBtree() will update 2225 ** pBt->pageSize to the page-size of the file on disk. 
2226 */ 2227 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) ); 2228 2229 if( rc==SQLITE_OK && wrflag ){ 2230 if( pBt->readOnly ){ 2231 rc = SQLITE_READONLY; 2232 }else{ 2233 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db)); 2234 if( rc==SQLITE_OK ){ 2235 rc = newDatabase(pBt); 2236 } 2237 } 2238 } 2239 2240 if( rc!=SQLITE_OK ){ 2241 unlockBtreeIfUnused(pBt); 2242 } 2243 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 2244 btreeInvokeBusyHandler(pBt) ); 2245 2246 if( rc==SQLITE_OK ){ 2247 if( p->inTrans==TRANS_NONE ){ 2248 pBt->nTransaction++; 2249 } 2250 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 2251 if( p->inTrans>pBt->inTransaction ){ 2252 pBt->inTransaction = p->inTrans; 2253 } 2254 #ifndef SQLITE_OMIT_SHARED_CACHE 2255 if( wrflag ){ 2256 assert( !pBt->pWriter ); 2257 pBt->pWriter = p; 2258 pBt->isExclusive = (u8)(wrflag>1); 2259 } 2260 #endif 2261 } 2262 2263 2264 trans_begun: 2265 if( rc==SQLITE_OK && wrflag ){ 2266 /* This call makes sure that the pager has the correct number of 2267 ** open savepoints. If the second parameter is greater than 0 and 2268 ** the sub-journal is not already open, then it will be opened here. 2269 */ 2270 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); 2271 } 2272 2273 btreeIntegrity(p); 2274 sqlite3BtreeLeave(p); 2275 return rc; 2276 } 2277 2278 #ifndef SQLITE_OMIT_AUTOVACUUM 2279 2280 /* 2281 ** Set the pointer-map entries for all children of page pPage. Also, if 2282 ** pPage contains cells that point to overflow pages, set the pointer 2283 ** map entries for the overflow pages as well. 
2284 */ 2285 static int setChildPtrmaps(MemPage *pPage){ 2286 int i; /* Counter variable */ 2287 int nCell; /* Number of cells in page pPage */ 2288 int rc; /* Return code */ 2289 BtShared *pBt = pPage->pBt; 2290 u8 isInitOrig = pPage->isInit; 2291 Pgno pgno = pPage->pgno; 2292 2293 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 2294 rc = sqlite3BtreeInitPage(pPage); 2295 if( rc!=SQLITE_OK ){ 2296 goto set_child_ptrmaps_out; 2297 } 2298 nCell = pPage->nCell; 2299 2300 for(i=0; i<nCell; i++){ 2301 u8 *pCell = findCell(pPage, i); 2302 2303 rc = ptrmapPutOvflPtr(pPage, pCell); 2304 if( rc!=SQLITE_OK ){ 2305 goto set_child_ptrmaps_out; 2306 } 2307 2308 if( !pPage->leaf ){ 2309 Pgno childPgno = get4byte(pCell); 2310 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno); 2311 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out; 2312 } 2313 } 2314 2315 if( !pPage->leaf ){ 2316 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 2317 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno); 2318 } 2319 2320 set_child_ptrmaps_out: 2321 pPage->isInit = isInitOrig; 2322 return rc; 2323 } 2324 2325 /* 2326 ** Somewhere on pPage, which is guaranteed to be a btree page, not an overflow 2327 ** page, is a pointer to page iFrom. Modify this pointer so that it points to 2328 ** iTo. Parameter eType describes the type of pointer to be modified, as 2329 ** follows: 2330 ** 2331 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 2332 ** page of pPage. 2333 ** 2334 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 2335 ** page pointed to by one of the cells on pPage. 2336 ** 2337 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 2338 ** overflow page in the list. 
2339 */ 2340 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 2341 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 2342 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 2343 if( eType==PTRMAP_OVERFLOW2 ){ 2344 /* The pointer is always the first 4 bytes of the page in this case. */ 2345 if( get4byte(pPage->aData)!=iFrom ){ 2346 return SQLITE_CORRUPT_BKPT; 2347 } 2348 put4byte(pPage->aData, iTo); 2349 }else{ 2350 u8 isInitOrig = pPage->isInit; 2351 int i; 2352 int nCell; 2353 2354 sqlite3BtreeInitPage(pPage); 2355 nCell = pPage->nCell; 2356 2357 for(i=0; i<nCell; i++){ 2358 u8 *pCell = findCell(pPage, i); 2359 if( eType==PTRMAP_OVERFLOW1 ){ 2360 CellInfo info; 2361 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 2362 if( info.iOverflow ){ 2363 if( iFrom==get4byte(&pCell[info.iOverflow]) ){ 2364 put4byte(&pCell[info.iOverflow], iTo); 2365 break; 2366 } 2367 } 2368 }else{ 2369 if( get4byte(pCell)==iFrom ){ 2370 put4byte(pCell, iTo); 2371 break; 2372 } 2373 } 2374 } 2375 2376 if( i==nCell ){ 2377 if( eType!=PTRMAP_BTREE || 2378 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 2379 return SQLITE_CORRUPT_BKPT; 2380 } 2381 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 2382 } 2383 2384 pPage->isInit = isInitOrig; 2385 } 2386 return SQLITE_OK; 2387 } 2388 2389 2390 /* 2391 ** Move the open database page pDbPage to location iFreePage in the 2392 ** database. The pDbPage reference remains valid. 
2393 */ 2394 static int relocatePage( 2395 BtShared *pBt, /* Btree */ 2396 MemPage *pDbPage, /* Open page to move */ 2397 u8 eType, /* Pointer map 'type' entry for pDbPage */ 2398 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 2399 Pgno iFreePage, /* The location to move pDbPage to */ 2400 int isCommit 2401 ){ 2402 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 2403 Pgno iDbPage = pDbPage->pgno; 2404 Pager *pPager = pBt->pPager; 2405 int rc; 2406 2407 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 2408 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 2409 assert( sqlite3_mutex_held(pBt->mutex) ); 2410 assert( pDbPage->pBt==pBt ); 2411 2412 /* Move page iDbPage from its current location to page number iFreePage */ 2413 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 2414 iDbPage, iFreePage, iPtrPage, eType)); 2415 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); 2416 if( rc!=SQLITE_OK ){ 2417 return rc; 2418 } 2419 pDbPage->pgno = iFreePage; 2420 2421 /* If pDbPage was a btree-page, then it may have child pages and/or cells 2422 ** that point to overflow pages. The pointer map entries for all these 2423 ** pages need to be changed. 2424 ** 2425 ** If pDbPage is an overflow page, then the first 4 bytes may store a 2426 ** pointer to a subsequent overflow page. If this is the case, then 2427 ** the pointer map needs to be updated for the subsequent overflow page. 2428 */ 2429 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 2430 rc = setChildPtrmaps(pDbPage); 2431 if( rc!=SQLITE_OK ){ 2432 return rc; 2433 } 2434 }else{ 2435 Pgno nextOvfl = get4byte(pDbPage->aData); 2436 if( nextOvfl!=0 ){ 2437 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage); 2438 if( rc!=SQLITE_OK ){ 2439 return rc; 2440 } 2441 } 2442 } 2443 2444 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 2445 ** that it points at iFreePage. 
Also fix the pointer map entry for 2446 ** iPtrPage. 2447 */ 2448 if( eType!=PTRMAP_ROOTPAGE ){ 2449 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 2450 if( rc!=SQLITE_OK ){ 2451 return rc; 2452 } 2453 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 2454 if( rc!=SQLITE_OK ){ 2455 releasePage(pPtrPage); 2456 return rc; 2457 } 2458 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 2459 releasePage(pPtrPage); 2460 if( rc==SQLITE_OK ){ 2461 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage); 2462 } 2463 } 2464 return rc; 2465 } 2466 2467 /* Forward declaration required by incrVacuumStep(). */ 2468 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 2469 2470 /* 2471 ** Perform a single step of an incremental-vacuum. If successful, 2472 ** return SQLITE_OK. If there is no work to do (and therefore no 2473 ** point in calling this function again), return SQLITE_DONE. 2474 ** 2475 ** More specificly, this function attempts to re-organize the 2476 ** database so that the last page of the file currently in use 2477 ** is no longer in use. 2478 ** 2479 ** If the nFin parameter is non-zero, the implementation assumes 2480 ** that the caller will keep calling incrVacuumStep() until 2481 ** it returns SQLITE_DONE or an error, and that nFin is the 2482 ** number of pages the database file will contain after this 2483 ** process is complete. 
2484 */ 2485 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){ 2486 Pgno nFreeList; /* Number of pages still on the free-list */ 2487 2488 assert( sqlite3_mutex_held(pBt->mutex) ); 2489 assert( iLastPg>nFin ); 2490 2491 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ 2492 int rc; 2493 u8 eType; 2494 Pgno iPtrPage; 2495 2496 nFreeList = get4byte(&pBt->pPage1->aData[36]); 2497 if( nFreeList==0 ){ 2498 return SQLITE_DONE; 2499 } 2500 2501 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); 2502 if( rc!=SQLITE_OK ){ 2503 return rc; 2504 } 2505 if( eType==PTRMAP_ROOTPAGE ){ 2506 return SQLITE_CORRUPT_BKPT; 2507 } 2508 2509 if( eType==PTRMAP_FREEPAGE ){ 2510 if( nFin==0 ){ 2511 /* Remove the page from the files free-list. This is not required 2512 ** if nFin is non-zero. In that case, the free-list will be 2513 ** truncated to zero after this function returns, so it doesn't 2514 ** matter if it still contains some garbage entries. 2515 */ 2516 Pgno iFreePg; 2517 MemPage *pFreePg; 2518 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1); 2519 if( rc!=SQLITE_OK ){ 2520 return rc; 2521 } 2522 assert( iFreePg==iLastPg ); 2523 releasePage(pFreePg); 2524 } 2525 } else { 2526 Pgno iFreePg; /* Index of free page to move pLastPg to */ 2527 MemPage *pLastPg; 2528 2529 rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0); 2530 if( rc!=SQLITE_OK ){ 2531 return rc; 2532 } 2533 2534 /* If nFin is zero, this loop runs exactly once and page pLastPg 2535 ** is swapped with the first free page pulled off the free list. 2536 ** 2537 ** On the other hand, if nFin is greater than zero, then keep 2538 ** looping until a free-page located within the first nFin pages 2539 ** of the file is found. 
2540 */ 2541 do { 2542 MemPage *pFreePg; 2543 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0); 2544 if( rc!=SQLITE_OK ){ 2545 releasePage(pLastPg); 2546 return rc; 2547 } 2548 releasePage(pFreePg); 2549 }while( nFin!=0 && iFreePg>nFin ); 2550 assert( iFreePg<iLastPg ); 2551 2552 rc = sqlite3PagerWrite(pLastPg->pDbPage); 2553 if( rc==SQLITE_OK ){ 2554 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0); 2555 } 2556 releasePage(pLastPg); 2557 if( rc!=SQLITE_OK ){ 2558 return rc; 2559 } 2560 } 2561 } 2562 2563 if( nFin==0 ){ 2564 iLastPg--; 2565 while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){ 2566 if( PTRMAP_ISPAGE(pBt, iLastPg) ){ 2567 MemPage *pPg; 2568 int rc = sqlite3BtreeGetPage(pBt, iLastPg, &pPg, 0); 2569 if( rc!=SQLITE_OK ){ 2570 return rc; 2571 } 2572 rc = sqlite3PagerWrite(pPg->pDbPage); 2573 releasePage(pPg); 2574 if( rc!=SQLITE_OK ){ 2575 return rc; 2576 } 2577 } 2578 iLastPg--; 2579 } 2580 sqlite3PagerTruncateImage(pBt->pPager, iLastPg); 2581 } 2582 return SQLITE_OK; 2583 } 2584 2585 /* 2586 ** A write-transaction must be opened before calling this function. 2587 ** It performs a single unit of work towards an incremental vacuum. 2588 ** 2589 ** If the incremental vacuum is finished after this function has run, 2590 ** SQLITE_DONE is returned. If it is not finished, but no error occurred, 2591 ** SQLITE_OK is returned. Otherwise an SQLite error code. 2592 */ 2593 int sqlite3BtreeIncrVacuum(Btree *p){ 2594 int rc; 2595 BtShared *pBt = p->pBt; 2596 2597 sqlite3BtreeEnter(p); 2598 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); 2599 if( !pBt->autoVacuum ){ 2600 rc = SQLITE_DONE; 2601 }else{ 2602 invalidateAllOverflowCache(pBt); 2603 rc = incrVacuumStep(pBt, 0, pagerPagecount(pBt)); 2604 } 2605 sqlite3BtreeLeave(p); 2606 return rc; 2607 } 2608 2609 /* 2610 ** This routine is called prior to sqlite3PagerCommit when a transaction 2611 ** is commited for an auto-vacuum database. 
2612 ** 2613 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages 2614 ** the database file should be truncated to during the commit process. 2615 ** i.e. the database has been reorganized so that only the first *pnTrunc 2616 ** pages are in use. 2617 */ 2618 static int autoVacuumCommit(BtShared *pBt){ 2619 int rc = SQLITE_OK; 2620 Pager *pPager = pBt->pPager; 2621 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) ); 2622 2623 assert( sqlite3_mutex_held(pBt->mutex) ); 2624 invalidateAllOverflowCache(pBt); 2625 assert(pBt->autoVacuum); 2626 if( !pBt->incrVacuum ){ 2627 Pgno nFin; 2628 Pgno nFree; 2629 Pgno nPtrmap; 2630 Pgno iFree; 2631 const int pgsz = pBt->pageSize; 2632 Pgno nOrig = pagerPagecount(pBt); 2633 2634 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){ 2635 /* It is not possible to create a database for which the final page 2636 ** is either a pointer-map page or the pending-byte page. If one 2637 ** is encountered, this indicates corruption. 
2638 */ 2639 return SQLITE_CORRUPT_BKPT; 2640 } 2641 2642 nFree = get4byte(&pBt->pPage1->aData[36]); 2643 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5); 2644 nFin = nOrig - nFree - nPtrmap; 2645 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){ 2646 nFin--; 2647 } 2648 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 2649 nFin--; 2650 } 2651 2652 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ 2653 rc = incrVacuumStep(pBt, nFin, iFree); 2654 } 2655 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ 2656 rc = SQLITE_OK; 2657 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 2658 put4byte(&pBt->pPage1->aData[32], 0); 2659 put4byte(&pBt->pPage1->aData[36], 0); 2660 sqlite3PagerTruncateImage(pBt->pPager, nFin); 2661 } 2662 if( rc!=SQLITE_OK ){ 2663 sqlite3PagerRollback(pPager); 2664 } 2665 } 2666 2667 assert( nRef==sqlite3PagerRefcount(pPager) ); 2668 return rc; 2669 } 2670 2671 #endif /* ifndef SQLITE_OMIT_AUTOVACUUM */ 2672 2673 /* 2674 ** This routine does the first phase of a two-phase commit. This routine 2675 ** causes a rollback journal to be created (if it does not already exist) 2676 ** and populated with enough information so that if a power loss occurs 2677 ** the database can be restored to its original state by playing back 2678 ** the journal. Then the contents of the journal are flushed out to 2679 ** the disk. After the journal is safely on oxide, the changes to the 2680 ** database are written into the database file and flushed to oxide. 2681 ** At the end of this call, the rollback journal still exists on the 2682 ** disk and we are still holding all locks, so the transaction has not 2683 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the 2684 ** commit process. 2685 ** 2686 ** This call is a no-op if no write-transaction is currently active on pBt. 2687 ** 2688 ** Otherwise, sync the database file for the btree pBt. 
zMaster points to 2689 ** the name of a master journal file that should be written into the 2690 ** individual journal file, or is NULL, indicating no master journal file 2691 ** (single database transaction). 2692 ** 2693 ** When this is called, the master journal should already have been 2694 ** created, populated with this journal pointer and synced to disk. 2695 ** 2696 ** Once this is routine has returned, the only thing required to commit 2697 ** the write-transaction for this database file is to delete the journal. 2698 */ 2699 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){ 2700 int rc = SQLITE_OK; 2701 if( p->inTrans==TRANS_WRITE ){ 2702 BtShared *pBt = p->pBt; 2703 sqlite3BtreeEnter(p); 2704 #ifndef SQLITE_OMIT_AUTOVACUUM 2705 if( pBt->autoVacuum ){ 2706 rc = autoVacuumCommit(pBt); 2707 if( rc!=SQLITE_OK ){ 2708 sqlite3BtreeLeave(p); 2709 return rc; 2710 } 2711 } 2712 #endif 2713 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0); 2714 sqlite3BtreeLeave(p); 2715 } 2716 return rc; 2717 } 2718 2719 /* 2720 ** Commit the transaction currently in progress. 2721 ** 2722 ** This routine implements the second phase of a 2-phase commit. The 2723 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should 2724 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne() 2725 ** routine did all the work of writing information out to disk and flushing the 2726 ** contents so that they are written onto the disk platter. All this 2727 ** routine has to do is delete or truncate or zero the header in the 2728 ** the rollback journal (which causes the transaction to commit) and 2729 ** drop locks. 2730 ** 2731 ** This will release the write lock on the database file. If there 2732 ** are no active cursors, it also releases the read lock. 
2733 */ 2734 int sqlite3BtreeCommitPhaseTwo(Btree *p){ 2735 BtShared *pBt = p->pBt; 2736 2737 sqlite3BtreeEnter(p); 2738 btreeIntegrity(p); 2739 2740 /* If the handle has a write-transaction open, commit the shared-btrees 2741 ** transaction and set the shared state to TRANS_READ. 2742 */ 2743 if( p->inTrans==TRANS_WRITE ){ 2744 int rc; 2745 assert( pBt->inTransaction==TRANS_WRITE ); 2746 assert( pBt->nTransaction>0 ); 2747 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 2748 if( rc!=SQLITE_OK ){ 2749 sqlite3BtreeLeave(p); 2750 return rc; 2751 } 2752 pBt->inTransaction = TRANS_READ; 2753 } 2754 2755 /* If the handle has any kind of transaction open, decrement the transaction 2756 ** count of the shared btree. If the transaction count reaches 0, set 2757 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below 2758 ** will unlock the pager. 2759 */ 2760 if( p->inTrans!=TRANS_NONE ){ 2761 clearAllSharedCacheTableLocks(p); 2762 pBt->nTransaction--; 2763 if( 0==pBt->nTransaction ){ 2764 pBt->inTransaction = TRANS_NONE; 2765 } 2766 } 2767 2768 /* Set the current transaction state to TRANS_NONE and unlock 2769 ** the pager if this call closed the only read or write transaction. 2770 */ 2771 btreeClearHasContent(pBt); 2772 p->inTrans = TRANS_NONE; 2773 unlockBtreeIfUnused(pBt); 2774 2775 btreeIntegrity(p); 2776 sqlite3BtreeLeave(p); 2777 return SQLITE_OK; 2778 } 2779 2780 /* 2781 ** Do both phases of a commit. 2782 */ 2783 int sqlite3BtreeCommit(Btree *p){ 2784 int rc; 2785 sqlite3BtreeEnter(p); 2786 rc = sqlite3BtreeCommitPhaseOne(p, 0); 2787 if( rc==SQLITE_OK ){ 2788 rc = sqlite3BtreeCommitPhaseTwo(p); 2789 } 2790 sqlite3BtreeLeave(p); 2791 return rc; 2792 } 2793 2794 #ifndef NDEBUG 2795 /* 2796 ** Return the number of write-cursors open on this handle. This is for use 2797 ** in assert() expressions, so it is only compiled if NDEBUG is not 2798 ** defined. 
2799 ** 2800 ** For the purposes of this routine, a write-cursor is any cursor that 2801 ** is capable of writing to the databse. That means the cursor was 2802 ** originally opened for writing and the cursor has not be disabled 2803 ** by having its state changed to CURSOR_FAULT. 2804 */ 2805 static int countWriteCursors(BtShared *pBt){ 2806 BtCursor *pCur; 2807 int r = 0; 2808 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 2809 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; 2810 } 2811 return r; 2812 } 2813 #endif 2814 2815 /* 2816 ** This routine sets the state to CURSOR_FAULT and the error 2817 ** code to errCode for every cursor on BtShared that pBtree 2818 ** references. 2819 ** 2820 ** Every cursor is tripped, including cursors that belong 2821 ** to other database connections that happen to be sharing 2822 ** the cache with pBtree. 2823 ** 2824 ** This routine gets called when a rollback occurs. 2825 ** All cursors using the same cache must be tripped 2826 ** to prevent them from trying to use the btree after 2827 ** the rollback. The rollback may have deleted tables 2828 ** or moved root pages, so it is not sufficient to 2829 ** save the state of the cursor. The cursor must be 2830 ** invalidated. 2831 */ 2832 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){ 2833 BtCursor *p; 2834 sqlite3BtreeEnter(pBtree); 2835 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 2836 int i; 2837 sqlite3BtreeClearCursor(p); 2838 p->eState = CURSOR_FAULT; 2839 p->skip = errCode; 2840 for(i=0; i<=p->iPage; i++){ 2841 releasePage(p->apPage[i]); 2842 p->apPage[i] = 0; 2843 } 2844 } 2845 sqlite3BtreeLeave(pBtree); 2846 } 2847 2848 /* 2849 ** Rollback the transaction in progress. All cursors will be 2850 ** invalided by this operation. Any attempt to use a cursor 2851 ** that was open at the beginning of this operation will result 2852 ** in an error. 2853 ** 2854 ** This will release the write lock on the database file. 
If there 2855 ** are no active cursors, it also releases the read lock. 2856 */ 2857 int sqlite3BtreeRollback(Btree *p){ 2858 int rc; 2859 BtShared *pBt = p->pBt; 2860 MemPage *pPage1; 2861 2862 sqlite3BtreeEnter(p); 2863 rc = saveAllCursors(pBt, 0, 0); 2864 #ifndef SQLITE_OMIT_SHARED_CACHE 2865 if( rc!=SQLITE_OK ){ 2866 /* This is a horrible situation. An IO or malloc() error occurred whilst 2867 ** trying to save cursor positions. If this is an automatic rollback (as 2868 ** the result of a constraint, malloc() failure or IO error) then 2869 ** the cache may be internally inconsistent (not contain valid trees) so 2870 ** we cannot simply return the error to the caller. Instead, abort 2871 ** all queries that may be using any of the cursors that failed to save. 2872 */ 2873 sqlite3BtreeTripAllCursors(p, rc); 2874 } 2875 #endif 2876 btreeIntegrity(p); 2877 2878 if( p->inTrans==TRANS_WRITE ){ 2879 int rc2; 2880 2881 assert( TRANS_WRITE==pBt->inTransaction ); 2882 rc2 = sqlite3PagerRollback(pBt->pPager); 2883 if( rc2!=SQLITE_OK ){ 2884 rc = rc2; 2885 } 2886 2887 /* The rollback may have destroyed the pPage1->aData value. So 2888 ** call sqlite3BtreeGetPage() on page 1 again to make 2889 ** sure pPage1->aData is set correctly. */ 2890 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 2891 releasePage(pPage1); 2892 } 2893 assert( countWriteCursors(pBt)==0 ); 2894 pBt->inTransaction = TRANS_READ; 2895 } 2896 2897 if( p->inTrans!=TRANS_NONE ){ 2898 clearAllSharedCacheTableLocks(p); 2899 assert( pBt->nTransaction>0 ); 2900 pBt->nTransaction--; 2901 if( 0==pBt->nTransaction ){ 2902 pBt->inTransaction = TRANS_NONE; 2903 } 2904 } 2905 2906 btreeClearHasContent(pBt); 2907 p->inTrans = TRANS_NONE; 2908 unlockBtreeIfUnused(pBt); 2909 2910 btreeIntegrity(p); 2911 sqlite3BtreeLeave(p); 2912 return rc; 2913 } 2914 2915 /* 2916 ** Start a statement subtransaction. The subtransaction can can be rolled 2917 ** back independently of the main transaction. 
You must start a transaction 2918 ** before starting a subtransaction. The subtransaction is ended automatically 2919 ** if the main transaction commits or rolls back. 2920 ** 2921 ** Statement subtransactions are used around individual SQL statements 2922 ** that are contained within a BEGIN...COMMIT block. If a constraint 2923 ** error occurs within the statement, the effect of that one statement 2924 ** can be rolled back without having to rollback the entire transaction. 2925 ** 2926 ** A statement sub-transaction is implemented as an anonymous savepoint. The 2927 ** value passed as the second parameter is the total number of savepoints, 2928 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there 2929 ** are no active savepoints and no other statement-transactions open, 2930 ** iStatement is 1. This anonymous savepoint can be released or rolled back 2931 ** using the sqlite3BtreeSavepoint() function. 2932 */ 2933 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ 2934 int rc; 2935 BtShared *pBt = p->pBt; 2936 sqlite3BtreeEnter(p); 2937 assert( p->inTrans==TRANS_WRITE ); 2938 assert( pBt->readOnly==0 ); 2939 assert( iStatement>0 ); 2940 assert( iStatement>p->db->nSavepoint ); 2941 if( NEVER(p->inTrans!=TRANS_WRITE || pBt->readOnly) ){ 2942 rc = SQLITE_INTERNAL; 2943 }else{ 2944 assert( pBt->inTransaction==TRANS_WRITE ); 2945 /* At the pager level, a statement transaction is a savepoint with 2946 ** an index greater than all savepoints created explicitly using 2947 ** SQL statements. It is illegal to open, release or rollback any 2948 ** such savepoints while the statement transaction savepoint is active. 2949 */ 2950 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement); 2951 } 2952 sqlite3BtreeLeave(p); 2953 return rc; 2954 } 2955 2956 /* 2957 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK 2958 ** or SAVEPOINT_RELEASE. 
This function either releases or rolls back the 2959 ** savepoint identified by parameter iSavepoint, depending on the value 2960 ** of op. 2961 ** 2962 ** Normally, iSavepoint is greater than or equal to zero. However, if op is 2963 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 2964 ** contents of the entire transaction are rolled back. This is different 2965 ** from a normal transaction rollback, as no locks are released and the 2966 ** transaction remains open. 2967 */ 2968 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ 2969 int rc = SQLITE_OK; 2970 if( p && p->inTrans==TRANS_WRITE ){ 2971 BtShared *pBt = p->pBt; 2972 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); 2973 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) ); 2974 sqlite3BtreeEnter(p); 2975 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint); 2976 if( rc==SQLITE_OK ){ 2977 rc = newDatabase(pBt); 2978 } 2979 sqlite3BtreeLeave(p); 2980 } 2981 return rc; 2982 } 2983 2984 /* 2985 ** Create a new cursor for the BTree whose root is on the page 2986 ** iTable. The act of acquiring a cursor gets a read lock on 2987 ** the database file. 2988 ** 2989 ** If wrFlag==0, then the cursor can only be used for reading. 2990 ** If wrFlag==1, then the cursor can be used for reading or for 2991 ** writing if other conditions for writing are also met. These 2992 ** are the conditions that must be met in order for writing to 2993 ** be allowed: 2994 ** 2995 ** 1: The cursor must have been opened with wrFlag==1 2996 ** 2997 ** 2: Other database connections that share the same pager cache 2998 ** but which are not in the READ_UNCOMMITTED state may not have 2999 ** cursors open with wrFlag==0 on the same table. Otherwise 3000 ** the changes made by this write cursor would be visible to 3001 ** the read cursors in the other database connection. 
3002 ** 3003 ** 3: The database must be writable (not on read-only media) 3004 ** 3005 ** 4: There must be an active transaction. 3006 ** 3007 ** No checking is done to make sure that page iTable really is the 3008 ** root page of a b-tree. If it is not, then the cursor acquired 3009 ** will not work correctly. 3010 ** 3011 ** It is assumed that the sqlite3BtreeCursorSize() bytes of memory 3012 ** pointed to by pCur have been zeroed by the caller. 3013 */ 3014 static int btreeCursor( 3015 Btree *p, /* The btree */ 3016 int iTable, /* Root page of table to open */ 3017 int wrFlag, /* 1 to write. 0 read-only */ 3018 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 3019 BtCursor *pCur /* Space for new cursor */ 3020 ){ 3021 int rc; 3022 Pgno nPage; 3023 BtShared *pBt = p->pBt; 3024 3025 assert( sqlite3BtreeHoldsMutex(p) ); 3026 assert( wrFlag==0 || wrFlag==1 ); 3027 if( wrFlag ){ 3028 assert( !pBt->readOnly ); 3029 if( NEVER(pBt->readOnly) ){ 3030 return SQLITE_READONLY; 3031 } 3032 rc = checkForReadConflicts(p, iTable, 0, 0); 3033 if( rc!=SQLITE_OK ){ 3034 assert( rc==SQLITE_LOCKED_SHAREDCACHE ); 3035 return rc; 3036 } 3037 } 3038 3039 if( pBt->pPage1==0 ){ 3040 rc = lockBtreeWithRetry(p); 3041 if( rc!=SQLITE_OK ){ 3042 return rc; 3043 } 3044 } 3045 pCur->pgnoRoot = (Pgno)iTable; 3046 rc = sqlite3PagerPagecount(pBt->pPager, (int *)&nPage); 3047 if( rc!=SQLITE_OK ){ 3048 return rc; 3049 } 3050 if( iTable==1 && nPage==0 ){ 3051 rc = SQLITE_EMPTY; 3052 goto create_cursor_exception; 3053 } 3054 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]); 3055 if( rc!=SQLITE_OK ){ 3056 goto create_cursor_exception; 3057 } 3058 3059 /* Now that no other errors can occur, finish filling in the BtCursor 3060 ** variables, link the cursor into the BtShared list and set *ppCur (the 3061 ** output argument to this function). 
3062 */ 3063 pCur->pKeyInfo = pKeyInfo; 3064 pCur->pBtree = p; 3065 pCur->pBt = pBt; 3066 pCur->wrFlag = (u8)wrFlag; 3067 pCur->pNext = pBt->pCursor; 3068 if( pCur->pNext ){ 3069 pCur->pNext->pPrev = pCur; 3070 } 3071 pBt->pCursor = pCur; 3072 pCur->eState = CURSOR_INVALID; 3073 pCur->cachedRowid = 0; 3074 3075 return SQLITE_OK; 3076 3077 create_cursor_exception: 3078 releasePage(pCur->apPage[0]); 3079 unlockBtreeIfUnused(pBt); 3080 return rc; 3081 } 3082 int sqlite3BtreeCursor( 3083 Btree *p, /* The btree */ 3084 int iTable, /* Root page of table to open */ 3085 int wrFlag, /* 1 to write. 0 read-only */ 3086 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */ 3087 BtCursor *pCur /* Write new cursor here */ 3088 ){ 3089 int rc; 3090 sqlite3BtreeEnter(p); 3091 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 3092 sqlite3BtreeLeave(p); 3093 return rc; 3094 } 3095 3096 /* 3097 ** Return the size of a BtCursor object in bytes. 3098 ** 3099 ** This interfaces is needed so that users of cursors can preallocate 3100 ** sufficient storage to hold a cursor. The BtCursor object is opaque 3101 ** to users so they cannot do the sizeof() themselves - they must call 3102 ** this routine. 3103 */ 3104 int sqlite3BtreeCursorSize(void){ 3105 return sizeof(BtCursor); 3106 } 3107 3108 /* 3109 ** Set the cached rowid value of every cursor in the same database file 3110 ** as pCur and having the same root page number as pCur. The value is 3111 ** set to iRowid. 3112 ** 3113 ** Only positive rowid values are considered valid for this cache. 3114 ** The cache is initialized to zero, indicating an invalid cache. 3115 ** A btree will work fine with zero or negative rowids. We just cannot 3116 ** cache zero or negative rowids, which means tables that use zero or 3117 ** negative rowids might run a little slower. But in practice, zero 3118 ** or negative rowids are very uncommon so this should not be a problem. 
3119 */ 3120 void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){ 3121 BtCursor *p; 3122 for(p=pCur->pBt->pCursor; p; p=p->pNext){ 3123 if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid; 3124 } 3125 assert( pCur->cachedRowid==iRowid ); 3126 } 3127 3128 /* 3129 ** Return the cached rowid for the given cursor. A negative or zero 3130 ** return value indicates that the rowid cache is invalid and should be 3131 ** ignored. If the rowid cache has never before been set, then a 3132 ** zero is returned. 3133 */ 3134 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){ 3135 return pCur->cachedRowid; 3136 } 3137 3138 /* 3139 ** Close a cursor. The read lock on the database file is released 3140 ** when the last cursor is closed. 3141 */ 3142 int sqlite3BtreeCloseCursor(BtCursor *pCur){ 3143 Btree *pBtree = pCur->pBtree; 3144 if( pBtree ){ 3145 int i; 3146 BtShared *pBt = pCur->pBt; 3147 sqlite3BtreeEnter(pBtree); 3148 sqlite3BtreeClearCursor(pCur); 3149 if( pCur->pPrev ){ 3150 pCur->pPrev->pNext = pCur->pNext; 3151 }else{ 3152 pBt->pCursor = pCur->pNext; 3153 } 3154 if( pCur->pNext ){ 3155 pCur->pNext->pPrev = pCur->pPrev; 3156 } 3157 for(i=0; i<=pCur->iPage; i++){ 3158 releasePage(pCur->apPage[i]); 3159 } 3160 unlockBtreeIfUnused(pBt); 3161 invalidateOverflowCache(pCur); 3162 /* sqlite3_free(pCur); */ 3163 sqlite3BtreeLeave(pBtree); 3164 } 3165 return SQLITE_OK; 3166 } 3167 3168 /* 3169 ** Make a temporary cursor by filling in the fields of pTempCur. 3170 ** The temporary cursor is not on the cursor list for the Btree. 
3171 */ 3172 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){ 3173 int i; 3174 assert( cursorHoldsMutex(pCur) ); 3175 memcpy(pTempCur, pCur, sizeof(BtCursor)); 3176 pTempCur->pNext = 0; 3177 pTempCur->pPrev = 0; 3178 for(i=0; i<=pTempCur->iPage; i++){ 3179 sqlite3PagerRef(pTempCur->apPage[i]->pDbPage); 3180 } 3181 assert( pTempCur->pKey==0 ); 3182 } 3183 3184 /* 3185 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor() 3186 ** function above. 3187 */ 3188 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){ 3189 int i; 3190 assert( cursorHoldsMutex(pCur) ); 3191 for(i=0; i<=pCur->iPage; i++){ 3192 sqlite3PagerUnref(pCur->apPage[i]->pDbPage); 3193 } 3194 sqlite3_free(pCur->pKey); 3195 } 3196 3197 3198 3199 /* 3200 ** Make sure the BtCursor* given in the argument has a valid 3201 ** BtCursor.info structure. If it is not already valid, call 3202 ** sqlite3BtreeParseCell() to fill it in. 3203 ** 3204 ** BtCursor.info is a cache of the information in the current cell. 3205 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell(). 3206 ** 3207 ** 2007-06-25: There is a bug in some versions of MSVC that cause the 3208 ** compiler to crash when getCellInfo() is implemented as a macro. 3209 ** But there is a measureable speed advantage to using the macro on gcc 3210 ** (when less compiler optimizations like -Os or -O0 are used and the 3211 ** compiler is not doing agressive inlining.) So we use a real function 3212 ** for MSVC and a macro for everything else. Ticket #2457. 
3213 */ 3214 #ifndef NDEBUG 3215 static void assertCellInfo(BtCursor *pCur){ 3216 CellInfo info; 3217 int iPage = pCur->iPage; 3218 memset(&info, 0, sizeof(info)); 3219 sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info); 3220 assert( memcmp(&info, &pCur->info, sizeof(info))==0 ); 3221 } 3222 #else 3223 #define assertCellInfo(x) 3224 #endif 3225 #ifdef _MSC_VER 3226 /* Use a real function in MSVC to work around bugs in that compiler. */ 3227 static void getCellInfo(BtCursor *pCur){ 3228 if( pCur->info.nSize==0 ){ 3229 int iPage = pCur->iPage; 3230 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); 3231 pCur->validNKey = 1; 3232 }else{ 3233 assertCellInfo(pCur); 3234 } 3235 } 3236 #else /* if not _MSC_VER */ 3237 /* Use a macro in all other compilers so that the function is inlined */ 3238 #define getCellInfo(pCur) \ 3239 if( pCur->info.nSize==0 ){ \ 3240 int iPage = pCur->iPage; \ 3241 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \ 3242 pCur->validNKey = 1; \ 3243 }else{ \ 3244 assertCellInfo(pCur); \ 3245 } 3246 #endif /* _MSC_VER */ 3247 3248 /* 3249 ** Set *pSize to the size of the buffer needed to hold the value of 3250 ** the key for the current entry. If the cursor is not pointing 3251 ** to a valid entry, *pSize is set to 0. 3252 ** 3253 ** For a table with the INTKEY flag set, this routine returns the key 3254 ** itself, not the number of bytes in the key. 3255 */ 3256 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ 3257 int rc; 3258 3259 assert( cursorHoldsMutex(pCur) ); 3260 rc = restoreCursorPosition(pCur); 3261 if( rc==SQLITE_OK ){ 3262 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); 3263 if( pCur->eState==CURSOR_INVALID ){ 3264 *pSize = 0; 3265 }else{ 3266 getCellInfo(pCur); 3267 *pSize = pCur->info.nKey; 3268 } 3269 } 3270 return rc; 3271 } 3272 3273 /* 3274 ** Set *pSize to the number of bytes of data in the entry the 3275 ** cursor currently points to. 
Always return SQLITE_OK. 3276 ** Failure is not possible. If the cursor is not currently 3277 ** pointing to an entry (which can happen, for example, if 3278 ** the database is empty) then *pSize is set to 0. 3279 */ 3280 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ 3281 int rc; 3282 3283 assert( cursorHoldsMutex(pCur) ); 3284 rc = restoreCursorPosition(pCur); 3285 if( rc==SQLITE_OK ){ 3286 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); 3287 if( pCur->eState==CURSOR_INVALID ){ 3288 /* Not pointing at a valid entry - set *pSize to 0. */ 3289 *pSize = 0; 3290 }else{ 3291 getCellInfo(pCur); 3292 *pSize = pCur->info.nData; 3293 } 3294 } 3295 return rc; 3296 } 3297 3298 /* 3299 ** Given the page number of an overflow page in the database (parameter 3300 ** ovfl), this function finds the page number of the next page in the 3301 ** linked list of overflow pages. If possible, it uses the auto-vacuum 3302 ** pointer-map data instead of reading the content of page ovfl to do so. 3303 ** 3304 ** If an error occurs an SQLite error code is returned. Otherwise: 3305 ** 3306 ** The page number of the next overflow page in the linked list is 3307 ** written to *pPgnoNext. If page ovfl is the last page in its linked 3308 ** list, *pPgnoNext is set to zero. 3309 ** 3310 ** If ppPage is not NULL, and a reference to the MemPage object corresponding 3311 ** to page number pOvfl was obtained, then *ppPage is set to point to that 3312 ** reference. It is the responsibility of the caller to call releasePage() 3313 ** on *ppPage to free the reference. In no reference was obtained (because 3314 ** the pointer-map was used to obtain the value for *pPgnoNext), then 3315 ** *ppPage is set to zero. 
3316 */ 3317 static int getOverflowPage( 3318 BtShared *pBt, 3319 Pgno ovfl, /* Overflow page */ 3320 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */ 3321 Pgno *pPgnoNext /* OUT: Next overflow page number */ 3322 ){ 3323 Pgno next = 0; 3324 MemPage *pPage = 0; 3325 int rc = SQLITE_OK; 3326 3327 assert( sqlite3_mutex_held(pBt->mutex) ); 3328 assert(pPgnoNext); 3329 3330 #ifndef SQLITE_OMIT_AUTOVACUUM 3331 /* Try to find the next page in the overflow list using the 3332 ** autovacuum pointer-map pages. Guess that the next page in 3333 ** the overflow list is page number (ovfl+1). If that guess turns 3334 ** out to be wrong, fall back to loading the data of page 3335 ** number ovfl to determine the next page number. 3336 */ 3337 if( pBt->autoVacuum ){ 3338 Pgno pgno; 3339 Pgno iGuess = ovfl+1; 3340 u8 eType; 3341 3342 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 3343 iGuess++; 3344 } 3345 3346 if( iGuess<=pagerPagecount(pBt) ){ 3347 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 3348 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 3349 next = iGuess; 3350 rc = SQLITE_DONE; 3351 } 3352 } 3353 } 3354 #endif 3355 3356 if( rc==SQLITE_OK ){ 3357 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, 0); 3358 assert(rc==SQLITE_OK || pPage==0); 3359 if( next==0 && rc==SQLITE_OK ){ 3360 next = get4byte(pPage->aData); 3361 } 3362 } 3363 3364 *pPgnoNext = next; 3365 if( ppPage ){ 3366 *ppPage = pPage; 3367 }else{ 3368 releasePage(pPage); 3369 } 3370 return (rc==SQLITE_DONE ? SQLITE_OK : rc); 3371 } 3372 3373 /* 3374 ** Copy data from a buffer to a page, or from a page to a buffer. 3375 ** 3376 ** pPayload is a pointer to data stored on database page pDbPage. 3377 ** If argument eOp is false, then nByte bytes of data are copied 3378 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, 3379 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 3380 ** of data are copied from the buffer pBuf to pPayload. 
3381 ** 3382 ** SQLITE_OK is returned on success, otherwise an error code. 3383 */ 3384 static int copyPayload( 3385 void *pPayload, /* Pointer to page data */ 3386 void *pBuf, /* Pointer to buffer */ 3387 int nByte, /* Number of bytes to copy */ 3388 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 3389 DbPage *pDbPage /* Page containing pPayload */ 3390 ){ 3391 if( eOp ){ 3392 /* Copy data from buffer to page (a write operation) */ 3393 int rc = sqlite3PagerWrite(pDbPage); 3394 if( rc!=SQLITE_OK ){ 3395 return rc; 3396 } 3397 memcpy(pPayload, pBuf, nByte); 3398 }else{ 3399 /* Copy data from page to buffer (a read operation) */ 3400 memcpy(pBuf, pPayload, nByte); 3401 } 3402 return SQLITE_OK; 3403 } 3404 3405 /* 3406 ** This function is used to read or overwrite payload information 3407 ** for the entry that the pCur cursor is pointing to. If the eOp 3408 ** parameter is 0, this is a read operation (data copied into 3409 ** buffer pBuf). If it is non-zero, a write (data copied from 3410 ** buffer pBuf). 3411 ** 3412 ** A total of "amt" bytes are read or written beginning at "offset". 3413 ** Data is read to or from the buffer pBuf. 3414 ** 3415 ** This routine does not make a distinction between key and data. 3416 ** It just reads or writes bytes from the payload area. Data might 3417 ** appear on the main page or be scattered out on multiple overflow 3418 ** pages. 3419 ** 3420 ** If the BtCursor.isIncrblobHandle flag is set, and the current 3421 ** cursor entry uses one or more overflow pages, this function 3422 ** allocates space for and lazily popluates the overflow page-list 3423 ** cache array (BtCursor.aOverflow). Subsequent calls use this 3424 ** cache to make seeking to the supplied offset more efficient. 3425 ** 3426 ** Once an overflow page-list cache has been allocated, it may be 3427 ** invalidated if some other cursor writes to the same table, or if 3428 ** the cursor is moved to a different row. 
Additionally, in auto-vacuum 3429 ** mode, the following events may invalidate an overflow page-list cache. 3430 ** 3431 ** * An incremental vacuum, 3432 ** * A commit in auto_vacuum="full" mode, 3433 ** * Creating a table (may require moving an overflow page). 3434 */ 3435 static int accessPayload( 3436 BtCursor *pCur, /* Cursor pointing to entry to read from */ 3437 u32 offset, /* Begin reading this far into payload */ 3438 u32 amt, /* Read this many bytes */ 3439 unsigned char *pBuf, /* Write the bytes into this buffer */ 3440 int skipKey, /* offset begins at data if this is true */ 3441 int eOp /* zero to read. non-zero to write. */ 3442 ){ 3443 unsigned char *aPayload; 3444 int rc = SQLITE_OK; 3445 u32 nKey; 3446 int iIdx = 0; 3447 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */ 3448 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */ 3449 3450 assert( pPage ); 3451 assert( pCur->eState==CURSOR_VALID ); 3452 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 3453 assert( cursorHoldsMutex(pCur) ); 3454 3455 getCellInfo(pCur); 3456 aPayload = pCur->info.pCell + pCur->info.nHeader; 3457 nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey); 3458 3459 if( skipKey ){ 3460 offset += nKey; 3461 } 3462 if( offset+amt > nKey+pCur->info.nData 3463 || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] 3464 ){ 3465 /* Trying to read or write past the end of the data is an error */ 3466 return SQLITE_CORRUPT_BKPT; 3467 } 3468 3469 /* Check if data must be read/written to/from the btree page itself. 
*/ 3470 if( offset<pCur->info.nLocal ){ 3471 int a = amt; 3472 if( a+offset>pCur->info.nLocal ){ 3473 a = pCur->info.nLocal - offset; 3474 } 3475 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); 3476 offset = 0; 3477 pBuf += a; 3478 amt -= a; 3479 }else{ 3480 offset -= pCur->info.nLocal; 3481 } 3482 3483 if( rc==SQLITE_OK && amt>0 ){ 3484 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ 3485 Pgno nextPage; 3486 3487 nextPage = get4byte(&aPayload[pCur->info.nLocal]); 3488 3489 #ifndef SQLITE_OMIT_INCRBLOB 3490 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[] 3491 ** has not been allocated, allocate it now. The array is sized at 3492 ** one entry for each overflow page in the overflow chain. The 3493 ** page number of the first overflow page is stored in aOverflow[0], 3494 ** etc. A value of 0 in the aOverflow[] array means "not yet known" 3495 ** (the cache is lazily populated). 3496 */ 3497 if( pCur->isIncrblobHandle && !pCur->aOverflow ){ 3498 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; 3499 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl); 3500 if( nOvfl && !pCur->aOverflow ){ 3501 rc = SQLITE_NOMEM; 3502 } 3503 } 3504 3505 /* If the overflow page-list cache has been allocated and the 3506 ** entry for the first required overflow page is valid, skip 3507 ** directly to it. 3508 */ 3509 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){ 3510 iIdx = (offset/ovflSize); 3511 nextPage = pCur->aOverflow[iIdx]; 3512 offset = (offset%ovflSize); 3513 } 3514 #endif 3515 3516 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){ 3517 3518 #ifndef SQLITE_OMIT_INCRBLOB 3519 /* If required, populate the overflow page-list cache. 
*/ 3520 if( pCur->aOverflow ){ 3521 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage); 3522 pCur->aOverflow[iIdx] = nextPage; 3523 } 3524 #endif 3525 3526 if( offset>=ovflSize ){ 3527 /* The only reason to read this page is to obtain the page 3528 ** number for the next page in the overflow chain. The page 3529 ** data is not required. So first try to lookup the overflow 3530 ** page-list cache, if any, then fall back to the getOverflowPage() 3531 ** function. 3532 */ 3533 #ifndef SQLITE_OMIT_INCRBLOB 3534 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){ 3535 nextPage = pCur->aOverflow[iIdx+1]; 3536 } else 3537 #endif 3538 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); 3539 offset -= ovflSize; 3540 }else{ 3541 /* Need to read this page properly. It contains some of the 3542 ** range of data that is being read (eOp==0) or written (eOp!=0). 3543 */ 3544 DbPage *pDbPage; 3545 int a = amt; 3546 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage); 3547 if( rc==SQLITE_OK ){ 3548 aPayload = sqlite3PagerGetData(pDbPage); 3549 nextPage = get4byte(aPayload); 3550 if( a + offset > ovflSize ){ 3551 a = ovflSize - offset; 3552 } 3553 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); 3554 sqlite3PagerUnref(pDbPage); 3555 offset = 0; 3556 amt -= a; 3557 pBuf += a; 3558 } 3559 } 3560 } 3561 } 3562 3563 if( rc==SQLITE_OK && amt>0 ){ 3564 return SQLITE_CORRUPT_BKPT; 3565 } 3566 return rc; 3567 } 3568 3569 /* 3570 ** Read part of the key associated with cursor pCur. Exactly 3571 ** "amt" bytes will be transfered into pBuf[]. The transfer 3572 ** begins at "offset". 3573 ** 3574 ** Return SQLITE_OK on success or an error code if anything goes 3575 ** wrong. An error is returned if "offset+amt" is larger than 3576 ** the available payload. 
3577 */ 3578 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3579 int rc; 3580 3581 assert( cursorHoldsMutex(pCur) ); 3582 rc = restoreCursorPosition(pCur); 3583 if( rc==SQLITE_OK ){ 3584 assert( pCur->eState==CURSOR_VALID ); 3585 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); 3586 if( pCur->apPage[0]->intKey ){ 3587 return SQLITE_CORRUPT_BKPT; 3588 } 3589 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 3590 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0); 3591 } 3592 return rc; 3593 } 3594 3595 /* 3596 ** Read part of the data associated with cursor pCur. Exactly 3597 ** "amt" bytes will be transfered into pBuf[]. The transfer 3598 ** begins at "offset". 3599 ** 3600 ** Return SQLITE_OK on success or an error code if anything goes 3601 ** wrong. An error is returned if "offset+amt" is larger than 3602 ** the available payload. 3603 */ 3604 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3605 int rc; 3606 3607 #ifndef SQLITE_OMIT_INCRBLOB 3608 if ( pCur->eState==CURSOR_INVALID ){ 3609 return SQLITE_ABORT; 3610 } 3611 #endif 3612 3613 assert( cursorHoldsMutex(pCur) ); 3614 rc = restoreCursorPosition(pCur); 3615 if( rc==SQLITE_OK ){ 3616 assert( pCur->eState==CURSOR_VALID ); 3617 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); 3618 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 3619 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0); 3620 } 3621 return rc; 3622 } 3623 3624 /* 3625 ** Return a pointer to payload information from the entry that the 3626 ** pCur cursor is pointing to. The pointer is to the beginning of 3627 ** the key if skipKey==0 and it points to the beginning of data if 3628 ** skipKey==1. The number of bytes of available key/data is written 3629 ** into *pAmt. If *pAmt==0, then the value returned will not be 3630 ** a valid pointer. 3631 ** 3632 ** This routine is an optimization. 
It is common for the entire key 3633 ** and data to fit on the local page and for there to be no overflow 3634 ** pages. When that is so, this routine can be used to access the 3635 ** key and data without making a copy. If the key and/or data spills 3636 ** onto overflow pages, then accessPayload() must be used to reassemble 3637 ** the key/data and copy it into a preallocated buffer. 3638 ** 3639 ** The pointer returned by this routine looks directly into the cached 3640 ** page of the database. The data might change or move the next time 3641 ** any btree routine is called. 3642 */ 3643 static const unsigned char *fetchPayload( 3644 BtCursor *pCur, /* Cursor pointing to entry to read from */ 3645 int *pAmt, /* Write the number of available bytes here */ 3646 int skipKey /* read beginning at data if this is true */ 3647 ){ 3648 unsigned char *aPayload; 3649 MemPage *pPage; 3650 u32 nKey; 3651 u32 nLocal; 3652 3653 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]); 3654 assert( pCur->eState==CURSOR_VALID ); 3655 assert( cursorHoldsMutex(pCur) ); 3656 pPage = pCur->apPage[pCur->iPage]; 3657 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 3658 getCellInfo(pCur); 3659 aPayload = pCur->info.pCell; 3660 aPayload += pCur->info.nHeader; 3661 if( pPage->intKey ){ 3662 nKey = 0; 3663 }else{ 3664 nKey = (int)pCur->info.nKey; 3665 } 3666 if( skipKey ){ 3667 aPayload += nKey; 3668 nLocal = pCur->info.nLocal - nKey; 3669 }else{ 3670 nLocal = pCur->info.nLocal; 3671 if( nLocal>nKey ){ 3672 nLocal = nKey; 3673 } 3674 } 3675 *pAmt = nLocal; 3676 return aPayload; 3677 } 3678 3679 3680 /* 3681 ** For the entry that cursor pCur is point to, return as 3682 ** many bytes of the key or data as are available on the local 3683 ** b-tree page. Write the number of available bytes into *pAmt. 3684 ** 3685 ** The pointer returned is ephemeral. 
The key/data may move 3686 ** or be destroyed on the next call to any Btree routine, 3687 ** including calls from other threads against the same cache. 3688 ** Hence, a mutex on the BtShared should be held prior to calling 3689 ** this routine. 3690 ** 3691 ** These routines is used to get quick access to key and data 3692 ** in the common case where no overflow pages are used. 3693 */ 3694 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){ 3695 assert( cursorHoldsMutex(pCur) ); 3696 if( pCur->eState==CURSOR_VALID ){ 3697 return (const void*)fetchPayload(pCur, pAmt, 0); 3698 } 3699 return 0; 3700 } 3701 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){ 3702 assert( cursorHoldsMutex(pCur) ); 3703 if( pCur->eState==CURSOR_VALID ){ 3704 return (const void*)fetchPayload(pCur, pAmt, 1); 3705 } 3706 return 0; 3707 } 3708 3709 3710 /* 3711 ** Move the cursor down to a new child page. The newPgno argument is the 3712 ** page number of the child page to move to. 3713 */ 3714 static int moveToChild(BtCursor *pCur, u32 newPgno){ 3715 int rc; 3716 int i = pCur->iPage; 3717 MemPage *pNewPage; 3718 BtShared *pBt = pCur->pBt; 3719 3720 assert( cursorHoldsMutex(pCur) ); 3721 assert( pCur->eState==CURSOR_VALID ); 3722 assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); 3723 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 3724 return SQLITE_CORRUPT_BKPT; 3725 } 3726 rc = getAndInitPage(pBt, newPgno, &pNewPage); 3727 if( rc ) return rc; 3728 pCur->apPage[i+1] = pNewPage; 3729 pCur->aiIdx[i+1] = 0; 3730 pCur->iPage++; 3731 3732 pCur->info.nSize = 0; 3733 pCur->validNKey = 0; 3734 if( pNewPage->nCell<1 ){ 3735 return SQLITE_CORRUPT_BKPT; 3736 } 3737 return SQLITE_OK; 3738 } 3739 3740 #ifndef NDEBUG 3741 /* 3742 ** Page pParent is an internal (non-leaf) tree page. This function 3743 ** asserts that page number iChild is the left-child if the iIdx'th 3744 ** cell in page pParent. 
Or, if iIdx is equal to the total number of 3745 ** cells in pParent, that page number iChild is the right-child of 3746 ** the page. 3747 */ 3748 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ 3749 assert( iIdx<=pParent->nCell ); 3750 if( iIdx==pParent->nCell ){ 3751 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); 3752 }else{ 3753 assert( get4byte(findCell(pParent, iIdx))==iChild ); 3754 } 3755 } 3756 #else 3757 # define assertParentIndex(x,y,z) 3758 #endif 3759 3760 /* 3761 ** Move the cursor up to the parent page. 3762 ** 3763 ** pCur->idx is set to the cell index that contains the pointer 3764 ** to the page we are coming from. If we are coming from the 3765 ** right-most child page then pCur->idx is set to one more than 3766 ** the largest cell index. 3767 */ 3768 void sqlite3BtreeMoveToParent(BtCursor *pCur){ 3769 assert( cursorHoldsMutex(pCur) ); 3770 assert( pCur->eState==CURSOR_VALID ); 3771 assert( pCur->iPage>0 ); 3772 assert( pCur->apPage[pCur->iPage] ); 3773 assertParentIndex( 3774 pCur->apPage[pCur->iPage-1], 3775 pCur->aiIdx[pCur->iPage-1], 3776 pCur->apPage[pCur->iPage]->pgno 3777 ); 3778 releasePage(pCur->apPage[pCur->iPage]); 3779 pCur->iPage--; 3780 pCur->info.nSize = 0; 3781 pCur->validNKey = 0; 3782 } 3783 3784 /* 3785 ** Move the cursor to the root page 3786 */ 3787 static int moveToRoot(BtCursor *pCur){ 3788 MemPage *pRoot; 3789 int rc = SQLITE_OK; 3790 Btree *p = pCur->pBtree; 3791 BtShared *pBt = p->pBt; 3792 3793 assert( cursorHoldsMutex(pCur) ); 3794 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 3795 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 3796 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 3797 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 3798 if( pCur->eState==CURSOR_FAULT ){ 3799 return pCur->skip; 3800 } 3801 sqlite3BtreeClearCursor(pCur); 3802 } 3803 3804 if( pCur->iPage>=0 ){ 3805 int i; 3806 for(i=1; i<=pCur->iPage; i++){ 3807 releasePage(pCur->apPage[i]); 3808 } 3809 }else{ 3810 if( 3811 
SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0])) 3812 ){ 3813 pCur->eState = CURSOR_INVALID; 3814 return rc; 3815 } 3816 } 3817 3818 pRoot = pCur->apPage[0]; 3819 assert( pRoot->pgno==pCur->pgnoRoot ); 3820 pCur->iPage = 0; 3821 pCur->aiIdx[0] = 0; 3822 pCur->info.nSize = 0; 3823 pCur->atLast = 0; 3824 pCur->validNKey = 0; 3825 3826 if( pRoot->nCell==0 && !pRoot->leaf ){ 3827 Pgno subpage; 3828 assert( pRoot->pgno==1 ); 3829 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 3830 assert( subpage>0 ); 3831 pCur->eState = CURSOR_VALID; 3832 rc = moveToChild(pCur, subpage); 3833 }else{ 3834 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID); 3835 } 3836 return rc; 3837 } 3838 3839 /* 3840 ** Move the cursor down to the left-most leaf entry beneath the 3841 ** entry to which it is currently pointing. 3842 ** 3843 ** The left-most leaf is the one with the smallest key - the first 3844 ** in ascending order. 3845 */ 3846 static int moveToLeftmost(BtCursor *pCur){ 3847 Pgno pgno; 3848 int rc = SQLITE_OK; 3849 MemPage *pPage; 3850 3851 assert( cursorHoldsMutex(pCur) ); 3852 assert( pCur->eState==CURSOR_VALID ); 3853 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ 3854 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 3855 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage])); 3856 rc = moveToChild(pCur, pgno); 3857 } 3858 return rc; 3859 } 3860 3861 /* 3862 ** Move the cursor down to the right-most leaf entry beneath the 3863 ** page to which it is currently pointing. Notice the difference 3864 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 3865 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 3866 ** finds the right-most entry beneath the *page*. 3867 ** 3868 ** The right-most entry is the one with the largest key - the last 3869 ** key in ascending order. 
3870 */ 3871 static int moveToRightmost(BtCursor *pCur){ 3872 Pgno pgno; 3873 int rc = SQLITE_OK; 3874 MemPage *pPage = 0; 3875 3876 assert( cursorHoldsMutex(pCur) ); 3877 assert( pCur->eState==CURSOR_VALID ); 3878 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ 3879 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 3880 pCur->aiIdx[pCur->iPage] = pPage->nCell; 3881 rc = moveToChild(pCur, pgno); 3882 } 3883 if( rc==SQLITE_OK ){ 3884 pCur->aiIdx[pCur->iPage] = pPage->nCell-1; 3885 pCur->info.nSize = 0; 3886 pCur->validNKey = 0; 3887 } 3888 return rc; 3889 } 3890 3891 /* Move the cursor to the first entry in the table. Return SQLITE_OK 3892 ** on success. Set *pRes to 0 if the cursor actually points to something 3893 ** or set *pRes to 1 if the table is empty. 3894 */ 3895 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 3896 int rc; 3897 3898 assert( cursorHoldsMutex(pCur) ); 3899 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 3900 rc = moveToRoot(pCur); 3901 if( rc==SQLITE_OK ){ 3902 if( pCur->eState==CURSOR_INVALID ){ 3903 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 3904 *pRes = 1; 3905 rc = SQLITE_OK; 3906 }else{ 3907 assert( pCur->apPage[pCur->iPage]->nCell>0 ); 3908 *pRes = 0; 3909 rc = moveToLeftmost(pCur); 3910 } 3911 } 3912 return rc; 3913 } 3914 3915 /* Move the cursor to the last entry in the table. Return SQLITE_OK 3916 ** on success. Set *pRes to 0 if the cursor actually points to something 3917 ** or set *pRes to 1 if the table is empty. 
3918 */ 3919 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 3920 int rc; 3921 3922 assert( cursorHoldsMutex(pCur) ); 3923 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 3924 rc = moveToRoot(pCur); 3925 if( rc==SQLITE_OK ){ 3926 if( CURSOR_INVALID==pCur->eState ){ 3927 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 3928 *pRes = 1; 3929 }else{ 3930 assert( pCur->eState==CURSOR_VALID ); 3931 *pRes = 0; 3932 rc = moveToRightmost(pCur); 3933 pCur->atLast = rc==SQLITE_OK ?1:0; 3934 } 3935 } 3936 return rc; 3937 } 3938 3939 /* Move the cursor so that it points to an entry near the key 3940 ** specified by pIdxKey or intKey. Return a success code. 3941 ** 3942 ** For INTKEY tables, the intKey parameter is used. pIdxKey 3943 ** must be NULL. For index tables, pIdxKey is used and intKey 3944 ** is ignored. 3945 ** 3946 ** If an exact match is not found, then the cursor is always 3947 ** left pointing at a leaf page which would hold the entry if it 3948 ** were present. The cursor might point to an entry that comes 3949 ** before or after the key. 3950 ** 3951 ** An integer is written into *pRes which is the result of 3952 ** comparing the key with the entry to which the cursor is 3953 ** pointing. The meaning of the integer written into 3954 ** *pRes is as follows: 3955 ** 3956 ** *pRes<0 The cursor is left pointing at an entry that 3957 ** is smaller than intKey/pIdxKey or if the table is empty 3958 ** and the cursor is therefore left point to nothing. 3959 ** 3960 ** *pRes==0 The cursor is left pointing at an entry that 3961 ** exactly matches intKey/pIdxKey. 3962 ** 3963 ** *pRes>0 The cursor is left pointing at an entry that 3964 ** is larger than intKey/pIdxKey. 
3965 ** 3966 */ 3967 int sqlite3BtreeMovetoUnpacked( 3968 BtCursor *pCur, /* The cursor to be moved */ 3969 UnpackedRecord *pIdxKey, /* Unpacked index key */ 3970 i64 intKey, /* The table key */ 3971 int biasRight, /* If true, bias the search to the high end */ 3972 int *pRes /* Write search results here */ 3973 ){ 3974 int rc; 3975 3976 assert( cursorHoldsMutex(pCur) ); 3977 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 3978 3979 /* If the cursor is already positioned at the point we are trying 3980 ** to move to, then just return without doing any work */ 3981 if( pCur->eState==CURSOR_VALID && pCur->validNKey 3982 && pCur->apPage[0]->intKey 3983 ){ 3984 if( pCur->info.nKey==intKey ){ 3985 *pRes = 0; 3986 return SQLITE_OK; 3987 } 3988 if( pCur->atLast && pCur->info.nKey<intKey ){ 3989 *pRes = -1; 3990 return SQLITE_OK; 3991 } 3992 } 3993 3994 rc = moveToRoot(pCur); 3995 if( rc ){ 3996 return rc; 3997 } 3998 assert( pCur->apPage[pCur->iPage] ); 3999 assert( pCur->apPage[pCur->iPage]->isInit ); 4000 if( pCur->eState==CURSOR_INVALID ){ 4001 *pRes = -1; 4002 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 4003 return SQLITE_OK; 4004 } 4005 assert( pCur->apPage[0]->intKey || pIdxKey ); 4006 for(;;){ 4007 int lwr, upr; 4008 Pgno chldPg; 4009 MemPage *pPage = pCur->apPage[pCur->iPage]; 4010 int c = -1; /* pRes return if table is empty must be -1 */ 4011 lwr = 0; 4012 upr = pPage->nCell-1; 4013 if( (!pPage->intKey && pIdxKey==0) || upr<0 ){ 4014 rc = SQLITE_CORRUPT_BKPT; 4015 goto moveto_finish; 4016 } 4017 if( biasRight ){ 4018 pCur->aiIdx[pCur->iPage] = (u16)upr; 4019 }else{ 4020 pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2); 4021 } 4022 for(;;){ 4023 void *pCellKey; 4024 i64 nCellKey; 4025 int idx = pCur->aiIdx[pCur->iPage]; 4026 pCur->info.nSize = 0; 4027 pCur->validNKey = 1; 4028 if( pPage->intKey ){ 4029 u8 *pCell; 4030 pCell = findCell(pPage, idx) + pPage->childPtrSize; 4031 if( pPage->hasData ){ 4032 u32 dummy; 4033 pCell += getVarint32(pCell, dummy); 4034 
} 4035 getVarint(pCell, (u64*)&nCellKey); 4036 if( nCellKey==intKey ){ 4037 c = 0; 4038 }else if( nCellKey<intKey ){ 4039 c = -1; 4040 }else{ 4041 assert( nCellKey>intKey ); 4042 c = +1; 4043 } 4044 }else{ 4045 int available; 4046 pCellKey = (void *)fetchPayload(pCur, &available, 0); 4047 nCellKey = pCur->info.nKey; 4048 if( available>=nCellKey ){ 4049 c = sqlite3VdbeRecordCompare((int)nCellKey, pCellKey, pIdxKey); 4050 }else{ 4051 pCellKey = sqlite3Malloc( (int)nCellKey ); 4052 if( pCellKey==0 ){ 4053 rc = SQLITE_NOMEM; 4054 goto moveto_finish; 4055 } 4056 rc = sqlite3BtreeKey(pCur, 0, (int)nCellKey, (void*)pCellKey); 4057 c = sqlite3VdbeRecordCompare((int)nCellKey, pCellKey, pIdxKey); 4058 sqlite3_free(pCellKey); 4059 if( rc ) goto moveto_finish; 4060 } 4061 } 4062 if( c==0 ){ 4063 pCur->info.nKey = nCellKey; 4064 if( pPage->intKey && !pPage->leaf ){ 4065 lwr = idx; 4066 upr = lwr - 1; 4067 break; 4068 }else{ 4069 *pRes = 0; 4070 rc = SQLITE_OK; 4071 goto moveto_finish; 4072 } 4073 } 4074 if( c<0 ){ 4075 lwr = idx+1; 4076 }else{ 4077 upr = idx-1; 4078 } 4079 if( lwr>upr ){ 4080 pCur->info.nKey = nCellKey; 4081 break; 4082 } 4083 pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2); 4084 } 4085 assert( lwr==upr+1 ); 4086 assert( pPage->isInit ); 4087 if( pPage->leaf ){ 4088 chldPg = 0; 4089 }else if( lwr>=pPage->nCell ){ 4090 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 4091 }else{ 4092 chldPg = get4byte(findCell(pPage, lwr)); 4093 } 4094 if( chldPg==0 ){ 4095 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 4096 if( pRes ) *pRes = c; 4097 rc = SQLITE_OK; 4098 goto moveto_finish; 4099 } 4100 pCur->aiIdx[pCur->iPage] = (u16)lwr; 4101 pCur->info.nSize = 0; 4102 pCur->validNKey = 0; 4103 rc = moveToChild(pCur, chldPg); 4104 if( rc ) goto moveto_finish; 4105 } 4106 moveto_finish: 4107 return rc; 4108 } 4109 4110 /* 4111 ** In this version of BtreeMoveto, pKey is a packed index record 4112 ** such as is generated by the OP_MakeRecord opcode. 
Unpack the 4113 ** record and then call BtreeMovetoUnpacked() to do the work. 4114 */ 4115 int sqlite3BtreeMoveto( 4116 BtCursor *pCur, /* Cursor open on the btree to be searched */ 4117 const void *pKey, /* Packed key if the btree is an index */ 4118 i64 nKey, /* Integer key for tables. Size of pKey for indices */ 4119 int bias, /* Bias search to the high end */ 4120 int *pRes /* Write search results here */ 4121 ){ 4122 int rc; /* Status code */ 4123 UnpackedRecord *pIdxKey; /* Unpacked index key */ 4124 char aSpace[150]; /* Temp space for pIdxKey - to avoid a malloc */ 4125 4126 4127 if( pKey ){ 4128 assert( nKey==(i64)(int)nKey ); 4129 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, 4130 aSpace, sizeof(aSpace)); 4131 if( pIdxKey==0 ) return SQLITE_NOMEM; 4132 }else{ 4133 pIdxKey = 0; 4134 } 4135 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes); 4136 if( pKey ){ 4137 sqlite3VdbeDeleteUnpackedRecord(pIdxKey); 4138 } 4139 return rc; 4140 } 4141 4142 4143 /* 4144 ** Return TRUE if the cursor is not pointing at an entry of the table. 4145 ** 4146 ** TRUE will be returned after a call to sqlite3BtreeNext() moves 4147 ** past the last entry in the table or sqlite3BtreePrev() moves past 4148 ** the first entry. TRUE is also returned if the table is empty. 4149 */ 4150 int sqlite3BtreeEof(BtCursor *pCur){ 4151 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 4152 ** have been deleted? This API will need to change to return an error code 4153 ** as well as the boolean result value. 4154 */ 4155 return (CURSOR_VALID!=pCur->eState); 4156 } 4157 4158 /* 4159 ** Return the database connection handle for a cursor. 4160 */ 4161 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){ 4162 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4163 return pCur->pBtree->db; 4164 } 4165 4166 /* 4167 ** Advance the cursor to the next entry in the database. If 4168 ** successful then set *pRes=0. 
If the cursor 4169 ** was already pointing to the last entry in the database before 4170 ** this routine was called, then set *pRes=1. 4171 */ 4172 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ 4173 int rc; 4174 int idx; 4175 MemPage *pPage; 4176 4177 assert( cursorHoldsMutex(pCur) ); 4178 rc = restoreCursorPosition(pCur); 4179 if( rc!=SQLITE_OK ){ 4180 return rc; 4181 } 4182 assert( pRes!=0 ); 4183 if( CURSOR_INVALID==pCur->eState ){ 4184 *pRes = 1; 4185 return SQLITE_OK; 4186 } 4187 if( pCur->skip>0 ){ 4188 pCur->skip = 0; 4189 *pRes = 0; 4190 return SQLITE_OK; 4191 } 4192 pCur->skip = 0; 4193 4194 pPage = pCur->apPage[pCur->iPage]; 4195 idx = ++pCur->aiIdx[pCur->iPage]; 4196 assert( pPage->isInit ); 4197 assert( idx<=pPage->nCell ); 4198 4199 pCur->info.nSize = 0; 4200 pCur->validNKey = 0; 4201 if( idx>=pPage->nCell ){ 4202 if( !pPage->leaf ){ 4203 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 4204 if( rc ) return rc; 4205 rc = moveToLeftmost(pCur); 4206 *pRes = 0; 4207 return rc; 4208 } 4209 do{ 4210 if( pCur->iPage==0 ){ 4211 *pRes = 1; 4212 pCur->eState = CURSOR_INVALID; 4213 return SQLITE_OK; 4214 } 4215 sqlite3BtreeMoveToParent(pCur); 4216 pPage = pCur->apPage[pCur->iPage]; 4217 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell ); 4218 *pRes = 0; 4219 if( pPage->intKey ){ 4220 rc = sqlite3BtreeNext(pCur, pRes); 4221 }else{ 4222 rc = SQLITE_OK; 4223 } 4224 return rc; 4225 } 4226 *pRes = 0; 4227 if( pPage->leaf ){ 4228 return SQLITE_OK; 4229 } 4230 rc = moveToLeftmost(pCur); 4231 return rc; 4232 } 4233 4234 4235 /* 4236 ** Step the cursor to the back to the previous entry in the database. If 4237 ** successful then set *pRes=0. If the cursor 4238 ** was already pointing to the first entry in the database before 4239 ** this routine was called, then set *pRes=1. 
4240 */ 4241 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ 4242 int rc; 4243 MemPage *pPage; 4244 4245 assert( cursorHoldsMutex(pCur) ); 4246 rc = restoreCursorPosition(pCur); 4247 if( rc!=SQLITE_OK ){ 4248 return rc; 4249 } 4250 pCur->atLast = 0; 4251 if( CURSOR_INVALID==pCur->eState ){ 4252 *pRes = 1; 4253 return SQLITE_OK; 4254 } 4255 if( pCur->skip<0 ){ 4256 pCur->skip = 0; 4257 *pRes = 0; 4258 return SQLITE_OK; 4259 } 4260 pCur->skip = 0; 4261 4262 pPage = pCur->apPage[pCur->iPage]; 4263 assert( pPage->isInit ); 4264 if( !pPage->leaf ){ 4265 int idx = pCur->aiIdx[pCur->iPage]; 4266 rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); 4267 if( rc ){ 4268 return rc; 4269 } 4270 rc = moveToRightmost(pCur); 4271 }else{ 4272 while( pCur->aiIdx[pCur->iPage]==0 ){ 4273 if( pCur->iPage==0 ){ 4274 pCur->eState = CURSOR_INVALID; 4275 *pRes = 1; 4276 return SQLITE_OK; 4277 } 4278 sqlite3BtreeMoveToParent(pCur); 4279 } 4280 pCur->info.nSize = 0; 4281 pCur->validNKey = 0; 4282 4283 pCur->aiIdx[pCur->iPage]--; 4284 pPage = pCur->apPage[pCur->iPage]; 4285 if( pPage->intKey && !pPage->leaf ){ 4286 rc = sqlite3BtreePrevious(pCur, pRes); 4287 }else{ 4288 rc = SQLITE_OK; 4289 } 4290 } 4291 *pRes = 0; 4292 return rc; 4293 } 4294 4295 /* 4296 ** Allocate a new page from the database file. 4297 ** 4298 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 4299 ** has already been called on the new page.) The new page has also 4300 ** been referenced and the calling routine is responsible for calling 4301 ** sqlite3PagerUnref() on the new page when it is done. 4302 ** 4303 ** SQLITE_OK is returned on success. Any other return value indicates 4304 ** an error. *ppPage and *pPgno are undefined in the event of an error. 4305 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned. 4306 ** 4307 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 4308 ** locate a page close to the page number "nearby". 
This can be used in an 4309 ** attempt to keep related pages close to each other in the database file, 4310 ** which in turn can make database access faster. 4311 ** 4312 ** If the "exact" parameter is not 0, and the page-number nearby exists 4313 ** anywhere on the free-list, then it is guarenteed to be returned. This 4314 ** is only used by auto-vacuum databases when allocating a new table. 4315 */ 4316 static int allocateBtreePage( 4317 BtShared *pBt, 4318 MemPage **ppPage, 4319 Pgno *pPgno, 4320 Pgno nearby, 4321 u8 exact 4322 ){ 4323 MemPage *pPage1; 4324 int rc; 4325 int n; /* Number of pages on the freelist */ 4326 int k; /* Number of leaves on the trunk of the freelist */ 4327 MemPage *pTrunk = 0; 4328 MemPage *pPrevTrunk = 0; 4329 4330 assert( sqlite3_mutex_held(pBt->mutex) ); 4331 pPage1 = pBt->pPage1; 4332 n = get4byte(&pPage1->aData[36]); 4333 if( n>0 ){ 4334 /* There are pages on the freelist. Reuse one of those pages. */ 4335 Pgno iTrunk; 4336 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 4337 4338 /* If the 'exact' parameter was true and a query of the pointer-map 4339 ** shows that the page 'nearby' is somewhere on the free-list, then 4340 ** the entire-list will be searched for that page. 4341 */ 4342 #ifndef SQLITE_OMIT_AUTOVACUUM 4343 if( exact && nearby<=pagerPagecount(pBt) ){ 4344 u8 eType; 4345 assert( nearby>0 ); 4346 assert( pBt->autoVacuum ); 4347 rc = ptrmapGet(pBt, nearby, &eType, 0); 4348 if( rc ) return rc; 4349 if( eType==PTRMAP_FREEPAGE ){ 4350 searchList = 1; 4351 } 4352 *pPgno = nearby; 4353 } 4354 #endif 4355 4356 /* Decrement the free-list count by 1. Set iTrunk to the index of the 4357 ** first free-list trunk page. iPrevTrunk is initially 1. 4358 */ 4359 rc = sqlite3PagerWrite(pPage1->pDbPage); 4360 if( rc ) return rc; 4361 put4byte(&pPage1->aData[36], n-1); 4362 4363 /* The code within this loop is run only once if the 'searchList' variable 4364 ** is not true. 
Otherwise, it runs once for each trunk-page on the 4365 ** free-list until the page 'nearby' is located. 4366 */ 4367 do { 4368 pPrevTrunk = pTrunk; 4369 if( pPrevTrunk ){ 4370 iTrunk = get4byte(&pPrevTrunk->aData[0]); 4371 }else{ 4372 iTrunk = get4byte(&pPage1->aData[32]); 4373 } 4374 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0); 4375 if( rc ){ 4376 pTrunk = 0; 4377 goto end_allocate_page; 4378 } 4379 4380 k = get4byte(&pTrunk->aData[4]); 4381 if( k==0 && !searchList ){ 4382 /* The trunk has no leaves and the list is not being searched. 4383 ** So extract the trunk page itself and use it as the newly 4384 ** allocated page */ 4385 assert( pPrevTrunk==0 ); 4386 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4387 if( rc ){ 4388 goto end_allocate_page; 4389 } 4390 *pPgno = iTrunk; 4391 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 4392 *ppPage = pTrunk; 4393 pTrunk = 0; 4394 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 4395 }else if( k>pBt->usableSize/4 - 2 ){ 4396 /* Value of k is out of range. Database corruption */ 4397 rc = SQLITE_CORRUPT_BKPT; 4398 goto end_allocate_page; 4399 #ifndef SQLITE_OMIT_AUTOVACUUM 4400 }else if( searchList && nearby==iTrunk ){ 4401 /* The list is being searched and this trunk page is the page 4402 ** to allocate, regardless of whether it has leaves. 4403 */ 4404 assert( *pPgno==iTrunk ); 4405 *ppPage = pTrunk; 4406 searchList = 0; 4407 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4408 if( rc ){ 4409 goto end_allocate_page; 4410 } 4411 if( k==0 ){ 4412 if( !pPrevTrunk ){ 4413 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 4414 }else{ 4415 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 4416 } 4417 }else{ 4418 /* The trunk page is required by the caller but it contains 4419 ** pointers to free-list leaves. The first leaf becomes a trunk 4420 ** page in this case. 
4421 */ 4422 MemPage *pNewTrunk; 4423 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 4424 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); 4425 if( rc!=SQLITE_OK ){ 4426 goto end_allocate_page; 4427 } 4428 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 4429 if( rc!=SQLITE_OK ){ 4430 releasePage(pNewTrunk); 4431 goto end_allocate_page; 4432 } 4433 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 4434 put4byte(&pNewTrunk->aData[4], k-1); 4435 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 4436 releasePage(pNewTrunk); 4437 if( !pPrevTrunk ){ 4438 assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); 4439 put4byte(&pPage1->aData[32], iNewTrunk); 4440 }else{ 4441 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 4442 if( rc ){ 4443 goto end_allocate_page; 4444 } 4445 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 4446 } 4447 } 4448 pTrunk = 0; 4449 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 4450 #endif 4451 }else{ 4452 /* Extract a leaf from the trunk */ 4453 int closest; 4454 Pgno iPage; 4455 unsigned char *aData = pTrunk->aData; 4456 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4457 if( rc ){ 4458 goto end_allocate_page; 4459 } 4460 if( nearby>0 ){ 4461 int i, dist; 4462 closest = 0; 4463 dist = get4byte(&aData[8]) - nearby; 4464 if( dist<0 ) dist = -dist; 4465 for(i=1; i<k; i++){ 4466 int d2 = get4byte(&aData[8+i*4]) - nearby; 4467 if( d2<0 ) d2 = -d2; 4468 if( d2<dist ){ 4469 closest = i; 4470 dist = d2; 4471 } 4472 } 4473 }else{ 4474 closest = 0; 4475 } 4476 4477 iPage = get4byte(&aData[8+closest*4]); 4478 if( !searchList || iPage==nearby ){ 4479 int noContent; 4480 Pgno nPage; 4481 *pPgno = iPage; 4482 nPage = pagerPagecount(pBt); 4483 if( *pPgno>nPage ){ 4484 /* Free page off the end of the file */ 4485 rc = SQLITE_CORRUPT_BKPT; 4486 goto end_allocate_page; 4487 } 4488 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" 4489 ": %d more free pages\n", 4490 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 4491 if( closest<k-1 ){ 4492 
memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 4493 } 4494 put4byte(&aData[4], k-1); 4495 assert( sqlite3PagerIswriteable(pTrunk->pDbPage) ); 4496 noContent = !btreeGetHasContent(pBt, *pPgno); 4497 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, noContent); 4498 if( rc==SQLITE_OK ){ 4499 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4500 if( rc!=SQLITE_OK ){ 4501 releasePage(*ppPage); 4502 } 4503 } 4504 searchList = 0; 4505 } 4506 } 4507 releasePage(pPrevTrunk); 4508 pPrevTrunk = 0; 4509 }while( searchList ); 4510 }else{ 4511 /* There are no pages on the freelist, so create a new page at the 4512 ** end of the file */ 4513 int nPage = pagerPagecount(pBt); 4514 *pPgno = nPage + 1; 4515 4516 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ 4517 (*pPgno)++; 4518 } 4519 4520 #ifndef SQLITE_OMIT_AUTOVACUUM 4521 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){ 4522 /* If *pPgno refers to a pointer-map page, allocate two new pages 4523 ** at the end of the file instead of one. The first allocated page 4524 ** becomes a new pointer-map page, the second is used by the caller. 
4525 */ 4526 MemPage *pPg = 0; 4527 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno)); 4528 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4529 rc = sqlite3BtreeGetPage(pBt, *pPgno, &pPg, 0); 4530 if( rc==SQLITE_OK ){ 4531 rc = sqlite3PagerWrite(pPg->pDbPage); 4532 releasePage(pPg); 4533 } 4534 if( rc ) return rc; 4535 (*pPgno)++; 4536 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; } 4537 } 4538 #endif 4539 4540 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4541 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0); 4542 if( rc ) return rc; 4543 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4544 if( rc!=SQLITE_OK ){ 4545 releasePage(*ppPage); 4546 } 4547 TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); 4548 } 4549 4550 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4551 4552 end_allocate_page: 4553 releasePage(pTrunk); 4554 releasePage(pPrevTrunk); 4555 if( rc==SQLITE_OK ){ 4556 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ 4557 releasePage(*ppPage); 4558 return SQLITE_CORRUPT_BKPT; 4559 } 4560 (*ppPage)->isInit = 0; 4561 } 4562 return rc; 4563 } 4564 4565 /* 4566 ** This function is used to add page iPage to the database file free-list. 4567 ** It is assumed that the page is not already a part of the free-list. 4568 ** 4569 ** The value passed as the second argument to this function is optional. 4570 ** If the caller happens to have a pointer to the MemPage object 4571 ** corresponding to page iPage handy, it may pass it as the second value. 4572 ** Otherwise, it may pass NULL. 4573 ** 4574 ** If a pointer to a MemPage object is passed as the second argument, 4575 ** its reference count is not altered by this function. 4576 */ 4577 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ 4578 MemPage *pTrunk = 0; /* Free-list trunk page */ 4579 Pgno iTrunk = 0; /* Page number of free-list trunk page */ 4580 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ 4581 MemPage *pPage; /* Page being freed. May be NULL. 
*/ 4582 int rc; /* Return Code */ 4583 int nFree; /* Initial number of pages on free-list */ 4584 4585 assert( sqlite3_mutex_held(pBt->mutex) ); 4586 assert( iPage>1 ); 4587 assert( !pMemPage || pMemPage->pgno==iPage ); 4588 4589 if( pMemPage ){ 4590 pPage = pMemPage; 4591 sqlite3PagerRef(pPage->pDbPage); 4592 }else{ 4593 pPage = btreePageLookup(pBt, iPage); 4594 } 4595 4596 /* Increment the free page count on pPage1 */ 4597 rc = sqlite3PagerWrite(pPage1->pDbPage); 4598 if( rc ) goto freepage_out; 4599 nFree = get4byte(&pPage1->aData[36]); 4600 put4byte(&pPage1->aData[36], nFree+1); 4601 4602 #ifdef SQLITE_SECURE_DELETE 4603 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then 4604 ** always fully overwrite deleted information with zeros. 4605 */ 4606 if( (!pPage && (rc = sqlite3BtreeGetPage(pBt, iPage, &pPage, 0))) 4607 || (rc = sqlite3PagerWrite(pPage->pDbPage)) 4608 ){ 4609 goto freepage_out; 4610 } 4611 memset(pPage->aData, 0, pPage->pBt->pageSize); 4612 #endif 4613 4614 /* If the database supports auto-vacuum, write an entry in the pointer-map 4615 ** to indicate that the page is free. 4616 */ 4617 if( ISAUTOVACUUM ){ 4618 rc = ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0); 4619 if( rc ) goto freepage_out; 4620 } 4621 4622 /* Now manipulate the actual database free-list structure. There are two 4623 ** possibilities. If the free-list is currently empty, or if the first 4624 ** trunk page in the free-list is full, then this page will become a 4625 ** new free-list trunk page. Otherwise, it will become a leaf of the 4626 ** first trunk page in the current free-list. This block tests if it 4627 ** is possible to add the page as a new free-list leaf. 
4628 */ 4629 if( nFree!=0 ){ 4630 int nLeaf; /* Initial number of leaf cells on trunk page */ 4631 4632 iTrunk = get4byte(&pPage1->aData[32]); 4633 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0); 4634 if( rc!=SQLITE_OK ){ 4635 goto freepage_out; 4636 } 4637 4638 nLeaf = get4byte(&pTrunk->aData[4]); 4639 if( nLeaf<0 ){ 4640 rc = SQLITE_CORRUPT_BKPT; 4641 goto freepage_out; 4642 } 4643 if( nLeaf<pBt->usableSize/4 - 8 ){ 4644 /* In this case there is room on the trunk page to insert the page 4645 ** being freed as a new leaf. 4646 ** 4647 ** Note that the trunk page is not really full until it contains 4648 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have 4649 ** coded. But due to a coding error in versions of SQLite prior to 4650 ** 3.6.0, databases with freelist trunk pages holding more than 4651 ** usableSize/4 - 8 entries will be reported as corrupt. In order 4652 ** to maintain backwards compatibility with older versions of SQLite, 4653 ** we will contain to restrict the number of entries to usableSize/4 - 8 4654 ** for now. At some point in the future (once everyone has upgraded 4655 ** to 3.6.0 or later) we should consider fixing the conditional above 4656 ** to read "usableSize/4-2" instead of "usableSize/4-8". 4657 */ 4658 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4659 if( rc==SQLITE_OK ){ 4660 put4byte(&pTrunk->aData[4], nLeaf+1); 4661 put4byte(&pTrunk->aData[8+nLeaf*4], iPage); 4662 #ifndef SQLITE_SECURE_DELETE 4663 if( pPage ){ 4664 sqlite3PagerDontWrite(pPage->pDbPage); 4665 } 4666 #endif 4667 rc = btreeSetHasContent(pBt, iPage); 4668 } 4669 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); 4670 goto freepage_out; 4671 } 4672 } 4673 4674 /* If control flows to this point, then it was not possible to add the 4675 ** the page being freed as a leaf page of the first trunk in the free-list. 4676 ** Possibly because the free-list is empty, or possibly because the 4677 ** first trunk in the free-list is full. 
Either way, the page being freed 4678 ** will become the new first trunk page in the free-list. 4679 */ 4680 if( ((!pPage) && (0 != (rc = sqlite3BtreeGetPage(pBt, iPage, &pPage, 0)))) 4681 || (0 != (rc = sqlite3PagerWrite(pPage->pDbPage))) 4682 ){ 4683 goto freepage_out; 4684 } 4685 put4byte(pPage->aData, iTrunk); 4686 put4byte(&pPage->aData[4], 0); 4687 put4byte(&pPage1->aData[32], iPage); 4688 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk)); 4689 4690 freepage_out: 4691 if( pPage ){ 4692 pPage->isInit = 0; 4693 } 4694 releasePage(pPage); 4695 releasePage(pTrunk); 4696 return rc; 4697 } 4698 static int freePage(MemPage *pPage){ 4699 return freePage2(pPage->pBt, pPage, pPage->pgno); 4700 } 4701 4702 /* 4703 ** Free any overflow pages associated with the given Cell. 4704 */ 4705 static int clearCell(MemPage *pPage, unsigned char *pCell){ 4706 BtShared *pBt = pPage->pBt; 4707 CellInfo info; 4708 Pgno ovflPgno; 4709 int rc; 4710 int nOvfl; 4711 u16 ovflPageSize; 4712 4713 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4714 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4715 if( info.iOverflow==0 ){ 4716 return SQLITE_OK; /* No overflow pages. Return without doing anything */ 4717 } 4718 ovflPgno = get4byte(&pCell[info.iOverflow]); 4719 assert( pBt->usableSize > 4 ); 4720 ovflPageSize = pBt->usableSize - 4; 4721 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; 4722 assert( ovflPgno==0 || nOvfl>0 ); 4723 while( nOvfl-- ){ 4724 Pgno iNext = 0; 4725 MemPage *pOvfl = 0; 4726 if( ovflPgno<2 || ovflPgno>pagerPagecount(pBt) ){ 4727 /* 0 is not a legal page number and page 1 cannot be an 4728 ** overflow page. Therefore if ovflPgno<2 or past the end of the 4729 ** file the database must be corrupt. 
*/ 4730 return SQLITE_CORRUPT_BKPT; 4731 } 4732 if( nOvfl ){ 4733 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); 4734 if( rc ) return rc; 4735 } 4736 rc = freePage2(pBt, pOvfl, ovflPgno); 4737 if( pOvfl ){ 4738 sqlite3PagerUnref(pOvfl->pDbPage); 4739 } 4740 if( rc ) return rc; 4741 ovflPgno = iNext; 4742 } 4743 return SQLITE_OK; 4744 } 4745 4746 /* 4747 ** Create the byte sequence used to represent a cell on page pPage 4748 ** and write that byte sequence into pCell[]. Overflow pages are 4749 ** allocated and filled in as necessary. The calling procedure 4750 ** is responsible for making sure sufficient space has been allocated 4751 ** for pCell[]. 4752 ** 4753 ** Note that pCell does not necessary need to point to the pPage->aData 4754 ** area. pCell might point to some temporary storage. The cell will 4755 ** be constructed in this temporary area then copied into pPage->aData 4756 ** later. 4757 */ 4758 static int fillInCell( 4759 MemPage *pPage, /* The page that contains the cell */ 4760 unsigned char *pCell, /* Complete text of the cell */ 4761 const void *pKey, i64 nKey, /* The key */ 4762 const void *pData,int nData, /* The data */ 4763 int nZero, /* Extra zero bytes to append to pData */ 4764 int *pnSize /* Write cell size here */ 4765 ){ 4766 int nPayload; 4767 const u8 *pSrc; 4768 int nSrc, n, rc; 4769 int spaceLeft; 4770 MemPage *pOvfl = 0; 4771 MemPage *pToRelease = 0; 4772 unsigned char *pPrior; 4773 unsigned char *pPayload; 4774 BtShared *pBt = pPage->pBt; 4775 Pgno pgnoOvfl = 0; 4776 int nHeader; 4777 CellInfo info; 4778 4779 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4780 4781 /* pPage is not necessarily writeable since pCell might be auxiliary 4782 ** buffer space that is separate from the pPage buffer area */ 4783 assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize] 4784 || sqlite3PagerIswriteable(pPage->pDbPage) ); 4785 4786 /* Fill in the header. 
*/ 4787 nHeader = 0; 4788 if( !pPage->leaf ){ 4789 nHeader += 4; 4790 } 4791 if( pPage->hasData ){ 4792 nHeader += putVarint(&pCell[nHeader], nData+nZero); 4793 }else{ 4794 nData = nZero = 0; 4795 } 4796 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey); 4797 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4798 assert( info.nHeader==nHeader ); 4799 assert( info.nKey==nKey ); 4800 assert( info.nData==(u32)(nData+nZero) ); 4801 4802 /* Fill in the payload */ 4803 nPayload = nData + nZero; 4804 if( pPage->intKey ){ 4805 pSrc = pData; 4806 nSrc = nData; 4807 nData = 0; 4808 }else{ 4809 if( nKey>0x7fffffff || pKey==0 ){ 4810 return SQLITE_CORRUPT; 4811 } 4812 nPayload += (int)nKey; 4813 pSrc = pKey; 4814 nSrc = (int)nKey; 4815 } 4816 *pnSize = info.nSize; 4817 spaceLeft = info.nLocal; 4818 pPayload = &pCell[nHeader]; 4819 pPrior = &pCell[info.iOverflow]; 4820 4821 while( nPayload>0 ){ 4822 if( spaceLeft==0 ){ 4823 #ifndef SQLITE_OMIT_AUTOVACUUM 4824 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ 4825 if( pBt->autoVacuum ){ 4826 do{ 4827 pgnoOvfl++; 4828 } while( 4829 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 4830 ); 4831 } 4832 #endif 4833 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0); 4834 #ifndef SQLITE_OMIT_AUTOVACUUM 4835 /* If the database supports auto-vacuum, and the second or subsequent 4836 ** overflow page is being allocated, add an entry to the pointer-map 4837 ** for that page now. 4838 ** 4839 ** If this is the first overflow page, then write a partial entry 4840 ** to the pointer-map. If we write nothing to this pointer-map slot, 4841 ** then the optimistic overflow chain processing in clearCell() 4842 ** may misinterpret the uninitialised values and delete the 4843 ** wrong pages from the database. 
4844 */ 4845 if( pBt->autoVacuum && rc==SQLITE_OK ){ 4846 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); 4847 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap); 4848 if( rc ){ 4849 releasePage(pOvfl); 4850 } 4851 } 4852 #endif 4853 if( rc ){ 4854 releasePage(pToRelease); 4855 return rc; 4856 } 4857 4858 /* If pToRelease is not zero than pPrior points into the data area 4859 ** of pToRelease. Make sure pToRelease is still writeable. */ 4860 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 4861 4862 /* If pPrior is part of the data area of pPage, then make sure pPage 4863 ** is still writeable */ 4864 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize] 4865 || sqlite3PagerIswriteable(pPage->pDbPage) ); 4866 4867 put4byte(pPrior, pgnoOvfl); 4868 releasePage(pToRelease); 4869 pToRelease = pOvfl; 4870 pPrior = pOvfl->aData; 4871 put4byte(pPrior, 0); 4872 pPayload = &pOvfl->aData[4]; 4873 spaceLeft = pBt->usableSize - 4; 4874 } 4875 n = nPayload; 4876 if( n>spaceLeft ) n = spaceLeft; 4877 4878 /* If pToRelease is not zero than pPayload points into the data area 4879 ** of pToRelease. Make sure pToRelease is still writeable. */ 4880 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 4881 4882 /* If pPayload is part of the data area of pPage, then make sure pPage 4883 ** is still writeable */ 4884 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize] 4885 || sqlite3PagerIswriteable(pPage->pDbPage) ); 4886 4887 if( nSrc>0 ){ 4888 if( n>nSrc ) n = nSrc; 4889 assert( pSrc ); 4890 memcpy(pPayload, pSrc, n); 4891 }else{ 4892 memset(pPayload, 0, n); 4893 } 4894 nPayload -= n; 4895 pPayload += n; 4896 pSrc += n; 4897 nSrc -= n; 4898 spaceLeft -= n; 4899 if( nSrc==0 ){ 4900 nSrc = nData; 4901 pSrc = pData; 4902 } 4903 } 4904 releasePage(pToRelease); 4905 return SQLITE_OK; 4906 } 4907 4908 /* 4909 ** Remove the i-th cell from pPage. This routine effects pPage only. 
4910 ** The cell content is not freed or deallocated. It is assumed that 4911 ** the cell content has been copied someplace else. This routine just 4912 ** removes the reference to the cell from pPage. 4913 ** 4914 ** "sz" must be the number of bytes in the cell. 4915 */ 4916 static int dropCell(MemPage *pPage, int idx, int sz){ 4917 int i; /* Loop counter */ 4918 int pc; /* Offset to cell content of cell being deleted */ 4919 u8 *data; /* pPage->aData */ 4920 u8 *ptr; /* Used to move bytes around within data[] */ 4921 int rc; /* The return code */ 4922 4923 assert( idx>=0 && idx<pPage->nCell ); 4924 assert( sz==cellSize(pPage, idx) ); 4925 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 4926 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4927 data = pPage->aData; 4928 ptr = &data[pPage->cellOffset + 2*idx]; 4929 pc = get2byte(ptr); 4930 if( (pc<pPage->hdrOffset+6+(pPage->leaf?0:4)) 4931 || (pc+sz>pPage->pBt->usableSize) ){ 4932 return SQLITE_CORRUPT_BKPT; 4933 } 4934 rc = freeSpace(pPage, pc, sz); 4935 if( rc!=SQLITE_OK ){ 4936 return rc; 4937 } 4938 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){ 4939 ptr[0] = ptr[2]; 4940 ptr[1] = ptr[3]; 4941 } 4942 pPage->nCell--; 4943 put2byte(&data[pPage->hdrOffset+3], pPage->nCell); 4944 pPage->nFree += 2; 4945 return SQLITE_OK; 4946 } 4947 4948 /* 4949 ** Insert a new cell on pPage at cell index "i". pCell points to the 4950 ** content of the cell. 4951 ** 4952 ** If the cell content will fit on the page, then put it there. If it 4953 ** will not fit, then make a copy of the cell content into pTemp if 4954 ** pTemp is not null. Regardless of pTemp, allocate a new entry 4955 ** in pPage->aOvfl[] and make it point to the cell content (either 4956 ** in pTemp or the original pCell) and also record its index. 4957 ** Allocating a new entry in pPage->aCell[] implies that 4958 ** pPage->nOverflow is incremented. 4959 ** 4960 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the 4961 ** cell. 
The caller will overwrite them after this function returns. If 4962 ** nSkip is non-zero, then pCell may not point to an invalid memory location 4963 ** (but pCell+nSkip is always valid). 4964 */ 4965 static int insertCell( 4966 MemPage *pPage, /* Page into which we are copying */ 4967 int i, /* New cell becomes the i-th cell of the page */ 4968 u8 *pCell, /* Content of the new cell */ 4969 int sz, /* Bytes of content in pCell */ 4970 u8 *pTemp, /* Temp storage space for pCell, if needed */ 4971 u8 nSkip /* Do not write the first nSkip bytes of the cell */ 4972 ){ 4973 int idx; /* Where to write new cell content in data[] */ 4974 int j; /* Loop counter */ 4975 int top; /* First byte of content for any cell in data[] */ 4976 int end; /* First byte past the last cell pointer in data[] */ 4977 int ins; /* Index in data[] where new cell pointer is inserted */ 4978 int hdr; /* Offset into data[] of the page header */ 4979 int cellOffset; /* Address of first cell pointer in data[] */ 4980 u8 *data; /* The content of the whole page */ 4981 u8 *ptr; /* Used for moving information around in data[] */ 4982 4983 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 4984 assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 ); 4985 assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) ); 4986 assert( sz==cellSizePtr(pPage, pCell) ); 4987 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4988 if( pPage->nOverflow || sz+2>pPage->nFree ){ 4989 if( pTemp ){ 4990 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); 4991 pCell = pTemp; 4992 } 4993 j = pPage->nOverflow++; 4994 assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) ); 4995 pPage->aOvfl[j].pCell = pCell; 4996 pPage->aOvfl[j].idx = (u16)i; 4997 pPage->nFree = 0; 4998 }else{ 4999 int rc = sqlite3PagerWrite(pPage->pDbPage); 5000 if( rc!=SQLITE_OK ){ 5001 return rc; 5002 } 5003 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5004 data = pPage->aData; 5005 hdr = pPage->hdrOffset; 5006 top = 
get2byte(&data[hdr+5]); 5007 cellOffset = pPage->cellOffset; 5008 end = cellOffset + 2*pPage->nCell + 2; 5009 ins = cellOffset + 2*i; 5010 if( end > top - sz ){ 5011 rc = defragmentPage(pPage); 5012 if( rc!=SQLITE_OK ){ 5013 return rc; 5014 } 5015 top = get2byte(&data[hdr+5]); 5016 assert( end + sz <= top ); 5017 } 5018 idx = allocateSpace(pPage, sz); 5019 assert( idx>0 ); 5020 assert( end <= get2byte(&data[hdr+5]) ); 5021 if (idx+sz > pPage->pBt->usableSize) { 5022 return SQLITE_CORRUPT_BKPT; 5023 } 5024 pPage->nCell++; 5025 pPage->nFree -= 2; 5026 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); 5027 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){ 5028 ptr[0] = ptr[-2]; 5029 ptr[1] = ptr[-1]; 5030 } 5031 put2byte(&data[ins], idx); 5032 put2byte(&data[hdr+3], pPage->nCell); 5033 #ifndef SQLITE_OMIT_AUTOVACUUM 5034 if( pPage->pBt->autoVacuum ){ 5035 /* The cell may contain a pointer to an overflow page. If so, write 5036 ** the entry for the overflow page into the pointer map. 5037 */ 5038 CellInfo info; 5039 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 5040 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); 5041 if( info.iOverflow ){ 5042 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); 5043 rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno); 5044 if( rc!=SQLITE_OK ) return rc; 5045 } 5046 } 5047 #endif 5048 } 5049 5050 return SQLITE_OK; 5051 } 5052 5053 /* 5054 ** Add a list of cells to a page. The page should be initially empty. 5055 ** The cells are guaranteed to fit on the page. 
5056 */ 5057 static void assemblePage( 5058 MemPage *pPage, /* The page to be assemblied */ 5059 int nCell, /* The number of cells to add to this page */ 5060 u8 **apCell, /* Pointers to cell bodies */ 5061 u16 *aSize /* Sizes of the cells */ 5062 ){ 5063 int i; /* Loop counter */ 5064 u8 *pCellptr; /* Address of next cell pointer */ 5065 int cellbody; /* Address of next cell body */ 5066 u8 * const data = pPage->aData; /* Pointer to data for pPage */ 5067 const int hdr = pPage->hdrOffset; /* Offset of header on pPage */ 5068 const int nUsable = pPage->pBt->usableSize; /* Usable size of page */ 5069 5070 assert( pPage->nOverflow==0 ); 5071 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5072 assert( nCell>=0 && nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 ); 5073 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5074 5075 /* Check that the page has just been zeroed by zeroPage() */ 5076 assert( pPage->nCell==0 ); 5077 assert( get2byte(&data[hdr+5])==nUsable ); 5078 5079 pCellptr = &data[pPage->cellOffset + nCell*2]; 5080 cellbody = nUsable; 5081 for(i=nCell-1; i>=0; i--){ 5082 pCellptr -= 2; 5083 cellbody -= aSize[i]; 5084 put2byte(pCellptr, cellbody); 5085 memcpy(&data[cellbody], apCell[i], aSize[i]); 5086 } 5087 put2byte(&data[hdr+3], nCell); 5088 put2byte(&data[hdr+5], cellbody); 5089 pPage->nFree -= (nCell*2 + nUsable - cellbody); 5090 pPage->nCell = (u16)nCell; 5091 } 5092 5093 /* 5094 ** The following parameters determine how many adjacent pages get involved 5095 ** in a balancing operation. NN is the number of neighbors on either side 5096 ** of the page that participate in the balancing operation. NB is the 5097 ** total number of pages that participate, including the target page and 5098 ** NN neighbors on either side. 5099 ** 5100 ** The minimum value of NN is 1 (of course). 
Increasing NN above 1 5101 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 5102 ** in exchange for a larger degradation in INSERT and UPDATE performance. 5103 ** The value of NN appears to give the best results overall. 5104 */ 5105 #define NN 1 /* Number of neighbors on either side of pPage */ 5106 #define NB (NN*2+1) /* Total pages involved in the balance */ 5107 5108 /* Forward reference */ 5109 static int balance(BtCursor*, int); 5110 5111 #ifndef SQLITE_OMIT_QUICKBALANCE 5112 /* 5113 ** This version of balance() handles the common special case where 5114 ** a new entry is being inserted on the extreme right-end of the 5115 ** tree, in other words, when the new entry will become the largest 5116 ** entry in the tree. 5117 ** 5118 ** Instead of trying balance the 3 right-most leaf pages, just add 5119 ** a new page to the right-hand side and put the one new entry in 5120 ** that page. This leaves the right side of the tree somewhat 5121 ** unbalanced. But odds are that we will be inserting new entries 5122 ** at the end soon afterwards so the nearly empty page will quickly 5123 ** fill up. On average. 5124 ** 5125 ** pPage is the leaf page which is the right-most page in the tree. 5126 ** pParent is its parent. pPage must have a single overflow entry 5127 ** which is also the right-most entry on the page. 5128 */ 5129 static int balance_quick(BtCursor *pCur){ 5130 int rc; 5131 MemPage *pNew = 0; 5132 Pgno pgnoNew; 5133 u8 *pCell; 5134 u16 szCell; 5135 CellInfo info; 5136 MemPage *pPage = pCur->apPage[pCur->iPage]; 5137 MemPage *pParent = pCur->apPage[pCur->iPage-1]; 5138 BtShared *pBt = pPage->pBt; 5139 int parentIdx = pParent->nCell; /* pParent new divider cell index */ 5140 int parentSize; /* Size of new divider cell */ 5141 u8 parentCell[64]; /* Space for the new divider cell */ 5142 5143 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5144 5145 /* Allocate a new page. Insert the overflow cell from pPage 5146 ** into it. 
Then remove the overflow cell from pPage. 5147 */ 5148 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 5149 if( rc==SQLITE_OK ){ 5150 pCell = pPage->aOvfl[0].pCell; 5151 szCell = cellSizePtr(pPage, pCell); 5152 assert( sqlite3PagerIswriteable(pNew->pDbPage) ); 5153 zeroPage(pNew, pPage->aData[0]); 5154 assemblePage(pNew, 1, &pCell, &szCell); 5155 pPage->nOverflow = 0; 5156 5157 /* pPage is currently the right-child of pParent. Change this 5158 ** so that the right-child is the new page allocated above and 5159 ** pPage is the next-to-right child. 5160 ** 5161 ** Ignore the return value of the call to fillInCell(). fillInCell() 5162 ** may only return other than SQLITE_OK if it is required to allocate 5163 ** one or more overflow pages. Since an internal table B-Tree cell 5164 ** may never spill over onto an overflow page (it is a maximum of 5165 ** 13 bytes in size), it is not neccessary to check the return code. 5166 ** 5167 ** Similarly, the insertCell() function cannot fail if the page 5168 ** being inserted into is already writable and the cell does not 5169 ** contain an overflow pointer. So ignore this return code too. 5170 */ 5171 assert( pPage->nCell>0 ); 5172 pCell = findCell(pPage, pPage->nCell-1); 5173 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 5174 fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize); 5175 assert( parentSize<64 ); 5176 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 5177 insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4); 5178 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno); 5179 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 5180 5181 /* If this is an auto-vacuum database, update the pointer map 5182 ** with entries for the new page, and any pointer from the 5183 ** cell on the page to an overflow page. 
5184 */ 5185 if( ISAUTOVACUUM ){ 5186 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno); 5187 if( rc==SQLITE_OK ){ 5188 rc = ptrmapPutOvfl(pNew, 0); 5189 } 5190 } 5191 5192 /* Release the reference to the new page. */ 5193 releasePage(pNew); 5194 } 5195 5196 /* At this point the pPage->nFree variable is not set correctly with 5197 ** respect to the content of the page (because it was set to 0 by 5198 ** insertCell). So call sqlite3BtreeInitPage() to make sure it is 5199 ** correct. 5200 ** 5201 ** This has to be done even if an error will be returned. Normally, if 5202 ** an error occurs during tree balancing, the contents of MemPage are 5203 ** not important, as they will be recalculated when the page is rolled 5204 ** back. But here, in balance_quick(), it is possible that pPage has 5205 ** not yet been marked dirty or written into the journal file. Therefore 5206 ** it will not be rolled back and so it is important to make sure that 5207 ** the page data and contents of MemPage are consistent. 5208 */ 5209 pPage->isInit = 0; 5210 sqlite3BtreeInitPage(pPage); 5211 assert( pPage->nOverflow==0 ); 5212 5213 /* If everything else succeeded, balance the parent page, in 5214 ** case the divider cell inserted caused it to become overfull. 5215 */ 5216 if( rc==SQLITE_OK ){ 5217 releasePage(pPage); 5218 pCur->iPage--; 5219 rc = balance(pCur, 0); 5220 } 5221 return rc; 5222 } 5223 #endif /* SQLITE_OMIT_QUICKBALANCE */ 5224 5225 /* 5226 ** This routine redistributes Cells on pPage and up to NN*2 siblings 5227 ** of pPage so that all pages have about the same amount of free space. 5228 ** Usually NN siblings on either side of pPage is used in the balancing, 5229 ** though more siblings might come from one side if pPage is the first 5230 ** or last child of its parent. If pPage has fewer than 2*NN siblings 5231 ** (something which can only happen if pPage is the root page or a 5232 ** child of root) then all available siblings participate in the balancing. 
5233 ** 5234 ** The number of siblings of pPage might be increased or decreased by one or 5235 ** two in an effort to keep pages nearly full but not over full. The root page 5236 ** is special and is allowed to be nearly empty. If pPage is 5237 ** the root page, then the depth of the tree might be increased 5238 ** or decreased by one, as necessary, to keep the root page from being 5239 ** overfull or completely empty. 5240 ** 5241 ** Note that when this routine is called, some of the Cells on pPage 5242 ** might not actually be stored in pPage->aData[]. This can happen 5243 ** if the page is overfull. Part of the job of this routine is to 5244 ** make sure all Cells for pPage once again fit in pPage->aData[]. 5245 ** 5246 ** In the course of balancing the siblings of pPage, the parent of pPage 5247 ** might become overfull or underfull. If that happens, then this routine 5248 ** is called recursively on the parent. 5249 ** 5250 ** If this routine fails for any reason, it might leave the database 5251 ** in a corrupted state. So if this routine fails, the database should 5252 ** be rolled back. 5253 */ 5254 static int balance_nonroot(BtCursor *pCur){ 5255 MemPage *pPage; /* The over or underfull page to balance */ 5256 MemPage *pParent; /* The parent of pPage */ 5257 BtShared *pBt; /* The whole database */ 5258 int nCell = 0; /* Number of cells in apCell[] */ 5259 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 5260 int nOld = 0; /* Number of pages in apOld[] */ 5261 int nNew = 0; /* Number of pages in apNew[] */ 5262 int nDiv; /* Number of cells in apDiv[] */ 5263 int i, j, k; /* Loop counters */ 5264 int idx; /* Index of pPage in pParent->aCell[] */ 5265 int nxDiv; /* Next divider slot in pParent->aCell[] */ 5266 int rc; /* The return code */ 5267 int leafCorrection; /* 4 if pPage is a leaf. 
0 if not */ 5268 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 5269 int usableSpace; /* Bytes in pPage beyond the header */ 5270 int pageFlags; /* Value of pPage->aData[0] */ 5271 int subtotal; /* Subtotal of bytes in cells on one page */ 5272 int iSpace1 = 0; /* First unused byte of aSpace1[] */ 5273 int iSpace2 = 0; /* First unused byte of aSpace2[] */ 5274 int szScratch; /* Size of scratch memory requested */ 5275 MemPage *apOld[NB]; /* pPage and up to two siblings */ 5276 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */ 5277 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ 5278 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 5279 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */ 5280 u8 *apDiv[NB]; /* Divider cells in pParent */ 5281 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ 5282 int szNew[NB+2]; /* Combined size of cells place on i-th page */ 5283 u8 **apCell = 0; /* All cells begin balanced */ 5284 u16 *szCell; /* Local size of all cells in apCell[] */ 5285 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */ 5286 u8 *aSpace1; /* Space for copies of dividers cells before balance */ 5287 u8 *aSpace2 = 0; /* Space for overflow dividers cells after balance */ 5288 u8 *aFrom = 0; 5289 5290 pPage = pCur->apPage[pCur->iPage]; 5291 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5292 VVA_ONLY( pCur->pagesShuffled = 1 ); 5293 5294 /* 5295 ** Find the parent page. 
5296 */ 5297 assert( pCur->iPage>0 ); 5298 assert( pPage->isInit ); 5299 assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 ); 5300 pBt = pPage->pBt; 5301 pParent = pCur->apPage[pCur->iPage-1]; 5302 assert( pParent ); 5303 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){ 5304 goto balance_cleanup; 5305 } 5306 5307 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); 5308 5309 #ifndef SQLITE_OMIT_QUICKBALANCE 5310 /* 5311 ** A special case: If a new entry has just been inserted into a 5312 ** table (that is, a btree with integer keys and all data at the leaves) 5313 ** and the new entry is the right-most entry in the tree (it has the 5314 ** largest key) then use the special balance_quick() routine for 5315 ** balancing. balance_quick() is much faster and results in a tighter 5316 ** packing of data in the common case. 5317 */ 5318 if( pPage->leaf && 5319 pPage->intKey && 5320 pPage->nOverflow==1 && 5321 pPage->aOvfl[0].idx==pPage->nCell && 5322 pParent->pgno!=1 && 5323 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno 5324 ){ 5325 assert( pPage->intKey ); 5326 /* 5327 ** TODO: Check the siblings to the left of pPage. It may be that 5328 ** they are not full and no new page is required. 5329 */ 5330 return balance_quick(pCur); 5331 } 5332 #endif 5333 5334 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){ 5335 goto balance_cleanup; 5336 } 5337 5338 /* 5339 ** Find the cell in the parent page whose left child points back 5340 ** to pPage. The "idx" variable is the index of that cell. If pPage 5341 ** is the rightmost child of pParent then set idx to pParent->nCell 5342 */ 5343 idx = pCur->aiIdx[pCur->iPage-1]; 5344 assertParentIndex(pParent, idx, pPage->pgno); 5345 5346 /* 5347 ** Find sibling pages to pPage and the cells in pParent that divide 5348 ** the siblings. An attempt is made to find NN siblings on either 5349 ** side of pPage. 
More siblings are taken from one side, however, if 5350 ** pPage there are fewer than NN siblings on the other side. If pParent 5351 ** has NB or fewer children then all children of pParent are taken. 5352 */ 5353 nxDiv = idx - NN; 5354 if( nxDiv + NB > pParent->nCell ){ 5355 nxDiv = pParent->nCell - NB + 1; 5356 } 5357 if( nxDiv<0 ){ 5358 nxDiv = 0; 5359 } 5360 nDiv = 0; 5361 for(i=0, k=nxDiv; i<NB; i++, k++){ 5362 if( k<pParent->nCell ){ 5363 apDiv[i] = findCell(pParent, k); 5364 nDiv++; 5365 assert( !pParent->leaf ); 5366 pgnoOld[i] = get4byte(apDiv[i]); 5367 }else if( k==pParent->nCell ){ 5368 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]); 5369 }else{ 5370 break; 5371 } 5372 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]); 5373 if( rc ) goto balance_cleanup; 5374 /* apOld[i]->idxParent = k; */ 5375 apCopy[i] = 0; 5376 assert( i==nOld ); 5377 nOld++; 5378 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; 5379 } 5380 5381 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte 5382 ** alignment */ 5383 nMaxCells = (nMaxCells + 3)&~3; 5384 5385 /* 5386 ** Allocate space for memory structures 5387 */ 5388 szScratch = 5389 nMaxCells*sizeof(u8*) /* apCell */ 5390 + nMaxCells*sizeof(u16) /* szCell */ 5391 + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB /* aCopy */ 5392 + pBt->pageSize /* aSpace1 */ 5393 + (ISAUTOVACUUM ? 
nMaxCells : 0); /* aFrom */ 5394 apCell = sqlite3ScratchMalloc( szScratch ); 5395 if( apCell==0 ){ 5396 rc = SQLITE_NOMEM; 5397 goto balance_cleanup; 5398 } 5399 szCell = (u16*)&apCell[nMaxCells]; 5400 aCopy[0] = (u8*)&szCell[nMaxCells]; 5401 assert( EIGHT_BYTE_ALIGNMENT(aCopy[0]) ); 5402 for(i=1; i<NB; i++){ 5403 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))]; 5404 assert( ((aCopy[i] - (u8*)0) & 7)==0 ); /* 8-byte alignment required */ 5405 } 5406 aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))]; 5407 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); 5408 if( ISAUTOVACUUM ){ 5409 aFrom = &aSpace1[pBt->pageSize]; 5410 } 5411 aSpace2 = sqlite3PageMalloc(pBt->pageSize); 5412 if( aSpace2==0 ){ 5413 rc = SQLITE_NOMEM; 5414 goto balance_cleanup; 5415 } 5416 5417 /* 5418 ** Make copies of the content of pPage and its siblings into aOld[]. 5419 ** The rest of this function will use data from the copies rather 5420 ** that the original pages since the original pages will be in the 5421 ** process of being overwritten. 5422 */ 5423 for(i=0; i<nOld; i++){ 5424 MemPage *p = apCopy[i] = (MemPage*)aCopy[i]; 5425 memcpy(p, apOld[i], sizeof(MemPage)); 5426 p->aData = (void*)&p[1]; 5427 memcpy(p->aData, apOld[i]->aData, pBt->pageSize); 5428 } 5429 5430 /* 5431 ** Load pointers to all cells on sibling pages and the divider cells 5432 ** into the local apCell[] array. Make copies of the divider cells 5433 ** into space obtained form aSpace1[] and remove the the divider Cells 5434 ** from pParent. 5435 ** 5436 ** If the siblings are on leaf pages, then the child pointers of the 5437 ** divider cells are stripped from the cells before they are copied 5438 ** into aSpace1[]. In this way, all cells in apCell[] are without 5439 ** child pointers. If siblings are not leaves, then all cell in 5440 ** apCell[] include child pointers. Either way, all cells in apCell[] 5441 ** are alike. 5442 ** 5443 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 
5444 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 5445 */ 5446 nCell = 0; 5447 leafCorrection = pPage->leaf*4; 5448 leafData = pPage->hasData; 5449 for(i=0; i<nOld; i++){ 5450 MemPage *pOld = apCopy[i]; 5451 int limit = pOld->nCell+pOld->nOverflow; 5452 for(j=0; j<limit; j++){ 5453 assert( nCell<nMaxCells ); 5454 apCell[nCell] = findOverflowCell(pOld, j); 5455 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); 5456 if( ISAUTOVACUUM ){ 5457 int a; 5458 aFrom[nCell] = (u8)i; assert( i>=0 && i<6 ); 5459 for(a=0; a<pOld->nOverflow; a++){ 5460 if( pOld->aOvfl[a].pCell==apCell[nCell] ){ 5461 aFrom[nCell] = 0xFF; 5462 break; 5463 } 5464 } 5465 } 5466 nCell++; 5467 } 5468 if( i<nOld-1 ){ 5469 u16 sz = cellSizePtr(pParent, apDiv[i]); 5470 if( leafData ){ 5471 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that 5472 ** are duplicates of keys on the child pages. We need to remove 5473 ** the divider cells from pParent, but the dividers cells are not 5474 ** added to apCell[] because they are duplicates of child cells. 5475 */ 5476 dropCell(pParent, nxDiv, sz); 5477 }else{ 5478 u8 *pTemp; 5479 assert( nCell<nMaxCells ); 5480 szCell[nCell] = sz; 5481 pTemp = &aSpace1[iSpace1]; 5482 iSpace1 += sz; 5483 assert( sz<=pBt->pageSize/4 ); 5484 assert( iSpace1<=pBt->pageSize ); 5485 memcpy(pTemp, apDiv[i], sz); 5486 apCell[nCell] = pTemp+leafCorrection; 5487 if( ISAUTOVACUUM ){ 5488 aFrom[nCell] = 0xFF; 5489 } 5490 dropCell(pParent, nxDiv, sz); 5491 assert( leafCorrection==0 || leafCorrection==4 ); 5492 szCell[nCell] -= (u16)leafCorrection; 5493 assert( get4byte(pTemp)==pgnoOld[i] ); 5494 if( !pOld->leaf ){ 5495 assert( leafCorrection==0 ); 5496 /* The right pointer of the child page pOld becomes the left 5497 ** pointer of the divider cell */ 5498 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4); 5499 }else{ 5500 assert( leafCorrection==4 ); 5501 if( szCell[nCell]<4 ){ 5502 /* Do not allow any cells smaller than 4 bytes. 
*/ 5503 szCell[nCell] = 4; 5504 } 5505 } 5506 nCell++; 5507 } 5508 } 5509 } 5510 5511 /* 5512 ** Figure out the number of pages needed to hold all nCell cells. 5513 ** Store this number in "k". Also compute szNew[] which is the total 5514 ** size of all cells on the i-th page and cntNew[] which is the index 5515 ** in apCell[] of the cell that divides page i from page i+1. 5516 ** cntNew[k] should equal nCell. 5517 ** 5518 ** Values computed by this block: 5519 ** 5520 ** k: The total number of sibling pages 5521 ** szNew[i]: Spaced used on the i-th sibling page. 5522 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to 5523 ** the right of the i-th sibling page. 5524 ** usableSpace: Number of bytes of space available on each sibling. 5525 ** 5526 */ 5527 usableSpace = pBt->usableSize - 12 + leafCorrection; 5528 for(subtotal=k=i=0; i<nCell; i++){ 5529 assert( i<nMaxCells ); 5530 subtotal += szCell[i] + 2; 5531 if( subtotal > usableSpace ){ 5532 szNew[k] = subtotal - szCell[i]; 5533 cntNew[k] = i; 5534 if( leafData ){ i--; } 5535 subtotal = 0; 5536 k++; 5537 } 5538 } 5539 szNew[k] = subtotal; 5540 cntNew[k] = nCell; 5541 k++; 5542 5543 /* 5544 ** The packing computed by the previous block is biased toward the siblings 5545 ** on the left side. The left siblings are always nearly full, while the 5546 ** right-most sibling might be nearly empty. This block of code attempts 5547 ** to adjust the packing of siblings to get a better balance. 5548 ** 5549 ** This adjustment is more than an optimization. The packing above might 5550 ** be so out of balance as to be illegal. For example, the right-most 5551 ** sibling might be completely empty. This adjustment is not optional. 
5552 */ 5553 for(i=k-1; i>0; i--){ 5554 int szRight = szNew[i]; /* Size of sibling on the right */ 5555 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 5556 int r; /* Index of right-most cell in left sibling */ 5557 int d; /* Index of first cell to the left of right sibling */ 5558 5559 r = cntNew[i-1] - 1; 5560 d = r + 1 - leafData; 5561 assert( d<nMaxCells ); 5562 assert( r<nMaxCells ); 5563 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){ 5564 szRight += szCell[d] + 2; 5565 szLeft -= szCell[r] + 2; 5566 cntNew[i-1]--; 5567 r = cntNew[i-1] - 1; 5568 d = r + 1 - leafData; 5569 } 5570 szNew[i] = szRight; 5571 szNew[i-1] = szLeft; 5572 } 5573 5574 /* Either we found one or more cells (cntnew[0])>0) or we are the 5575 ** a virtual root page. A virtual root page is when the real root 5576 ** page is page 1 and we are the only child of that page. 5577 */ 5578 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); 5579 5580 /* 5581 ** Allocate k new pages. Reuse old pages where possible. 5582 */ 5583 assert( pPage->pgno>1 ); 5584 pageFlags = pPage->aData[0]; 5585 for(i=0; i<k; i++){ 5586 MemPage *pNew; 5587 if( i<nOld ){ 5588 pNew = apNew[i] = apOld[i]; 5589 pgnoNew[i] = pgnoOld[i]; 5590 apOld[i] = 0; 5591 rc = sqlite3PagerWrite(pNew->pDbPage); 5592 nNew++; 5593 if( rc ) goto balance_cleanup; 5594 }else{ 5595 assert( i>0 ); 5596 rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0); 5597 if( rc ) goto balance_cleanup; 5598 apNew[i] = pNew; 5599 nNew++; 5600 } 5601 } 5602 5603 /* Free any old pages that were not reused as new pages. 5604 */ 5605 while( i<nOld ){ 5606 rc = freePage(apOld[i]); 5607 if( rc ) goto balance_cleanup; 5608 releasePage(apOld[i]); 5609 apOld[i] = 0; 5610 i++; 5611 } 5612 5613 /* 5614 ** Put the new pages in accending order. This helps to 5615 ** keep entries in the disk file in order so that a scan 5616 ** of the table is a linear scan through the file. 
That 5617 ** in turn helps the operating system to deliver pages 5618 ** from the disk more rapidly. 5619 ** 5620 ** An O(n^2) insertion sort algorithm is used, but since 5621 ** n is never more than NB (a small constant), that should 5622 ** not be a problem. 5623 ** 5624 ** When NB==3, this one optimization makes the database 5625 ** about 25% faster for large insertions and deletions. 5626 */ 5627 for(i=0; i<k-1; i++){ 5628 int minV = pgnoNew[i]; 5629 int minI = i; 5630 for(j=i+1; j<k; j++){ 5631 if( pgnoNew[j]<(unsigned)minV ){ 5632 minI = j; 5633 minV = pgnoNew[j]; 5634 } 5635 } 5636 if( minI>i ){ 5637 int t; 5638 MemPage *pT; 5639 t = pgnoNew[i]; 5640 pT = apNew[i]; 5641 pgnoNew[i] = pgnoNew[minI]; 5642 apNew[i] = apNew[minI]; 5643 pgnoNew[minI] = t; 5644 apNew[minI] = pT; 5645 } 5646 } 5647 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", 5648 pgnoOld[0], 5649 nOld>=2 ? pgnoOld[1] : 0, 5650 nOld>=3 ? pgnoOld[2] : 0, 5651 pgnoNew[0], szNew[0], 5652 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0, 5653 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0, 5654 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0, 5655 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0)); 5656 5657 /* 5658 ** Evenly distribute the data in apCell[] across the new pages. 5659 ** Insert divider cells into pParent as necessary. 5660 */ 5661 j = 0; 5662 for(i=0; i<nNew; i++){ 5663 /* Assemble the new sibling page. */ 5664 MemPage *pNew = apNew[i]; 5665 assert( j<nMaxCells ); 5666 assert( pNew->pgno==pgnoNew[i] ); 5667 zeroPage(pNew, pageFlags); 5668 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); 5669 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); 5670 assert( pNew->nOverflow==0 ); 5671 5672 /* If this is an auto-vacuum database, update the pointer map entries 5673 ** that point to the siblings that were rearranged. These can be: left 5674 ** children of cells, the right-child of the page, or overflow pages 5675 ** pointed to by cells. 
5676 */ 5677 if( ISAUTOVACUUM ){ 5678 for(k=j; k<cntNew[i]; k++){ 5679 assert( k<nMaxCells ); 5680 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){ 5681 rc = ptrmapPutOvfl(pNew, k-j); 5682 if( rc==SQLITE_OK && leafCorrection==0 ){ 5683 rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno); 5684 } 5685 if( rc!=SQLITE_OK ){ 5686 goto balance_cleanup; 5687 } 5688 } 5689 } 5690 } 5691 5692 j = cntNew[i]; 5693 5694 /* If the sibling page assembled above was not the right-most sibling, 5695 ** insert a divider cell into the parent page. 5696 */ 5697 if( i<nNew-1 && j<nCell ){ 5698 u8 *pCell; 5699 u8 *pTemp; 5700 int sz; 5701 5702 assert( j<nMaxCells ); 5703 pCell = apCell[j]; 5704 sz = szCell[j] + leafCorrection; 5705 pTemp = &aSpace2[iSpace2]; 5706 if( !pNew->leaf ){ 5707 memcpy(&pNew->aData[8], pCell, 4); 5708 if( ISAUTOVACUUM 5709 && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno) 5710 ){ 5711 rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno); 5712 if( rc!=SQLITE_OK ){ 5713 goto balance_cleanup; 5714 } 5715 } 5716 }else if( leafData ){ 5717 /* If the tree is a leaf-data tree, and the siblings are leaves, 5718 ** then there is no divider cell in apCell[]. Instead, the divider 5719 ** cell consists of the integer key for the right-most cell of 5720 ** the sibling-page assembled above only. 5721 */ 5722 CellInfo info; 5723 j--; 5724 sqlite3BtreeParseCellPtr(pNew, apCell[j], &info); 5725 pCell = pTemp; 5726 rc = fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz); 5727 if( rc!=SQLITE_OK ){ 5728 goto balance_cleanup; 5729 } 5730 pTemp = 0; 5731 }else{ 5732 pCell -= 4; 5733 /* Obscure case for non-leaf-data trees: If the cell at pCell was 5734 ** previously stored on a leaf node, and its reported size was 4 5735 ** bytes, then it may actually be smaller than this 5736 ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of 5737 ** any cell). 
But it is important to pass the correct size to 5738 ** insertCell(), so reparse the cell now. 5739 ** 5740 ** Note that this can never happen in an SQLite data file, as all 5741 ** cells are at least 4 bytes. It only happens in b-trees used 5742 ** to evaluate "IN (SELECT ...)" and similar clauses. 5743 */ 5744 if( szCell[j]==4 ){ 5745 assert(leafCorrection==4); 5746 sz = cellSizePtr(pParent, pCell); 5747 } 5748 } 5749 iSpace2 += sz; 5750 assert( sz<=pBt->pageSize/4 ); 5751 assert( iSpace2<=pBt->pageSize ); 5752 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4); 5753 if( rc!=SQLITE_OK ) goto balance_cleanup; 5754 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 5755 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno); 5756 5757 /* If this is an auto-vacuum database, and not a leaf-data tree, 5758 ** then update the pointer map with an entry for the overflow page 5759 ** that the cell just inserted points to (if any). 5760 */ 5761 if( ISAUTOVACUUM && !leafData ){ 5762 rc = ptrmapPutOvfl(pParent, nxDiv); 5763 if( rc!=SQLITE_OK ){ 5764 goto balance_cleanup; 5765 } 5766 } 5767 j++; 5768 nxDiv++; 5769 } 5770 5771 /* Set the pointer-map entry for the new sibling page. 
*/ 5772 if( ISAUTOVACUUM ){ 5773 rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno); 5774 if( rc!=SQLITE_OK ){ 5775 goto balance_cleanup; 5776 } 5777 } 5778 } 5779 assert( j==nCell ); 5780 assert( nOld>0 ); 5781 assert( nNew>0 ); 5782 if( (pageFlags & PTF_LEAF)==0 ){ 5783 u8 *zChild = &apCopy[nOld-1]->aData[8]; 5784 memcpy(&apNew[nNew-1]->aData[8], zChild, 4); 5785 if( ISAUTOVACUUM ){ 5786 rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno); 5787 if( rc!=SQLITE_OK ){ 5788 goto balance_cleanup; 5789 } 5790 } 5791 } 5792 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 5793 if( nxDiv==pParent->nCell+pParent->nOverflow ){ 5794 /* Right-most sibling is the right-most child of pParent */ 5795 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]); 5796 }else{ 5797 /* Right-most sibling is the left child of the first entry in pParent 5798 ** past the right-most divider entry */ 5799 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]); 5800 } 5801 5802 /* 5803 ** Balance the parent page. Note that the current page (pPage) might 5804 ** have been added to the freelist so it might no longer be initialized. 5805 ** But the parent page will always be initialized. 5806 */ 5807 assert( pParent->isInit ); 5808 sqlite3ScratchFree(apCell); 5809 apCell = 0; 5810 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n", 5811 pPage->pgno, nOld, nNew, nCell)); 5812 pPage->nOverflow = 0; 5813 releasePage(pPage); 5814 pCur->iPage--; 5815 rc = balance(pCur, 0); 5816 5817 /* 5818 ** Cleanup before returning. 5819 */ 5820 balance_cleanup: 5821 sqlite3PageFree(aSpace2); 5822 sqlite3ScratchFree(apCell); 5823 for(i=0; i<nOld; i++){ 5824 releasePage(apOld[i]); 5825 } 5826 for(i=0; i<nNew; i++){ 5827 releasePage(apNew[i]); 5828 } 5829 pCur->apPage[pCur->iPage]->nOverflow = 0; 5830 5831 return rc; 5832 } 5833 5834 /* 5835 ** This routine is called for the root page of a btree when the root 5836 ** page contains no cells. 
This is an opportunity to make the tree 5837 ** shallower by one level. 5838 */ 5839 static int balance_shallower(BtCursor *pCur){ 5840 MemPage *pPage; /* Root page of B-Tree */ 5841 MemPage *pChild; /* The only child page of pPage */ 5842 Pgno pgnoChild; /* Page number for pChild */ 5843 int rc = SQLITE_OK; /* Return code from subprocedures */ 5844 BtShared *pBt; /* The main BTree structure */ 5845 int mxCellPerPage; /* Maximum number of cells per page */ 5846 u8 **apCell; /* All cells from pages being balanced */ 5847 u16 *szCell; /* Local size of all cells */ 5848 5849 assert( pCur->iPage==0 ); 5850 pPage = pCur->apPage[0]; 5851 5852 assert( pPage->nCell==0 ); 5853 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5854 pBt = pPage->pBt; 5855 mxCellPerPage = MX_CELL(pBt); 5856 apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) ); 5857 if( apCell==0 ) return SQLITE_NOMEM; 5858 szCell = (u16*)&apCell[mxCellPerPage]; 5859 if( pPage->leaf ){ 5860 /* The table is completely empty */ 5861 TRACE(("BALANCE: empty table %d\n", pPage->pgno)); 5862 }else{ 5863 /* The root page is empty but has one child. Transfer the 5864 ** information from that one child into the root page if it 5865 ** will fit. This reduces the depth of the tree by one. 5866 ** 5867 ** If the root page is page 1, it has less space available than 5868 ** its child (due to the 100 byte header that occurs at the beginning 5869 ** of the database fle), so it might not be able to hold all of the 5870 ** information currently contained in the child. If this is the 5871 ** case, then do not do the transfer. Leave page 1 empty except 5872 ** for the right-pointer to the child page. The child page becomes 5873 ** the virtual root of the tree. 
5874 */ 5875 VVA_ONLY( pCur->pagesShuffled = 1 ); 5876 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]); 5877 assert( pgnoChild>0 ); 5878 assert( pgnoChild<=pagerPagecount(pPage->pBt) ); 5879 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0); 5880 if( rc ) goto end_shallow_balance; 5881 if( pPage->pgno==1 ){ 5882 rc = sqlite3BtreeInitPage(pChild); 5883 if( rc ) goto end_shallow_balance; 5884 assert( pChild->nOverflow==0 ); 5885 if( pChild->nFree>=100 ){ 5886 /* The child information will fit on the root page, so do the 5887 ** copy */ 5888 int i; 5889 zeroPage(pPage, pChild->aData[0]); 5890 for(i=0; i<pChild->nCell; i++){ 5891 apCell[i] = findCell(pChild,i); 5892 szCell[i] = cellSizePtr(pChild, apCell[i]); 5893 } 5894 assemblePage(pPage, pChild->nCell, apCell, szCell); 5895 /* Copy the right-pointer of the child to the parent. */ 5896 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5897 put4byte(&pPage->aData[pPage->hdrOffset+8], 5898 get4byte(&pChild->aData[pChild->hdrOffset+8])); 5899 rc = freePage(pChild); 5900 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno)); 5901 }else{ 5902 /* The child has more information that will fit on the root. 5903 ** The tree is already balanced. Do nothing. 
*/ 5904 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno)); 5905 } 5906 }else{ 5907 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize); 5908 pPage->isInit = 0; 5909 rc = sqlite3BtreeInitPage(pPage); 5910 assert( rc==SQLITE_OK ); 5911 freePage(pChild); 5912 TRACE(("BALANCE: transfer child %d into root %d\n", 5913 pChild->pgno, pPage->pgno)); 5914 } 5915 assert( pPage->nOverflow==0 ); 5916 #ifndef SQLITE_OMIT_AUTOVACUUM 5917 if( ISAUTOVACUUM && rc==SQLITE_OK ){ 5918 rc = setChildPtrmaps(pPage); 5919 } 5920 #endif 5921 releasePage(pChild); 5922 } 5923 end_shallow_balance: 5924 sqlite3_free(apCell); 5925 return rc; 5926 } 5927 5928 5929 /* 5930 ** The root page is overfull 5931 ** 5932 ** When this happens, Create a new child page and copy the 5933 ** contents of the root into the child. Then make the root 5934 ** page an empty page with rightChild pointing to the new 5935 ** child. Finally, call balance_internal() on the new child 5936 ** to cause it to split. 5937 */ 5938 static int balance_deeper(BtCursor *pCur){ 5939 int rc; /* Return value from subprocedures */ 5940 MemPage *pPage; /* Pointer to the root page */ 5941 MemPage *pChild; /* Pointer to a new child page */ 5942 Pgno pgnoChild; /* Page number of the new child page */ 5943 BtShared *pBt; /* The BTree */ 5944 int usableSize; /* Total usable size of a page */ 5945 u8 *data; /* Content of the parent page */ 5946 u8 *cdata; /* Content of the child page */ 5947 int hdr; /* Offset to page header in parent */ 5948 int cbrk; /* Offset to content of first cell in parent */ 5949 5950 assert( pCur->iPage==0 ); 5951 assert( pCur->apPage[0]->nOverflow>0 ); 5952 5953 VVA_ONLY( pCur->pagesShuffled = 1 ); 5954 pPage = pCur->apPage[0]; 5955 pBt = pPage->pBt; 5956 assert( sqlite3_mutex_held(pBt->mutex) ); 5957 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5958 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0); 5959 if( rc ) return rc; 5960 assert( 
sqlite3PagerIswriteable(pChild->pDbPage) ); 5961 usableSize = pBt->usableSize; 5962 data = pPage->aData; 5963 hdr = pPage->hdrOffset; 5964 cbrk = get2byte(&data[hdr+5]); 5965 cdata = pChild->aData; 5966 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr); 5967 memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk); 5968 5969 assert( pChild->isInit==0 ); 5970 rc = sqlite3BtreeInitPage(pChild); 5971 if( rc==SQLITE_OK ){ 5972 int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]); 5973 memcpy(pChild->aOvfl, pPage->aOvfl, nCopy); 5974 pChild->nOverflow = pPage->nOverflow; 5975 if( pChild->nOverflow ){ 5976 pChild->nFree = 0; 5977 } 5978 assert( pChild->nCell==pPage->nCell ); 5979 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5980 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF); 5981 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild); 5982 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno)); 5983 if( ISAUTOVACUUM ){ 5984 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno); 5985 #ifndef SQLITE_OMIT_AUTOVACUUM 5986 if( rc==SQLITE_OK ){ 5987 rc = setChildPtrmaps(pChild); 5988 } 5989 if( rc ){ 5990 pChild->nOverflow = 0; 5991 } 5992 #endif 5993 } 5994 } 5995 5996 if( rc==SQLITE_OK ){ 5997 pCur->iPage++; 5998 pCur->apPage[1] = pChild; 5999 pCur->aiIdx[0] = 0; 6000 rc = balance_nonroot(pCur); 6001 }else{ 6002 releasePage(pChild); 6003 } 6004 6005 return rc; 6006 } 6007 6008 /* 6009 ** The page that pCur currently points to has just been modified in 6010 ** some way. This function figures out if this modification means the 6011 ** tree needs to be balanced, and if so calls the appropriate balancing 6012 ** routine. 6013 ** 6014 ** Parameter isInsert is true if a new cell was just inserted into the 6015 ** page, or false otherwise. 
6016 */ 6017 static int balance(BtCursor *pCur, int isInsert){ 6018 int rc = SQLITE_OK; 6019 MemPage *pPage = pCur->apPage[pCur->iPage]; 6020 6021 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 6022 if( pCur->iPage==0 ){ 6023 rc = sqlite3PagerWrite(pPage->pDbPage); 6024 if( rc==SQLITE_OK && pPage->nOverflow>0 ){ 6025 rc = balance_deeper(pCur); 6026 assert( pCur->apPage[0]==pPage ); 6027 assert( pPage->nOverflow==0 || rc!=SQLITE_OK ); 6028 } 6029 if( rc==SQLITE_OK && pPage->nCell==0 ){ 6030 rc = balance_shallower(pCur); 6031 assert( pCur->apPage[0]==pPage ); 6032 assert( pPage->nOverflow==0 || rc!=SQLITE_OK ); 6033 } 6034 }else{ 6035 if( pPage->nOverflow>0 || 6036 (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){ 6037 rc = balance_nonroot(pCur); 6038 } 6039 } 6040 return rc; 6041 } 6042 6043 /* 6044 ** This routine checks all cursors that point to table pgnoRoot. 6045 ** If any of those cursors were opened with wrFlag==0 in a different 6046 ** database connection (a database connection that shares the pager 6047 ** cache with the current connection) and that other connection 6048 ** is not in the ReadUncommmitted state, then this routine returns 6049 ** SQLITE_LOCKED. 6050 ** 6051 ** As well as cursors with wrFlag==0, cursors with 6052 ** isIncrblobHandle==1 are also considered 'read' cursors because 6053 ** incremental blob cursors are used for both reading and writing. 6054 ** 6055 ** When pgnoRoot is the root page of an intkey table, this function is also 6056 ** responsible for invalidating incremental blob cursors when the table row 6057 ** on which they are opened is deleted or modified. Cursors are invalidated 6058 ** according to the following rules: 6059 ** 6060 ** 1) When BtreeClearTable() is called to completely delete the contents 6061 ** of a B-Tree table, pExclude is set to zero and parameter iRow is 6062 ** set to non-zero. In this case all incremental blob cursors open 6063 ** on the table rooted at pgnoRoot are invalidated. 
6064 ** 6065 ** 2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to 6066 ** modify a table row via an SQL statement, pExclude is set to the 6067 ** write cursor used to do the modification and parameter iRow is set 6068 ** to the integer row id of the B-Tree entry being modified. Unless 6069 ** pExclude is itself an incremental blob cursor, then all incremental 6070 ** blob cursors open on row iRow of the B-Tree are invalidated. 6071 ** 6072 ** 3) If both pExclude and iRow are set to zero, no incremental blob 6073 ** cursors are invalidated. 6074 */ 6075 static int checkForReadConflicts( 6076 Btree *pBtree, /* The database file to check */ 6077 Pgno pgnoRoot, /* Look for read cursors on this btree */ 6078 BtCursor *pExclude, /* Ignore this cursor */ 6079 i64 iRow /* The rowid that might be changing */ 6080 ){ 6081 BtCursor *p; 6082 BtShared *pBt = pBtree->pBt; 6083 sqlite3 *db = pBtree->db; 6084 assert( sqlite3BtreeHoldsMutex(pBtree) ); 6085 for(p=pBt->pCursor; p; p=p->pNext){ 6086 if( p==pExclude ) continue; 6087 if( p->pgnoRoot!=pgnoRoot ) continue; 6088 #ifndef SQLITE_OMIT_INCRBLOB 6089 if( p->isIncrblobHandle && ( 6090 (!pExclude && iRow) 6091 || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow) 6092 )){ 6093 p->eState = CURSOR_INVALID; 6094 } 6095 #endif 6096 if( p->eState!=CURSOR_VALID ) continue; 6097 if( p->wrFlag==0 6098 #ifndef SQLITE_OMIT_INCRBLOB 6099 || p->isIncrblobHandle 6100 #endif 6101 ){ 6102 sqlite3 *dbOther = p->pBtree->db; 6103 assert(dbOther); 6104 if( dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0 ){ 6105 sqlite3ConnectionBlocked(db, dbOther); 6106 return SQLITE_LOCKED_SHAREDCACHE; 6107 } 6108 } 6109 } 6110 return SQLITE_OK; 6111 } 6112 6113 /* 6114 ** Insert a new record into the BTree. The key is given by (pKey,nKey) 6115 ** and the data is given by (pData,nData). The cursor is used only to 6116 ** define what table the record should be inserted into. 
The cursor 6117 ** is left pointing at a random location. 6118 ** 6119 ** For an INTKEY table, only the nKey value of the key is used. pKey is 6120 ** ignored. For a ZERODATA table, the pData and nData are both ignored. 6121 */ 6122 int sqlite3BtreeInsert( 6123 BtCursor *pCur, /* Insert data into the table of this cursor */ 6124 const void *pKey, i64 nKey, /* The key of the new record */ 6125 const void *pData, int nData, /* The data of the new record */ 6126 int nZero, /* Number of extra 0 bytes to append to data */ 6127 int appendBias /* True if this is likely an append */ 6128 ){ 6129 int rc; 6130 int loc; 6131 int szNew; 6132 int idx; 6133 MemPage *pPage; 6134 Btree *p = pCur->pBtree; 6135 BtShared *pBt = p->pBt; 6136 unsigned char *oldCell; 6137 unsigned char *newCell = 0; 6138 6139 assert( cursorHoldsMutex(pCur) ); 6140 assert( pBt->inTransaction==TRANS_WRITE ); 6141 assert( !pBt->readOnly ); 6142 assert( pCur->wrFlag ); 6143 rc = checkForReadConflicts(pCur->pBtree, pCur->pgnoRoot, pCur, nKey); 6144 if( rc ){ 6145 /* The table pCur points to has a read lock */ 6146 assert( rc==SQLITE_LOCKED_SHAREDCACHE ); 6147 return rc; 6148 } 6149 if( pCur->eState==CURSOR_FAULT ){ 6150 return pCur->skip; 6151 } 6152 6153 /* Save the positions of any other cursors open on this table */ 6154 sqlite3BtreeClearCursor(pCur); 6155 if( 6156 SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) || 6157 SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc)) 6158 ){ 6159 return rc; 6160 } 6161 6162 pPage = pCur->apPage[pCur->iPage]; 6163 assert( pPage->intKey || nKey>=0 ); 6164 assert( pPage->leaf || !pPage->intKey ); 6165 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", 6166 pCur->pgnoRoot, nKey, nData, pPage->pgno, 6167 loc==0 ? 
"overwrite" : "new entry")); 6168 assert( pPage->isInit ); 6169 allocateTempSpace(pBt); 6170 newCell = pBt->pTmpSpace; 6171 if( newCell==0 ) return SQLITE_NOMEM; 6172 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); 6173 if( rc ) goto end_insert; 6174 assert( szNew==cellSizePtr(pPage, newCell) ); 6175 assert( szNew<=MX_CELL_SIZE(pBt) ); 6176 idx = pCur->aiIdx[pCur->iPage]; 6177 if( loc==0 && CURSOR_VALID==pCur->eState ){ 6178 u16 szOld; 6179 assert( idx<pPage->nCell ); 6180 rc = sqlite3PagerWrite(pPage->pDbPage); 6181 if( rc ){ 6182 goto end_insert; 6183 } 6184 oldCell = findCell(pPage, idx); 6185 if( !pPage->leaf ){ 6186 memcpy(newCell, oldCell, 4); 6187 } 6188 szOld = cellSizePtr(pPage, oldCell); 6189 rc = clearCell(pPage, oldCell); 6190 if( rc ) goto end_insert; 6191 rc = dropCell(pPage, idx, szOld); 6192 if( rc!=SQLITE_OK ) { 6193 goto end_insert; 6194 } 6195 }else if( loc<0 && pPage->nCell>0 ){ 6196 assert( pPage->leaf ); 6197 idx = ++pCur->aiIdx[pCur->iPage]; 6198 pCur->info.nSize = 0; 6199 pCur->validNKey = 0; 6200 }else{ 6201 assert( pPage->leaf ); 6202 } 6203 rc = insertCell(pPage, idx, newCell, szNew, 0, 0); 6204 if( rc==SQLITE_OK ){ 6205 rc = balance(pCur, 1); 6206 } 6207 6208 /* Must make sure nOverflow is reset to zero even if the balance() 6209 ** fails. Internal data structure corruption will result otherwise. */ 6210 pCur->apPage[pCur->iPage]->nOverflow = 0; 6211 6212 if( rc==SQLITE_OK ){ 6213 moveToRoot(pCur); 6214 } 6215 end_insert: 6216 return rc; 6217 } 6218 6219 /* 6220 ** Delete the entry that the cursor is pointing to. The cursor 6221 ** is left pointing at a arbitrary location. 
6222 */ 6223 int sqlite3BtreeDelete(BtCursor *pCur){ 6224 MemPage *pPage = pCur->apPage[pCur->iPage]; 6225 int idx; 6226 unsigned char *pCell; 6227 int rc; 6228 Pgno pgnoChild = 0; 6229 Btree *p = pCur->pBtree; 6230 BtShared *pBt = p->pBt; 6231 6232 assert( cursorHoldsMutex(pCur) ); 6233 assert( pPage->isInit ); 6234 assert( pBt->inTransaction==TRANS_WRITE ); 6235 assert( !pBt->readOnly ); 6236 if( pCur->eState==CURSOR_FAULT ){ 6237 return pCur->skip; 6238 } 6239 if( NEVER(pCur->aiIdx[pCur->iPage]>=pPage->nCell) ){ 6240 return SQLITE_ERROR; /* The cursor is not pointing to anything */ 6241 } 6242 assert( pCur->wrFlag ); 6243 rc = checkForReadConflicts(p, pCur->pgnoRoot, pCur, pCur->info.nKey); 6244 if( rc!=SQLITE_OK ){ 6245 /* The table pCur points to has a read lock */ 6246 assert( rc==SQLITE_LOCKED_SHAREDCACHE ); 6247 return rc; 6248 } 6249 6250 /* Restore the current cursor position (a no-op if the cursor is not in 6251 ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors 6252 ** open on the same table. Then call sqlite3PagerWrite() on the page 6253 ** that the entry will be deleted from. 6254 */ 6255 if( 6256 (rc = restoreCursorPosition(pCur))!=0 || 6257 (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 || 6258 (rc = sqlite3PagerWrite(pPage->pDbPage))!=0 6259 ){ 6260 return rc; 6261 } 6262 6263 /* Locate the cell within its page and leave pCell pointing to the 6264 ** data. The clearCell() call frees any overflow pages associated with the 6265 ** cell. The cell itself is still intact. 6266 */ 6267 idx = pCur->aiIdx[pCur->iPage]; 6268 pCell = findCell(pPage, idx); 6269 if( !pPage->leaf ){ 6270 pgnoChild = get4byte(pCell); 6271 } 6272 rc = clearCell(pPage, pCell); 6273 if( rc ){ 6274 return rc; 6275 } 6276 6277 if( !pPage->leaf ){ 6278 /* 6279 ** The entry we are about to delete is not a leaf so if we do not 6280 ** do something we will leave a hole on an internal page. 6281 ** We have to fill the hole by moving in a cell from a leaf. 
The 6282 ** next Cell after the one to be deleted is guaranteed to exist and 6283 ** to be a leaf so we can use it. 6284 */ 6285 BtCursor leafCur; 6286 MemPage *pLeafPage = 0; 6287 6288 unsigned char *pNext; 6289 int notUsed; 6290 unsigned char *tempCell = 0; 6291 assert( !pPage->intKey ); 6292 sqlite3BtreeGetTempCursor(pCur, &leafCur); 6293 rc = sqlite3BtreeNext(&leafCur, ¬Used); 6294 if( rc==SQLITE_OK ){ 6295 assert( leafCur.aiIdx[leafCur.iPage]==0 ); 6296 pLeafPage = leafCur.apPage[leafCur.iPage]; 6297 rc = sqlite3PagerWrite(pLeafPage->pDbPage); 6298 } 6299 if( rc==SQLITE_OK ){ 6300 int leafCursorInvalid = 0; 6301 u16 szNext; 6302 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n", 6303 pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno)); 6304 dropCell(pPage, idx, cellSizePtr(pPage, pCell)); 6305 pNext = findCell(pLeafPage, 0); 6306 szNext = cellSizePtr(pLeafPage, pNext); 6307 assert( MX_CELL_SIZE(pBt)>=szNext+4 ); 6308 allocateTempSpace(pBt); 6309 tempCell = pBt->pTmpSpace; 6310 if( tempCell==0 ){ 6311 rc = SQLITE_NOMEM; 6312 } 6313 if( rc==SQLITE_OK ){ 6314 rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0); 6315 } 6316 6317 6318 /* The "if" statement in the next code block is critical. The 6319 ** slightest error in that statement would allow SQLite to operate 6320 ** correctly most of the time but produce very rare failures. To 6321 ** guard against this, the following macros help to verify that 6322 ** the "if" statement is well tested. 
6323 */ 6324 testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3 6325 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 ); 6326 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3 6327 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 ); 6328 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1 6329 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 ); 6330 testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3 6331 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 ); 6332 testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) 6333 && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 ); 6334 6335 6336 if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) && 6337 (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3) 6338 ){ 6339 /* This branch is taken if the internal node is now either overflowing 6340 ** or underfull and the leaf node will be underfull after the just cell 6341 ** copied to the internal node is deleted from it. This is a special 6342 ** case because the call to balance() to correct the internal node 6343 ** may change the tree structure and invalidate the contents of 6344 ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be 6345 ** used by the balance() required to correct the underfull leaf 6346 ** node. 6347 ** 6348 ** The formula used in the expression above are based on facets of 6349 ** the SQLite file-format that do not change over time. 6350 */ 6351 testcase( pPage->nFree==pBt->usableSize*2/3+1 ); 6352 testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 ); 6353 leafCursorInvalid = 1; 6354 } 6355 6356 if( rc==SQLITE_OK ){ 6357 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 6358 put4byte(findOverflowCell(pPage, idx), pgnoChild); 6359 VVA_ONLY( pCur->pagesShuffled = 0 ); 6360 rc = balance(pCur, 0); 6361 } 6362 6363 if( rc==SQLITE_OK && leafCursorInvalid ){ 6364 /* The leaf-node is now underfull and so the tree needs to be 6365 ** rebalanced. 
However, the balance() operation on the internal 6366 ** node above may have modified the structure of the B-Tree and 6367 ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[] 6368 ** may not be trusted. 6369 ** 6370 ** It is not possible to copy the ancestry from pCur, as the same 6371 ** balance() call has invalidated the pCur->apPage[] and aiIdx[] 6372 ** arrays. 6373 ** 6374 ** The call to saveCursorPosition() below internally saves the 6375 ** key that leafCur is currently pointing to. Currently, there 6376 ** are two copies of that key in the tree - one here on the leaf 6377 ** page and one on some internal node in the tree. The copy on 6378 ** the leaf node is always the next key in tree-order after the 6379 ** copy on the internal node. So, the call to sqlite3BtreeNext() 6380 ** calls restoreCursorPosition() to point the cursor to the copy 6381 ** stored on the internal node, then advances to the next entry, 6382 ** which happens to be the copy of the key on the internal node. 6383 ** Net effect: leafCur is pointing back to the duplicate cell 6384 ** that needs to be removed, and the leafCur.apPage[] and 6385 ** leafCur.aiIdx[] arrays are correct. 
6386 */ 6387 VVA_ONLY( Pgno leafPgno = pLeafPage->pgno ); 6388 rc = saveCursorPosition(&leafCur); 6389 if( rc==SQLITE_OK ){ 6390 rc = sqlite3BtreeNext(&leafCur, ¬Used); 6391 } 6392 pLeafPage = leafCur.apPage[leafCur.iPage]; 6393 assert( rc!=SQLITE_OK || pLeafPage->pgno==leafPgno ); 6394 assert( rc!=SQLITE_OK || leafCur.aiIdx[leafCur.iPage]==0 ); 6395 } 6396 6397 if( SQLITE_OK==rc 6398 && SQLITE_OK==(rc = sqlite3PagerWrite(pLeafPage->pDbPage)) 6399 ){ 6400 dropCell(pLeafPage, 0, szNext); 6401 VVA_ONLY( leafCur.pagesShuffled = 0 ); 6402 rc = balance(&leafCur, 0); 6403 assert( leafCursorInvalid || !leafCur.pagesShuffled 6404 || !pCur->pagesShuffled ); 6405 } 6406 } 6407 sqlite3BtreeReleaseTempCursor(&leafCur); 6408 }else{ 6409 TRACE(("DELETE: table=%d delete from leaf %d\n", 6410 pCur->pgnoRoot, pPage->pgno)); 6411 rc = dropCell(pPage, idx, cellSizePtr(pPage, pCell)); 6412 if( rc==SQLITE_OK ){ 6413 rc = balance(pCur, 0); 6414 } 6415 } 6416 if( rc==SQLITE_OK ){ 6417 moveToRoot(pCur); 6418 } 6419 return rc; 6420 } 6421 6422 /* 6423 ** Create a new BTree table. Write into *piTable the page 6424 ** number for the root page of the new table. 6425 ** 6426 ** The type of type is determined by the flags parameter. Only the 6427 ** following values of flags are currently in use. 
Other values for 6428 ** flags might not work: 6429 ** 6430 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 6431 ** BTREE_ZERODATA Used for SQL indices 6432 */ 6433 static int btreeCreateTable(Btree *p, int *piTable, int flags){ 6434 BtShared *pBt = p->pBt; 6435 MemPage *pRoot; 6436 Pgno pgnoRoot; 6437 int rc; 6438 6439 assert( sqlite3BtreeHoldsMutex(p) ); 6440 assert( pBt->inTransaction==TRANS_WRITE ); 6441 assert( !pBt->readOnly ); 6442 6443 #ifdef SQLITE_OMIT_AUTOVACUUM 6444 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 6445 if( rc ){ 6446 return rc; 6447 } 6448 #else 6449 if( pBt->autoVacuum ){ 6450 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 6451 MemPage *pPageMove; /* The page to move to. */ 6452 6453 /* Creating a new table may probably require moving an existing database 6454 ** to make room for the new tables root page. In case this page turns 6455 ** out to be an overflow page, delete all overflow page-map caches 6456 ** held by open cursors. 6457 */ 6458 invalidateAllOverflowCache(pBt); 6459 6460 /* Read the value of meta[3] from the database to determine where the 6461 ** root page of the new table should go. meta[3] is the largest root-page 6462 ** created so far, so the new root-page is (meta[3]+1). 6463 */ 6464 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot); 6465 if( rc!=SQLITE_OK ){ 6466 return rc; 6467 } 6468 pgnoRoot++; 6469 6470 /* The new root-page may not be allocated on a pointer-map page, or the 6471 ** PENDING_BYTE page. 6472 */ 6473 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 6474 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 6475 pgnoRoot++; 6476 } 6477 assert( pgnoRoot>=3 ); 6478 6479 /* Allocate a page. The page that currently resides at pgnoRoot will 6480 ** be moved to the allocated page (unless the allocated page happens 6481 ** to reside at pgnoRoot). 
6482 */ 6483 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1); 6484 if( rc!=SQLITE_OK ){ 6485 return rc; 6486 } 6487 6488 if( pgnoMove!=pgnoRoot ){ 6489 /* pgnoRoot is the page that will be used for the root-page of 6490 ** the new table (assuming an error did not occur). But we were 6491 ** allocated pgnoMove. If required (i.e. if it was not allocated 6492 ** by extending the file), the current page at position pgnoMove 6493 ** is already journaled. 6494 */ 6495 u8 eType; 6496 Pgno iPtrPage; 6497 6498 releasePage(pPageMove); 6499 6500 /* Move the page currently at pgnoRoot to pgnoMove. */ 6501 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); 6502 if( rc!=SQLITE_OK ){ 6503 return rc; 6504 } 6505 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 6506 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 6507 releasePage(pRoot); 6508 return rc; 6509 } 6510 assert( eType!=PTRMAP_ROOTPAGE ); 6511 assert( eType!=PTRMAP_FREEPAGE ); 6512 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); 6513 releasePage(pRoot); 6514 6515 /* Obtain the page at pgnoRoot */ 6516 if( rc!=SQLITE_OK ){ 6517 return rc; 6518 } 6519 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); 6520 if( rc!=SQLITE_OK ){ 6521 return rc; 6522 } 6523 rc = sqlite3PagerWrite(pRoot->pDbPage); 6524 if( rc!=SQLITE_OK ){ 6525 releasePage(pRoot); 6526 return rc; 6527 } 6528 }else{ 6529 pRoot = pPageMove; 6530 } 6531 6532 /* Update the pointer-map and meta-data with the new root-page number. 
*/ 6533 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0); 6534 if( rc ){ 6535 releasePage(pRoot); 6536 return rc; 6537 } 6538 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 6539 if( rc ){ 6540 releasePage(pRoot); 6541 return rc; 6542 } 6543 6544 }else{ 6545 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 6546 if( rc ) return rc; 6547 } 6548 #endif 6549 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 6550 zeroPage(pRoot, flags | PTF_LEAF); 6551 sqlite3PagerUnref(pRoot->pDbPage); 6552 *piTable = (int)pgnoRoot; 6553 return SQLITE_OK; 6554 } 6555 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ 6556 int rc; 6557 sqlite3BtreeEnter(p); 6558 rc = btreeCreateTable(p, piTable, flags); 6559 sqlite3BtreeLeave(p); 6560 return rc; 6561 } 6562 6563 /* 6564 ** Erase the given database page and all its children. Return 6565 ** the page to the freelist. 6566 */ 6567 static int clearDatabasePage( 6568 BtShared *pBt, /* The BTree that contains the table */ 6569 Pgno pgno, /* Page number to clear */ 6570 int freePageFlag, /* Deallocate page if true */ 6571 int *pnChange 6572 ){ 6573 MemPage *pPage = 0; 6574 int rc; 6575 unsigned char *pCell; 6576 int i; 6577 6578 assert( sqlite3_mutex_held(pBt->mutex) ); 6579 if( pgno>pagerPagecount(pBt) ){ 6580 return SQLITE_CORRUPT_BKPT; 6581 } 6582 6583 rc = getAndInitPage(pBt, pgno, &pPage); 6584 if( rc ) goto cleardatabasepage_out; 6585 for(i=0; i<pPage->nCell; i++){ 6586 pCell = findCell(pPage, i); 6587 if( !pPage->leaf ){ 6588 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); 6589 if( rc ) goto cleardatabasepage_out; 6590 } 6591 rc = clearCell(pPage, pCell); 6592 if( rc ) goto cleardatabasepage_out; 6593 } 6594 if( !pPage->leaf ){ 6595 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange); 6596 if( rc ) goto cleardatabasepage_out; 6597 }else if( pnChange ){ 6598 assert( pPage->intKey ); 6599 *pnChange += pPage->nCell; 6600 } 6601 if( freePageFlag ){ 6602 rc = freePage(pPage); 6603 }else if( (rc = 
sqlite3PagerWrite(pPage->pDbPage))==0 ){ 6604 zeroPage(pPage, pPage->aData[0] | PTF_LEAF); 6605 } 6606 6607 cleardatabasepage_out: 6608 releasePage(pPage); 6609 return rc; 6610 } 6611 6612 /* 6613 ** Delete all information from a single table in the database. iTable is 6614 ** the page number of the root of the table. After this routine returns, 6615 ** the root page is empty, but still exists. 6616 ** 6617 ** This routine will fail with SQLITE_LOCKED if there are any open 6618 ** read cursors on the table. Open write cursors are moved to the 6619 ** root of the table. 6620 ** 6621 ** If pnChange is not NULL, then table iTable must be an intkey table. The 6622 ** integer value pointed to by pnChange is incremented by the number of 6623 ** entries in the table. 6624 */ 6625 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){ 6626 int rc; 6627 BtShared *pBt = p->pBt; 6628 sqlite3BtreeEnter(p); 6629 assert( p->inTrans==TRANS_WRITE ); 6630 if( (rc = checkForReadConflicts(p, iTable, 0, 1))!=SQLITE_OK ){ 6631 /* nothing to do */ 6632 }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){ 6633 /* nothing to do */ 6634 }else{ 6635 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); 6636 } 6637 sqlite3BtreeLeave(p); 6638 return rc; 6639 } 6640 6641 /* 6642 ** Erase all information in a table and add the root of the table to 6643 ** the freelist. Except, the root of the principle table (the one on 6644 ** page 1) is never added to the freelist. 6645 ** 6646 ** This routine will fail with SQLITE_LOCKED if there are any open 6647 ** cursors on the table. 6648 ** 6649 ** If AUTOVACUUM is enabled and the page at iTable is not the last 6650 ** root page in the database file, then the last root page 6651 ** in the database file is moved into the slot formerly occupied by 6652 ** iTable and that last slot formerly occupied by the last root page 6653 ** is added to the freelist instead of iTable. 
In this say, all 6654 ** root pages are kept at the beginning of the database file, which 6655 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 6656 ** page number that used to be the last root page in the file before 6657 ** the move. If no page gets moved, *piMoved is set to 0. 6658 ** The last root page is recorded in meta[3] and the value of 6659 ** meta[3] is updated by this procedure. 6660 */ 6661 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ 6662 int rc; 6663 MemPage *pPage = 0; 6664 BtShared *pBt = p->pBt; 6665 6666 assert( sqlite3BtreeHoldsMutex(p) ); 6667 assert( p->inTrans==TRANS_WRITE ); 6668 6669 /* It is illegal to drop a table if any cursors are open on the 6670 ** database. This is because in auto-vacuum mode the backend may 6671 ** need to move another root-page to fill a gap left by the deleted 6672 ** root page. If an open cursor was using this page a problem would 6673 ** occur. 6674 */ 6675 if( pBt->pCursor ){ 6676 sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db); 6677 return SQLITE_LOCKED_SHAREDCACHE; 6678 } 6679 6680 rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 6681 if( rc ) return rc; 6682 rc = sqlite3BtreeClearTable(p, iTable, 0); 6683 if( rc ){ 6684 releasePage(pPage); 6685 return rc; 6686 } 6687 6688 *piMoved = 0; 6689 6690 if( iTable>1 ){ 6691 #ifdef SQLITE_OMIT_AUTOVACUUM 6692 rc = freePage(pPage); 6693 releasePage(pPage); 6694 #else 6695 if( pBt->autoVacuum ){ 6696 Pgno maxRootPgno; 6697 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno); 6698 if( rc!=SQLITE_OK ){ 6699 releasePage(pPage); 6700 return rc; 6701 } 6702 6703 if( iTable==maxRootPgno ){ 6704 /* If the table being dropped is the table with the largest root-page 6705 ** number in the database, put the root page on the free list. 
6706 */ 6707 rc = freePage(pPage); 6708 releasePage(pPage); 6709 if( rc!=SQLITE_OK ){ 6710 return rc; 6711 } 6712 }else{ 6713 /* The table being dropped does not have the largest root-page 6714 ** number in the database. So move the page that does into the 6715 ** gap left by the deleted root-page. 6716 */ 6717 MemPage *pMove; 6718 releasePage(pPage); 6719 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); 6720 if( rc!=SQLITE_OK ){ 6721 return rc; 6722 } 6723 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); 6724 releasePage(pMove); 6725 if( rc!=SQLITE_OK ){ 6726 return rc; 6727 } 6728 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); 6729 if( rc!=SQLITE_OK ){ 6730 return rc; 6731 } 6732 rc = freePage(pMove); 6733 releasePage(pMove); 6734 if( rc!=SQLITE_OK ){ 6735 return rc; 6736 } 6737 *piMoved = maxRootPgno; 6738 } 6739 6740 /* Set the new 'max-root-page' value in the database header. This 6741 ** is the old value less one, less one more if that happens to 6742 ** be a root-page number, less one again if that is the 6743 ** PENDING_BYTE_PAGE. 6744 */ 6745 maxRootPgno--; 6746 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){ 6747 maxRootPgno--; 6748 } 6749 if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){ 6750 maxRootPgno--; 6751 } 6752 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 6753 6754 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 6755 }else{ 6756 rc = freePage(pPage); 6757 releasePage(pPage); 6758 } 6759 #endif 6760 }else{ 6761 /* If sqlite3BtreeDropTable was called on page 1. */ 6762 zeroPage(pPage, PTF_INTKEY|PTF_LEAF ); 6763 releasePage(pPage); 6764 } 6765 return rc; 6766 } 6767 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 6768 int rc; 6769 sqlite3BtreeEnter(p); 6770 rc = btreeDropTable(p, iTable, piMoved); 6771 sqlite3BtreeLeave(p); 6772 return rc; 6773 } 6774 6775 6776 /* 6777 ** Read the meta-information out of a database file. Meta[0] 6778 ** is the number of free pages currently in the database. 
Meta[1] 6779 ** through meta[15] are available for use by higher layers. Meta[0] 6780 ** is read-only, the others are read/write. 6781 ** 6782 ** The schema layer numbers meta values differently. At the schema 6783 ** layer (and the SetCookie and ReadCookie opcodes) the number of 6784 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 6785 */ 6786 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 6787 DbPage *pDbPage = 0; 6788 int rc; 6789 unsigned char *pP1; 6790 BtShared *pBt = p->pBt; 6791 6792 sqlite3BtreeEnter(p); 6793 6794 /* Reading a meta-data value requires a read-lock on page 1 (and hence 6795 ** the sqlite_master table. We grab this lock regardless of whether or 6796 ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page 6797 ** 1 is treated as a special case by querySharedCacheTableLock() 6798 ** and setSharedCacheTableLock()). 6799 */ 6800 rc = querySharedCacheTableLock(p, 1, READ_LOCK); 6801 if( rc!=SQLITE_OK ){ 6802 sqlite3BtreeLeave(p); 6803 return rc; 6804 } 6805 6806 assert( idx>=0 && idx<=15 ); 6807 if( pBt->pPage1 ){ 6808 /* The b-tree is already holding a reference to page 1 of the database 6809 ** file. In this case the required meta-data value can be read directly 6810 ** from the page data of this reference. This is slightly faster than 6811 ** requesting a new reference from the pager layer. 6812 */ 6813 pP1 = (unsigned char *)pBt->pPage1->aData; 6814 }else{ 6815 /* The b-tree does not have a reference to page 1 of the database file. 6816 ** Obtain one from the pager layer. 6817 */ 6818 rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage); 6819 if( rc ){ 6820 sqlite3BtreeLeave(p); 6821 return rc; 6822 } 6823 pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage); 6824 } 6825 *pMeta = get4byte(&pP1[36 + idx*4]); 6826 6827 /* If the b-tree is not holding a reference to page 1, then one was 6828 ** requested from the pager layer in the above block. Release it now. 
6829 */ 6830 if( !pBt->pPage1 ){ 6831 sqlite3PagerUnref(pDbPage); 6832 } 6833 6834 /* If autovacuumed is disabled in this build but we are trying to 6835 ** access an autovacuumed database, then make the database readonly. 6836 */ 6837 #ifdef SQLITE_OMIT_AUTOVACUUM 6838 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1; 6839 #endif 6840 6841 /* If there is currently an open transaction, grab a read-lock 6842 ** on page 1 of the database file. This is done to make sure that 6843 ** no other connection can modify the meta value just read from 6844 ** the database until the transaction is concluded. 6845 */ 6846 if( p->inTrans>0 ){ 6847 rc = setSharedCacheTableLock(p, 1, READ_LOCK); 6848 } 6849 sqlite3BtreeLeave(p); 6850 return rc; 6851 } 6852 6853 /* 6854 ** Write meta-information back into the database. Meta[0] is 6855 ** read-only and may not be written. 6856 */ 6857 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 6858 BtShared *pBt = p->pBt; 6859 unsigned char *pP1; 6860 int rc; 6861 assert( idx>=1 && idx<=15 ); 6862 sqlite3BtreeEnter(p); 6863 assert( p->inTrans==TRANS_WRITE ); 6864 assert( pBt->pPage1!=0 ); 6865 pP1 = pBt->pPage1->aData; 6866 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 6867 if( rc==SQLITE_OK ){ 6868 put4byte(&pP1[36 + idx*4], iMeta); 6869 #ifndef SQLITE_OMIT_AUTOVACUUM 6870 if( idx==7 ){ 6871 assert( pBt->autoVacuum || iMeta==0 ); 6872 assert( iMeta==0 || iMeta==1 ); 6873 pBt->incrVacuum = (u8)iMeta; 6874 } 6875 #endif 6876 } 6877 sqlite3BtreeLeave(p); 6878 return rc; 6879 } 6880 6881 /* 6882 ** Return the flag byte at the beginning of the page that the cursor 6883 ** is currently pointing to. 6884 */ 6885 int sqlite3BtreeFlags(BtCursor *pCur){ 6886 /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call 6887 ** restoreCursorPosition() here. 
6888 */ 6889 MemPage *pPage; 6890 restoreCursorPosition(pCur); 6891 pPage = pCur->apPage[pCur->iPage]; 6892 assert( cursorHoldsMutex(pCur) ); 6893 assert( pPage!=0 ); 6894 assert( pPage->pBt==pCur->pBt ); 6895 return pPage->aData[pPage->hdrOffset]; 6896 } 6897 6898 #ifndef SQLITE_OMIT_BTREECOUNT 6899 /* 6900 ** The first argument, pCur, is a cursor opened on some b-tree. Count the 6901 ** number of entries in the b-tree and write the result to *pnEntry. 6902 ** 6903 ** SQLITE_OK is returned if the operation is successfully executed. 6904 ** Otherwise, if an error is encountered (i.e. an IO error or database 6905 ** corruption) an SQLite error code is returned. 6906 */ 6907 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){ 6908 i64 nEntry = 0; /* Value to return in *pnEntry */ 6909 int rc; /* Return code */ 6910 rc = moveToRoot(pCur); 6911 6912 /* Unless an error occurs, the following loop runs one iteration for each 6913 ** page in the B-Tree structure (not including overflow pages). 6914 */ 6915 while( rc==SQLITE_OK ){ 6916 int iIdx; /* Index of child node in parent */ 6917 MemPage *pPage; /* Current page of the b-tree */ 6918 6919 /* If this is a leaf page or the tree is not an int-key tree, then 6920 ** this page contains countable entries. Increment the entry counter 6921 ** accordingly. 6922 */ 6923 pPage = pCur->apPage[pCur->iPage]; 6924 if( pPage->leaf || !pPage->intKey ){ 6925 nEntry += pPage->nCell; 6926 } 6927 6928 /* pPage is a leaf node. This loop navigates the cursor so that it 6929 ** points to the first interior cell that it points to the parent of 6930 ** the next page in the tree that has not yet been visited. The 6931 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell 6932 ** of the page, or to the number of cells in the page if the next page 6933 ** to visit is the right-child of its parent. 6934 ** 6935 ** If all pages in the tree have been visited, return SQLITE_OK to the 6936 ** caller. 
6937 */ 6938 if( pPage->leaf ){ 6939 do { 6940 if( pCur->iPage==0 ){ 6941 /* All pages of the b-tree have been visited. Return successfully. */ 6942 *pnEntry = nEntry; 6943 return SQLITE_OK; 6944 } 6945 sqlite3BtreeMoveToParent(pCur); 6946 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell ); 6947 6948 pCur->aiIdx[pCur->iPage]++; 6949 pPage = pCur->apPage[pCur->iPage]; 6950 } 6951 6952 /* Descend to the child node of the cell that the cursor currently 6953 ** points at. This is the right-child if (iIdx==pPage->nCell). 6954 */ 6955 iIdx = pCur->aiIdx[pCur->iPage]; 6956 if( iIdx==pPage->nCell ){ 6957 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 6958 }else{ 6959 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); 6960 } 6961 } 6962 6963 /* An error has occurred. Return an error code. */ 6964 return rc; 6965 } 6966 #endif 6967 6968 /* 6969 ** Return the pager associated with a BTree. This routine is used for 6970 ** testing and debugging only. 6971 */ 6972 Pager *sqlite3BtreePager(Btree *p){ 6973 return p->pBt->pPager; 6974 } 6975 6976 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6977 /* 6978 ** Append a message to the error message string. 6979 */ 6980 static void checkAppendMsg( 6981 IntegrityCk *pCheck, 6982 char *zMsg1, 6983 const char *zFormat, 6984 ... 6985 ){ 6986 va_list ap; 6987 if( !pCheck->mxErr ) return; 6988 pCheck->mxErr--; 6989 pCheck->nErr++; 6990 va_start(ap, zFormat); 6991 if( pCheck->errMsg.nChar ){ 6992 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1); 6993 } 6994 if( zMsg1 ){ 6995 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1); 6996 } 6997 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap); 6998 va_end(ap); 6999 if( pCheck->errMsg.mallocFailed ){ 7000 pCheck->mallocFailed = 1; 7001 } 7002 } 7003 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7004 7005 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7006 /* 7007 ** Add 1 to the reference count for page iPage. 
If this is the second 7008 ** reference to the page, add an error message to pCheck->zErrMsg. 7009 ** Return 1 if there are 2 ore more references to the page and 0 if 7010 ** if this is the first reference to the page. 7011 ** 7012 ** Also check that the page number is in bounds. 7013 */ 7014 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){ 7015 if( iPage==0 ) return 1; 7016 if( iPage>pCheck->nPage ){ 7017 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage); 7018 return 1; 7019 } 7020 if( pCheck->anRef[iPage]==1 ){ 7021 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage); 7022 return 1; 7023 } 7024 return (pCheck->anRef[iPage]++)>1; 7025 } 7026 7027 #ifndef SQLITE_OMIT_AUTOVACUUM 7028 /* 7029 ** Check that the entry in the pointer-map for page iChild maps to 7030 ** page iParent, pointer type ptrType. If not, append an error message 7031 ** to pCheck. 7032 */ 7033 static void checkPtrmap( 7034 IntegrityCk *pCheck, /* Integrity check context */ 7035 Pgno iChild, /* Child page number */ 7036 u8 eType, /* Expected pointer map type */ 7037 Pgno iParent, /* Expected pointer map parent page number */ 7038 char *zContext /* Context description (used for error msg) */ 7039 ){ 7040 int rc; 7041 u8 ePtrmapType; 7042 Pgno iPtrmapParent; 7043 7044 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 7045 if( rc!=SQLITE_OK ){ 7046 if( rc==SQLITE_NOMEM ) pCheck->mallocFailed = 1; 7047 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild); 7048 return; 7049 } 7050 7051 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 7052 checkAppendMsg(pCheck, zContext, 7053 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 7054 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 7055 } 7056 } 7057 #endif 7058 7059 /* 7060 ** Check the integrity of the freelist or of an overflow page list. 7061 ** Verify that the number of pages on the list is N. 
7062 */ 7063 static void checkList( 7064 IntegrityCk *pCheck, /* Integrity checking context */ 7065 int isFreeList, /* True for a freelist. False for overflow page list */ 7066 int iPage, /* Page number for first page in the list */ 7067 int N, /* Expected number of pages in the list */ 7068 char *zContext /* Context for error messages */ 7069 ){ 7070 int i; 7071 int expected = N; 7072 int iFirst = iPage; 7073 while( N-- > 0 && pCheck->mxErr ){ 7074 DbPage *pOvflPage; 7075 unsigned char *pOvflData; 7076 if( iPage<1 ){ 7077 checkAppendMsg(pCheck, zContext, 7078 "%d of %d pages missing from overflow list starting at %d", 7079 N+1, expected, iFirst); 7080 break; 7081 } 7082 if( checkRef(pCheck, iPage, zContext) ) break; 7083 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){ 7084 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage); 7085 break; 7086 } 7087 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 7088 if( isFreeList ){ 7089 int n = get4byte(&pOvflData[4]); 7090 #ifndef SQLITE_OMIT_AUTOVACUUM 7091 if( pCheck->pBt->autoVacuum ){ 7092 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext); 7093 } 7094 #endif 7095 if( n>pCheck->pBt->usableSize/4-2 ){ 7096 checkAppendMsg(pCheck, zContext, 7097 "freelist leaf count too big on page %d", iPage); 7098 N--; 7099 }else{ 7100 for(i=0; i<n; i++){ 7101 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 7102 #ifndef SQLITE_OMIT_AUTOVACUUM 7103 if( pCheck->pBt->autoVacuum ){ 7104 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext); 7105 } 7106 #endif 7107 checkRef(pCheck, iFreePage, zContext); 7108 } 7109 N -= n; 7110 } 7111 } 7112 #ifndef SQLITE_OMIT_AUTOVACUUM 7113 else{ 7114 /* If this database supports auto-vacuum and iPage is not the last 7115 ** page in this overflow list, check that the pointer-map entry for 7116 ** the following page matches iPage. 
7117 */ 7118 if( pCheck->pBt->autoVacuum && N>0 ){ 7119 i = get4byte(pOvflData); 7120 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext); 7121 } 7122 } 7123 #endif 7124 iPage = get4byte(pOvflData); 7125 sqlite3PagerUnref(pOvflPage); 7126 } 7127 } 7128 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7129 7130 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7131 /* 7132 ** Do various sanity checks on a single page of a tree. Return 7133 ** the tree depth. Root pages return 0. Parents of root pages 7134 ** return 1, and so forth. 7135 ** 7136 ** These checks are done: 7137 ** 7138 ** 1. Make sure that cells and freeblocks do not overlap 7139 ** but combine to completely cover the page. 7140 ** NO 2. Make sure cell keys are in order. 7141 ** NO 3. Make sure no key is less than or equal to zLowerBound. 7142 ** NO 4. Make sure no key is greater than or equal to zUpperBound. 7143 ** 5. Check the integrity of overflow pages. 7144 ** 6. Recursively call checkTreePage on all children. 7145 ** 7. Verify that the depth of all children is the same. 7146 ** 8. Make sure this page is at least 33% full or else it is 7147 ** the root of the tree. 
7148 */ 7149 static int checkTreePage( 7150 IntegrityCk *pCheck, /* Context for the sanity check */ 7151 int iPage, /* Page number of the page to check */ 7152 char *zParentContext /* Parent context */ 7153 ){ 7154 MemPage *pPage; 7155 int i, rc, depth, d2, pgno, cnt; 7156 int hdr, cellStart; 7157 int nCell; 7158 u8 *data; 7159 BtShared *pBt; 7160 int usableSize; 7161 char zContext[100]; 7162 char *hit = 0; 7163 7164 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage); 7165 7166 /* Check that the page exists 7167 */ 7168 pBt = pCheck->pBt; 7169 usableSize = pBt->usableSize; 7170 if( iPage==0 ) return 0; 7171 if( checkRef(pCheck, iPage, zParentContext) ) return 0; 7172 if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ 7173 if( rc==SQLITE_NOMEM ) pCheck->mallocFailed = 1; 7174 checkAppendMsg(pCheck, zContext, 7175 "unable to get the page. error code=%d", rc); 7176 return 0; 7177 } 7178 if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){ 7179 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ 7180 checkAppendMsg(pCheck, zContext, 7181 "sqlite3BtreeInitPage() returns error code %d", rc); 7182 releasePage(pPage); 7183 return 0; 7184 } 7185 7186 /* Check out all the cells. 
7187 */ 7188 depth = 0; 7189 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){ 7190 u8 *pCell; 7191 u32 sz; 7192 CellInfo info; 7193 7194 /* Check payload overflow pages 7195 */ 7196 sqlite3_snprintf(sizeof(zContext), zContext, 7197 "On tree page %d cell %d: ", iPage, i); 7198 pCell = findCell(pPage,i); 7199 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 7200 sz = info.nData; 7201 if( !pPage->intKey ) sz += (int)info.nKey; 7202 assert( sz==info.nPayload ); 7203 if( (sz>info.nLocal) 7204 && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize]) 7205 ){ 7206 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); 7207 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); 7208 #ifndef SQLITE_OMIT_AUTOVACUUM 7209 if( pBt->autoVacuum ){ 7210 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext); 7211 } 7212 #endif 7213 checkList(pCheck, 0, pgnoOvfl, nPage, zContext); 7214 } 7215 7216 /* Check sanity of left child page. 7217 */ 7218 if( !pPage->leaf ){ 7219 pgno = get4byte(pCell); 7220 #ifndef SQLITE_OMIT_AUTOVACUUM 7221 if( pBt->autoVacuum ){ 7222 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); 7223 } 7224 #endif 7225 d2 = checkTreePage(pCheck, pgno, zContext); 7226 if( i>0 && d2!=depth ){ 7227 checkAppendMsg(pCheck, zContext, "Child page depth differs"); 7228 } 7229 depth = d2; 7230 } 7231 } 7232 if( !pPage->leaf ){ 7233 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 7234 sqlite3_snprintf(sizeof(zContext), zContext, 7235 "On page %d at right child: ", iPage); 7236 #ifndef SQLITE_OMIT_AUTOVACUUM 7237 if( pBt->autoVacuum ){ 7238 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0); 7239 } 7240 #endif 7241 checkTreePage(pCheck, pgno, zContext); 7242 } 7243 7244 /* Check for complete coverage of the page 7245 */ 7246 data = pPage->aData; 7247 hdr = pPage->hdrOffset; 7248 hit = sqlite3PageMalloc( pBt->pageSize ); 7249 if( hit==0 ){ 7250 pCheck->mallocFailed = 1; 7251 }else{ 7252 u16 contentOffset = get2byte(&data[hdr+5]); 7253 if (contentOffset 
> usableSize) { 7254 checkAppendMsg(pCheck, 0, 7255 "Corruption detected in header on page %d",iPage,0); 7256 goto check_page_abort; 7257 } 7258 memset(hit+contentOffset, 0, usableSize-contentOffset); 7259 memset(hit, 1, contentOffset); 7260 nCell = get2byte(&data[hdr+3]); 7261 cellStart = hdr + 12 - 4*pPage->leaf; 7262 for(i=0; i<nCell; i++){ 7263 int pc = get2byte(&data[cellStart+i*2]); 7264 u16 size = 1024; 7265 int j; 7266 if( pc<=usableSize ){ 7267 size = cellSizePtr(pPage, &data[pc]); 7268 } 7269 if( (pc+size-1)>=usableSize || pc<0 ){ 7270 checkAppendMsg(pCheck, 0, 7271 "Corruption detected in cell %d on page %d",i,iPage,0); 7272 }else{ 7273 for(j=pc+size-1; j>=pc; j--) hit[j]++; 7274 } 7275 } 7276 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000; 7277 cnt++){ 7278 int size = get2byte(&data[i+2]); 7279 int j; 7280 if( (i+size-1)>=usableSize || i<0 ){ 7281 checkAppendMsg(pCheck, 0, 7282 "Corruption detected in cell %d on page %d",i,iPage,0); 7283 }else{ 7284 for(j=i+size-1; j>=i; j--) hit[j]++; 7285 } 7286 i = get2byte(&data[i]); 7287 } 7288 for(i=cnt=0; i<usableSize; i++){ 7289 if( hit[i]==0 ){ 7290 cnt++; 7291 }else if( hit[i]>1 ){ 7292 checkAppendMsg(pCheck, 0, 7293 "Multiple uses for byte %d of page %d", i, iPage); 7294 break; 7295 } 7296 } 7297 if( cnt!=data[hdr+7] ){ 7298 checkAppendMsg(pCheck, 0, 7299 "Fragmented space is %d byte reported as %d on page %d", 7300 cnt, data[hdr+7], iPage); 7301 } 7302 } 7303 check_page_abort: 7304 if (hit) sqlite3PageFree(hit); 7305 7306 releasePage(pPage); 7307 return depth+1; 7308 } 7309 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7310 7311 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7312 /* 7313 ** This routine does a complete check of the given BTree file. aRoot[] is 7314 ** an array of pages numbers were each page number is the root page of 7315 ** a table. nRoot is the number of entries in aRoot. 7316 ** 7317 ** Write the number of error seen in *pnErr. 
Except for some memory 7318 ** allocation errors, an error message held in memory obtained from 7319 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is 7320 ** returned. If a memory allocation error occurs, NULL is returned. 7321 */ 7322 char *sqlite3BtreeIntegrityCheck( 7323 Btree *p, /* The btree to be checked */ 7324 int *aRoot, /* An array of root pages numbers for individual trees */ 7325 int nRoot, /* Number of entries in aRoot[] */ 7326 int mxErr, /* Stop reporting errors after this many */ 7327 int *pnErr /* Write number of errors seen to this variable */ 7328 ){ 7329 Pgno i; 7330 int nRef; 7331 IntegrityCk sCheck; 7332 BtShared *pBt = p->pBt; 7333 char zErr[100]; 7334 7335 sqlite3BtreeEnter(p); 7336 nRef = sqlite3PagerRefcount(pBt->pPager); 7337 if( lockBtreeWithRetry(p)!=SQLITE_OK ){ 7338 *pnErr = 1; 7339 sqlite3BtreeLeave(p); 7340 return sqlite3DbStrDup(0, "cannot acquire a read lock on the database"); 7341 } 7342 sCheck.pBt = pBt; 7343 sCheck.pPager = pBt->pPager; 7344 sCheck.nPage = pagerPagecount(sCheck.pBt); 7345 sCheck.mxErr = mxErr; 7346 sCheck.nErr = 0; 7347 sCheck.mallocFailed = 0; 7348 *pnErr = 0; 7349 if( sCheck.nPage==0 ){ 7350 unlockBtreeIfUnused(pBt); 7351 sqlite3BtreeLeave(p); 7352 return 0; 7353 } 7354 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); 7355 if( !sCheck.anRef ){ 7356 unlockBtreeIfUnused(pBt); 7357 *pnErr = 1; 7358 sqlite3BtreeLeave(p); 7359 return 0; 7360 } 7361 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } 7362 i = PENDING_BYTE_PAGE(pBt); 7363 if( i<=sCheck.nPage ){ 7364 sCheck.anRef[i] = 1; 7365 } 7366 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000); 7367 7368 /* Check the integrity of the freelist 7369 */ 7370 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 7371 get4byte(&pBt->pPage1->aData[36]), "Main freelist: "); 7372 7373 /* Check all the tables. 
7374 */ 7375 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ 7376 if( aRoot[i]==0 ) continue; 7377 #ifndef SQLITE_OMIT_AUTOVACUUM 7378 if( pBt->autoVacuum && aRoot[i]>1 ){ 7379 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0); 7380 } 7381 #endif 7382 checkTreePage(&sCheck, aRoot[i], "List of tree roots: "); 7383 } 7384 7385 /* Make sure every page in the file is referenced 7386 */ 7387 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ 7388 #ifdef SQLITE_OMIT_AUTOVACUUM 7389 if( sCheck.anRef[i]==0 ){ 7390 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 7391 } 7392 #else 7393 /* If the database supports auto-vacuum, make sure no tables contain 7394 ** references to pointer-map pages. 7395 */ 7396 if( sCheck.anRef[i]==0 && 7397 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 7398 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 7399 } 7400 if( sCheck.anRef[i]!=0 && 7401 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 7402 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i); 7403 } 7404 #endif 7405 } 7406 7407 /* Make sure this analysis did not leave any unref() pages. 7408 ** This is an internal consistency check; an integrity check 7409 ** of the integrity check. 7410 */ 7411 unlockBtreeIfUnused(pBt); 7412 if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){ 7413 checkAppendMsg(&sCheck, 0, 7414 "Outstanding page count goes from %d to %d during this analysis", 7415 nRef, sqlite3PagerRefcount(pBt->pPager) 7416 ); 7417 } 7418 7419 /* Clean up and report errors. 7420 */ 7421 sqlite3BtreeLeave(p); 7422 sqlite3_free(sCheck.anRef); 7423 if( sCheck.mallocFailed ){ 7424 sqlite3StrAccumReset(&sCheck.errMsg); 7425 *pnErr = sCheck.nErr+1; 7426 return 0; 7427 } 7428 *pnErr = sCheck.nErr; 7429 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg); 7430 return sqlite3StrAccumFinish(&sCheck.errMsg); 7431 } 7432 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7433 7434 /* 7435 ** Return the full pathname of the underlying database file. 
7436 ** 7437 ** The pager filename is invariant as long as the pager is 7438 ** open so it is safe to access without the BtShared mutex. 7439 */ 7440 const char *sqlite3BtreeGetFilename(Btree *p){ 7441 assert( p->pBt->pPager!=0 ); 7442 return sqlite3PagerFilename(p->pBt->pPager); 7443 } 7444 7445 /* 7446 ** Return the pathname of the journal file for this database. The return 7447 ** value of this routine is the same regardless of whether the journal file 7448 ** has been created or not. 7449 ** 7450 ** The pager journal filename is invariant as long as the pager is 7451 ** open so it is safe to access without the BtShared mutex. 7452 */ 7453 const char *sqlite3BtreeGetJournalname(Btree *p){ 7454 assert( p->pBt->pPager!=0 ); 7455 return sqlite3PagerJournalname(p->pBt->pPager); 7456 } 7457 7458 /* 7459 ** Return non-zero if a transaction is active. 7460 */ 7461 int sqlite3BtreeIsInTrans(Btree *p){ 7462 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 7463 return (p && (p->inTrans==TRANS_WRITE)); 7464 } 7465 7466 /* 7467 ** Return non-zero if a read (or write) transaction is active. 7468 */ 7469 int sqlite3BtreeIsInReadTrans(Btree *p){ 7470 assert( p ); 7471 assert( sqlite3_mutex_held(p->db->mutex) ); 7472 return p->inTrans!=TRANS_NONE; 7473 } 7474 7475 int sqlite3BtreeIsInBackup(Btree *p){ 7476 assert( p ); 7477 assert( sqlite3_mutex_held(p->db->mutex) ); 7478 return p->nBackup!=0; 7479 } 7480 7481 /* 7482 ** This function returns a pointer to a blob of memory associated with 7483 ** a single shared-btree. The memory is used by client code for its own 7484 ** purposes (for example, to store a high-level schema associated with 7485 ** the shared-btree). The btree layer manages reference counting issues. 7486 ** 7487 ** The first time this is called on a shared-btree, nBytes bytes of memory 7488 ** are allocated, zeroed, and returned to the caller. 
For each subsequent 7489 ** call the nBytes parameter is ignored and a pointer to the same blob 7490 ** of memory returned. 7491 ** 7492 ** If the nBytes parameter is 0 and the blob of memory has not yet been 7493 ** allocated, a null pointer is returned. If the blob has already been 7494 ** allocated, it is returned as normal. 7495 ** 7496 ** Just before the shared-btree is closed, the function passed as the 7497 ** xFree argument when the memory allocation was made is invoked on the 7498 ** blob of allocated memory. This function should not call sqlite3_free() 7499 ** on the memory, the btree layer does that. 7500 */ 7501 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 7502 BtShared *pBt = p->pBt; 7503 sqlite3BtreeEnter(p); 7504 if( !pBt->pSchema && nBytes ){ 7505 pBt->pSchema = sqlite3MallocZero(nBytes); 7506 pBt->xFreeSchema = xFree; 7507 } 7508 sqlite3BtreeLeave(p); 7509 return pBt->pSchema; 7510 } 7511 7512 /* 7513 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 7514 ** btree as the argument handle holds an exclusive lock on the 7515 ** sqlite_master table. Otherwise SQLITE_OK. 7516 */ 7517 int sqlite3BtreeSchemaLocked(Btree *p){ 7518 int rc; 7519 assert( sqlite3_mutex_held(p->db->mutex) ); 7520 sqlite3BtreeEnter(p); 7521 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); 7522 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); 7523 sqlite3BtreeLeave(p); 7524 return rc; 7525 } 7526 7527 7528 #ifndef SQLITE_OMIT_SHARED_CACHE 7529 /* 7530 ** Obtain a lock on the table whose root page is iTab. The 7531 ** lock is a write lock if isWritelock is true or a read lock 7532 ** if it is false. 
7533 */ 7534 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 7535 int rc = SQLITE_OK; 7536 if( p->sharable ){ 7537 u8 lockType = READ_LOCK + isWriteLock; 7538 assert( READ_LOCK+1==WRITE_LOCK ); 7539 assert( isWriteLock==0 || isWriteLock==1 ); 7540 sqlite3BtreeEnter(p); 7541 rc = querySharedCacheTableLock(p, iTab, lockType); 7542 if( rc==SQLITE_OK ){ 7543 rc = setSharedCacheTableLock(p, iTab, lockType); 7544 } 7545 sqlite3BtreeLeave(p); 7546 } 7547 return rc; 7548 } 7549 #endif 7550 7551 #ifndef SQLITE_OMIT_INCRBLOB 7552 /* 7553 ** Argument pCsr must be a cursor opened for writing on an 7554 ** INTKEY table currently pointing at a valid table entry. 7555 ** This function modifies the data stored as part of that entry. 7556 ** Only the data content may only be modified, it is not possible 7557 ** to change the length of the data stored. 7558 */ 7559 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 7560 int rc; 7561 7562 assert( cursorHoldsMutex(pCsr) ); 7563 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 7564 assert(pCsr->isIncrblobHandle); 7565 7566 restoreCursorPosition(pCsr); 7567 assert( pCsr->eState!=CURSOR_REQUIRESEEK ); 7568 if( pCsr->eState!=CURSOR_VALID ){ 7569 return SQLITE_ABORT; 7570 } 7571 7572 /* Check some preconditions: 7573 ** (a) the cursor is open for writing, 7574 ** (b) there is no read-lock on the table being modified and 7575 ** (c) the cursor points at a valid row of an intKey table. 
7576 */ 7577 if( !pCsr->wrFlag ){ 7578 return SQLITE_READONLY; 7579 } 7580 assert( !pCsr->pBt->readOnly 7581 && pCsr->pBt->inTransaction==TRANS_WRITE ); 7582 rc = checkForReadConflicts(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0); 7583 if( rc!=SQLITE_OK ){ 7584 /* The table pCur points to has a read lock */ 7585 assert( rc==SQLITE_LOCKED_SHAREDCACHE ); 7586 return rc; 7587 } 7588 if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){ 7589 return SQLITE_ERROR; 7590 } 7591 7592 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1); 7593 } 7594 7595 /* 7596 ** Set a flag on this cursor to cache the locations of pages from the 7597 ** overflow list for the current row. This is used by cursors opened 7598 ** for incremental blob IO only. 7599 ** 7600 ** This function sets a flag only. The actual page location cache 7601 ** (stored in BtCursor.aOverflow[]) is allocated and used by function 7602 ** accessPayload() (the worker function for sqlite3BtreeData() and 7603 ** sqlite3BtreePutData()). 7604 */ 7605 void sqlite3BtreeCacheOverflow(BtCursor *pCur){ 7606 assert( cursorHoldsMutex(pCur) ); 7607 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 7608 assert(!pCur->isIncrblobHandle); 7609 assert(!pCur->aOverflow); 7610 pCur->isIncrblobHandle = 1; 7611 } 7612 #endif 7613