1 /* 2 ** 2004 April 6 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** $Id: btree.c,v 1.433 2007/12/13 21:54:11 drh Exp $ 13 ** 14 ** This file implements a external (disk-based) database using BTrees. 15 ** See the header comment on "btreeInt.h" for additional information. 16 ** Including a description of file format and an overview of operation. 17 */ 18 #include "btreeInt.h" 19 20 /* 21 ** The header string that appears at the beginning of every 22 ** SQLite database. 23 */ 24 static const char zMagicHeader[] = SQLITE_FILE_HEADER; 25 26 /* 27 ** Set this global variable to 1 to enable tracing using the TRACE 28 ** macro. 29 */ 30 #if SQLITE_TEST 31 int sqlite3_btree_trace=0; /* True to enable tracing */ 32 #endif 33 34 35 36 #ifndef SQLITE_OMIT_SHARED_CACHE 37 /* 38 ** A flag to indicate whether or not shared cache is enabled. Also, 39 ** a list of BtShared objects that are eligible for participation 40 ** in shared cache. The variables have file scope during normal builds, 41 ** but the test harness needs to access these variables so we make them 42 ** global for test builds. 43 */ 44 #ifdef SQLITE_TEST 45 BtShared *sqlite3SharedCacheList = 0; 46 int sqlite3SharedCacheEnabled = 0; 47 #else 48 static BtShared *sqlite3SharedCacheList = 0; 49 static int sqlite3SharedCacheEnabled = 0; 50 #endif 51 #endif /* SQLITE_OMIT_SHARED_CACHE */ 52 53 #ifndef SQLITE_OMIT_SHARED_CACHE 54 /* 55 ** Enable or disable the shared pager and schema features. 56 ** 57 ** This routine has no effect on existing database connections. 58 ** The shared cache setting effects only future calls to 59 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). 60 */ 61 int sqlite3_enable_shared_cache(int enable){ 62 sqlite3SharedCacheEnabled = enable; 63 return SQLITE_OK; 64 } 65 #endif 66 67 68 /* 69 ** Forward declaration 70 */ 71 static int checkReadLocks(Btree*,Pgno,BtCursor*); 72 73 74 #ifdef SQLITE_OMIT_SHARED_CACHE 75 /* 76 ** The functions queryTableLock(), lockTable() and unlockAllTables() 77 ** manipulate entries in the BtShared.pLock linked list used to store 78 ** shared-cache table level locks. If the library is compiled with the 79 ** shared-cache feature disabled, then there is only ever one user 80 ** of each BtShared structure and so this locking is not necessary. 81 ** So define the lock related functions as no-ops. 82 */ 83 #define queryTableLock(a,b,c) SQLITE_OK 84 #define lockTable(a,b,c) SQLITE_OK 85 #define unlockAllTables(a) 86 #endif 87 88 #ifndef SQLITE_OMIT_SHARED_CACHE 89 /* 90 ** Query to see if btree handle p may obtain a lock of type eLock 91 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return 92 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or 93 ** SQLITE_LOCKED if not. 94 */ 95 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){ 96 BtShared *pBt = p->pBt; 97 BtLock *pIter; 98 99 assert( sqlite3BtreeHoldsMutex(p) ); 100 101 /* This is a no-op if the shared-cache is not enabled */ 102 if( !p->sharable ){ 103 return SQLITE_OK; 104 } 105 106 /* This (along with lockTable()) is where the ReadUncommitted flag is 107 ** dealt with. If the caller is querying for a read-lock and the flag is 108 ** set, it is unconditionally granted - even if there are write-locks 109 ** on the table. If a write-lock is requested, the ReadUncommitted flag 110 ** is not considered. 111 ** 112 ** In function lockTable(), if a read-lock is demanded and the 113 ** ReadUncommitted flag is set, no entry is added to the locks list 114 ** (BtShared.pLock). 115 ** 116 ** To summarize: If the ReadUncommitted flag is set, then read cursors do 117 ** not create or respect table locks. The locking procedure for a 118 ** write-cursor does not change. 119 */ 120 if( 121 !p->db || 122 0==(p->db->flags&SQLITE_ReadUncommitted) || 123 eLock==WRITE_LOCK || 124 iTab==MASTER_ROOT 125 ){ 126 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 127 if( pIter->pBtree!=p && pIter->iTable==iTab && 128 (pIter->eLock!=eLock || eLock!=READ_LOCK) ){ 129 return SQLITE_LOCKED; 130 } 131 } 132 } 133 return SQLITE_OK; 134 } 135 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 136 137 #ifndef SQLITE_OMIT_SHARED_CACHE 138 /* 139 ** Add a lock on the table with root-page iTable to the shared-btree used 140 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 141 ** WRITE_LOCK. 142 ** 143 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and 144 ** SQLITE_NOMEM may also be returned. 145 */ 146 static int lockTable(Btree *p, Pgno iTable, u8 eLock){ 147 BtShared *pBt = p->pBt; 148 BtLock *pLock = 0; 149 BtLock *pIter; 150 151 assert( sqlite3BtreeHoldsMutex(p) ); 152 153 /* This is a no-op if the shared-cache is not enabled */ 154 if( !p->sharable ){ 155 return SQLITE_OK; 156 } 157 158 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) ); 159 160 /* If the read-uncommitted flag is set and a read-lock is requested, 161 ** return early without adding an entry to the BtShared.pLock list. See 162 ** comment in function queryTableLock() for more info on handling 163 ** the ReadUncommitted flag. 164 */ 165 if( 166 (p->db) && 167 (p->db->flags&SQLITE_ReadUncommitted) && 168 (eLock==READ_LOCK) && 169 iTable!=MASTER_ROOT 170 ){ 171 return SQLITE_OK; 172 } 173 174 /* First search the list for an existing lock on this table. */ 175 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 176 if( pIter->iTable==iTable && pIter->pBtree==p ){ 177 pLock = pIter; 178 break; 179 } 180 } 181 182 /* If the above search did not find a BtLock struct associating Btree p 183 ** with table iTable, allocate one and link it into the list. 184 */ 185 if( !pLock ){ 186 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 187 if( !pLock ){ 188 return SQLITE_NOMEM; 189 } 190 pLock->iTable = iTable; 191 pLock->pBtree = p; 192 pLock->pNext = pBt->pLock; 193 pBt->pLock = pLock; 194 } 195 196 /* Set the BtLock.eLock variable to the maximum of the current lock 197 ** and the requested lock. This means if a write-lock was already held 198 ** and a read-lock requested, we don't incorrectly downgrade the lock. 199 */ 200 assert( WRITE_LOCK>READ_LOCK ); 201 if( eLock>pLock->eLock ){ 202 pLock->eLock = eLock; 203 } 204 205 return SQLITE_OK; 206 } 207 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 208 209 #ifndef SQLITE_OMIT_SHARED_CACHE 210 /* 211 ** Release all the table locks (locks obtained via calls to the lockTable() 212 ** procedure) held by Btree handle p. 213 */ 214 static void unlockAllTables(Btree *p){ 215 BtLock **ppIter = &p->pBt->pLock; 216 217 assert( sqlite3BtreeHoldsMutex(p) ); 218 assert( p->sharable || 0==*ppIter ); 219 220 while( *ppIter ){ 221 BtLock *pLock = *ppIter; 222 if( pLock->pBtree==p ){ 223 *ppIter = pLock->pNext; 224 sqlite3_free(pLock); 225 }else{ 226 ppIter = &pLock->pNext; 227 } 228 } 229 } 230 #endif /* SQLITE_OMIT_SHARED_CACHE */ 231 232 static void releasePage(MemPage *pPage); /* Forward reference */ 233 234 /* 235 ** Verify that the cursor holds a mutex on the BtShared 236 */ 237 #ifndef NDEBUG 238 static int cursorHoldsMutex(BtCursor *p){ 239 return sqlite3_mutex_held(p->pBt->mutex); 240 } 241 #endif 242 243 244 #ifndef SQLITE_OMIT_INCRBLOB 245 /* 246 ** Invalidate the overflow page-list cache for cursor pCur, if any. 247 */ 248 static void invalidateOverflowCache(BtCursor *pCur){ 249 assert( cursorHoldsMutex(pCur) ); 250 sqlite3_free(pCur->aOverflow); 251 pCur->aOverflow = 0; 252 } 253 254 /* 255 ** Invalidate the overflow page-list cache for all cursors opened 256 ** on the shared btree structure pBt. 257 */ 258 static void invalidateAllOverflowCache(BtShared *pBt){ 259 BtCursor *p; 260 assert( sqlite3_mutex_held(pBt->mutex) ); 261 for(p=pBt->pCursor; p; p=p->pNext){ 262 invalidateOverflowCache(p); 263 } 264 } 265 #else 266 #define invalidateOverflowCache(x) 267 #define invalidateAllOverflowCache(x) 268 #endif 269 270 /* 271 ** Save the current cursor position in the variables BtCursor.nKey 272 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 273 */ 274 static int saveCursorPosition(BtCursor *pCur){ 275 int rc; 276 277 assert( CURSOR_VALID==pCur->eState ); 278 assert( 0==pCur->pKey ); 279 assert( cursorHoldsMutex(pCur) ); 280 281 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey); 282 283 /* If this is an intKey table, then the above call to BtreeKeySize() 284 ** stores the integer key in pCur->nKey. In this case this value is 285 ** all that is required. Otherwise, if pCur is not open on an intKey 286 ** table, then malloc space for and store the pCur->nKey bytes of key 287 ** data. 288 */ 289 if( rc==SQLITE_OK && 0==pCur->pPage->intKey){ 290 void *pKey = sqlite3_malloc(pCur->nKey); 291 if( pKey ){ 292 rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey); 293 if( rc==SQLITE_OK ){ 294 pCur->pKey = pKey; 295 }else{ 296 sqlite3_free(pKey); 297 } 298 }else{ 299 rc = SQLITE_NOMEM; 300 } 301 } 302 assert( !pCur->pPage->intKey || !pCur->pKey ); 303 304 if( rc==SQLITE_OK ){ 305 releasePage(pCur->pPage); 306 pCur->pPage = 0; 307 pCur->eState = CURSOR_REQUIRESEEK; 308 } 309 310 invalidateOverflowCache(pCur); 311 return rc; 312 } 313 314 /* 315 ** Save the positions of all cursors except pExcept open on the table 316 ** with root-page iRoot. Usually, this is called just before cursor 317 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()). 318 */ 319 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 320 BtCursor *p; 321 assert( sqlite3_mutex_held(pBt->mutex) ); 322 assert( pExcept==0 || pExcept->pBt==pBt ); 323 for(p=pBt->pCursor; p; p=p->pNext){ 324 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 325 p->eState==CURSOR_VALID ){ 326 int rc = saveCursorPosition(p); 327 if( SQLITE_OK!=rc ){ 328 return rc; 329 } 330 } 331 } 332 return SQLITE_OK; 333 } 334 335 /* 336 ** Clear the current cursor position. 337 */ 338 static void clearCursorPosition(BtCursor *pCur){ 339 assert( cursorHoldsMutex(pCur) ); 340 sqlite3_free(pCur->pKey); 341 pCur->pKey = 0; 342 pCur->eState = CURSOR_INVALID; 343 } 344 345 /* 346 ** Restore the cursor to the position it was in (or as close to as possible) 347 ** when saveCursorPosition() was called. Note that this call deletes the 348 ** saved position info stored by saveCursorPosition(), so there can be 349 ** at most one effective restoreOrClearCursorPosition() call after each 350 ** saveCursorPosition(). 351 ** 352 ** If the second argument argument - doSeek - is false, then instead of 353 ** returning the cursor to its saved position, any saved position is deleted 354 ** and the cursor state set to CURSOR_INVALID. 355 */ 356 int sqlite3BtreeRestoreOrClearCursorPosition(BtCursor *pCur){ 357 int rc; 358 assert( cursorHoldsMutex(pCur) ); 359 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 360 if( pCur->eState==CURSOR_FAULT ){ 361 return pCur->skip; 362 } 363 #ifndef SQLITE_OMIT_INCRBLOB 364 if( pCur->isIncrblobHandle ){ 365 return SQLITE_ABORT; 366 } 367 #endif 368 pCur->eState = CURSOR_INVALID; 369 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip); 370 if( rc==SQLITE_OK ){ 371 sqlite3_free(pCur->pKey); 372 pCur->pKey = 0; 373 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 374 } 375 return rc; 376 } 377 378 #define restoreOrClearCursorPosition(p) \ 379 (p->eState>=CURSOR_REQUIRESEEK ? \ 380 sqlite3BtreeRestoreOrClearCursorPosition(p) : \ 381 SQLITE_OK) 382 383 #ifndef SQLITE_OMIT_AUTOVACUUM 384 /* 385 ** Given a page number of a regular database page, return the page 386 ** number for the pointer-map page that contains the entry for the 387 ** input page number. 388 */ 389 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 390 int nPagesPerMapPage, iPtrMap, ret; 391 assert( sqlite3_mutex_held(pBt->mutex) ); 392 nPagesPerMapPage = (pBt->usableSize/5)+1; 393 iPtrMap = (pgno-2)/nPagesPerMapPage; 394 ret = (iPtrMap*nPagesPerMapPage) + 2; 395 if( ret==PENDING_BYTE_PAGE(pBt) ){ 396 ret++; 397 } 398 return ret; 399 } 400 401 /* 402 ** Write an entry into the pointer map. 403 ** 404 ** This routine updates the pointer map entry for page number 'key' 405 ** so that it maps to type 'eType' and parent page number 'pgno'. 406 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 407 */ 408 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){ 409 DbPage *pDbPage; /* The pointer map page */ 410 u8 *pPtrmap; /* The pointer map data */ 411 Pgno iPtrmap; /* The pointer map page number */ 412 int offset; /* Offset in pointer map page */ 413 int rc; 414 415 assert( sqlite3_mutex_held(pBt->mutex) ); 416 /* The master-journal page number must never be used as a pointer map page */ 417 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 418 419 assert( pBt->autoVacuum ); 420 if( key==0 ){ 421 return SQLITE_CORRUPT_BKPT; 422 } 423 iPtrmap = PTRMAP_PAGENO(pBt, key); 424 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 425 if( rc!=SQLITE_OK ){ 426 return rc; 427 } 428 offset = PTRMAP_PTROFFSET(pBt, key); 429 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 430 431 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 432 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); 433 rc = sqlite3PagerWrite(pDbPage); 434 if( rc==SQLITE_OK ){ 435 pPtrmap[offset] = eType; 436 put4byte(&pPtrmap[offset+1], parent); 437 } 438 } 439 440 sqlite3PagerUnref(pDbPage); 441 return rc; 442 } 443 444 /* 445 ** Read an entry from the pointer map. 446 ** 447 ** This routine retrieves the pointer map entry for page 'key', writing 448 ** the type and parent page number to *pEType and *pPgno respectively. 449 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 450 */ 451 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 452 DbPage *pDbPage; /* The pointer map page */ 453 int iPtrmap; /* Pointer map page index */ 454 u8 *pPtrmap; /* Pointer map page data */ 455 int offset; /* Offset of entry in pointer map */ 456 int rc; 457 458 assert( sqlite3_mutex_held(pBt->mutex) ); 459 460 iPtrmap = PTRMAP_PAGENO(pBt, key); 461 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 462 if( rc!=0 ){ 463 return rc; 464 } 465 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 466 467 offset = PTRMAP_PTROFFSET(pBt, key); 468 assert( pEType!=0 ); 469 *pEType = pPtrmap[offset]; 470 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 471 472 sqlite3PagerUnref(pDbPage); 473 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; 474 return SQLITE_OK; 475 } 476 477 #endif /* SQLITE_OMIT_AUTOVACUUM */ 478 479 /* 480 ** Given a btree page and a cell index (0 means the first cell on 481 ** the page, 1 means the second cell, and so forth) return a pointer 482 ** to the cell content. 483 ** 484 ** This routine works only for pages that do not contain overflow cells. 485 */ 486 #define findCell(pPage, iCell) \ 487 ((pPage)->aData + get2byte(&(pPage)->aData[(pPage)->cellOffset+2*(iCell)])) 488 #ifdef SQLITE_TEST 489 u8 *sqlite3BtreeFindCell(MemPage *pPage, int iCell){ 490 assert( iCell>=0 ); 491 assert( iCell<get2byte(&pPage->aData[pPage->hdrOffset+3]) ); 492 return findCell(pPage, iCell); 493 } 494 #endif 495 496 /* 497 ** This a more complex version of sqlite3BtreeFindCell() that works for 498 ** pages that do contain overflow cells. See insert 499 */ 500 static u8 *findOverflowCell(MemPage *pPage, int iCell){ 501 int i; 502 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 503 for(i=pPage->nOverflow-1; i>=0; i--){ 504 int k; 505 struct _OvflCell *pOvfl; 506 pOvfl = &pPage->aOvfl[i]; 507 k = pOvfl->idx; 508 if( k<=iCell ){ 509 if( k==iCell ){ 510 return pOvfl->pCell; 511 } 512 iCell--; 513 } 514 } 515 return findCell(pPage, iCell); 516 } 517 518 /* 519 ** Parse a cell content block and fill in the CellInfo structure. There 520 ** are two versions of this function. sqlite3BtreeParseCell() takes a 521 ** cell index as the second argument and sqlite3BtreeParseCellPtr() 522 ** takes a pointer to the body of the cell as its second argument. 523 ** 524 ** Within this file, the parseCell() macro can be called instead of 525 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster. 526 */ 527 void sqlite3BtreeParseCellPtr( 528 MemPage *pPage, /* Page containing the cell */ 529 u8 *pCell, /* Pointer to the cell text. */ 530 CellInfo *pInfo /* Fill in this structure */ 531 ){ 532 int n; /* Number bytes in cell content header */ 533 u32 nPayload; /* Number of bytes of cell payload */ 534 535 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 536 537 pInfo->pCell = pCell; 538 assert( pPage->leaf==0 || pPage->leaf==1 ); 539 n = pPage->childPtrSize; 540 assert( n==4-4*pPage->leaf ); 541 if( pPage->hasData ){ 542 n += getVarint32(&pCell[n], &nPayload); 543 }else{ 544 nPayload = 0; 545 } 546 pInfo->nData = nPayload; 547 if( pPage->intKey ){ 548 n += getVarint(&pCell[n], (u64 *)&pInfo->nKey); 549 }else{ 550 u32 x; 551 n += getVarint32(&pCell[n], &x); 552 pInfo->nKey = x; 553 nPayload += x; 554 } 555 pInfo->nPayload = nPayload; 556 pInfo->nHeader = n; 557 if( nPayload<=pPage->maxLocal ){ 558 /* This is the (easy) common case where the entire payload fits 559 ** on the local page. No overflow is required. 560 */ 561 int nSize; /* Total size of cell content in bytes */ 562 pInfo->nLocal = nPayload; 563 pInfo->iOverflow = 0; 564 nSize = nPayload + n; 565 if( nSize<4 ){ 566 nSize = 4; /* Minimum cell size is 4 */ 567 } 568 pInfo->nSize = nSize; 569 }else{ 570 /* If the payload will not fit completely on the local page, we have 571 ** to decide how much to store locally and how much to spill onto 572 ** overflow pages. The strategy is to minimize the amount of unused 573 ** space on overflow pages while keeping the amount of local storage 574 ** in between minLocal and maxLocal. 575 ** 576 ** Warning: changing the way overflow payload is distributed in any 577 ** way will result in an incompatible file format. 578 */ 579 int minLocal; /* Minimum amount of payload held locally */ 580 int maxLocal; /* Maximum amount of payload held locally */ 581 int surplus; /* Overflow payload available for local storage */ 582 583 minLocal = pPage->minLocal; 584 maxLocal = pPage->maxLocal; 585 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4); 586 if( surplus <= maxLocal ){ 587 pInfo->nLocal = surplus; 588 }else{ 589 pInfo->nLocal = minLocal; 590 } 591 pInfo->iOverflow = pInfo->nLocal + n; 592 pInfo->nSize = pInfo->iOverflow + 4; 593 } 594 } 595 #define parseCell(pPage, iCell, pInfo) \ 596 sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) 597 void sqlite3BtreeParseCell( 598 MemPage *pPage, /* Page containing the cell */ 599 int iCell, /* The cell index. First cell is 0 */ 600 CellInfo *pInfo /* Fill in this structure */ 601 ){ 602 parseCell(pPage, iCell, pInfo); 603 } 604 605 /* 606 ** Compute the total number of bytes that a Cell needs in the cell 607 ** data area of the btree-page. The return number includes the cell 608 ** data header and the local payload, but not any overflow page or 609 ** the space used by the cell pointer. 610 */ 611 #ifndef NDEBUG 612 static int cellSize(MemPage *pPage, int iCell){ 613 CellInfo info; 614 sqlite3BtreeParseCell(pPage, iCell, &info); 615 return info.nSize; 616 } 617 #endif 618 static int cellSizePtr(MemPage *pPage, u8 *pCell){ 619 CellInfo info; 620 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 621 return info.nSize; 622 } 623 624 #ifndef SQLITE_OMIT_AUTOVACUUM 625 /* 626 ** If the cell pCell, part of page pPage contains a pointer 627 ** to an overflow page, insert an entry into the pointer-map 628 ** for the overflow page. 629 */ 630 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){ 631 if( pCell ){ 632 CellInfo info; 633 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 634 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); 635 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ 636 Pgno ovfl = get4byte(&pCell[info.iOverflow]); 637 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno); 638 } 639 } 640 return SQLITE_OK; 641 } 642 /* 643 ** If the cell with index iCell on page pPage contains a pointer 644 ** to an overflow page, insert an entry into the pointer-map 645 ** for the overflow page. 646 */ 647 static int ptrmapPutOvfl(MemPage *pPage, int iCell){ 648 u8 *pCell; 649 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 650 pCell = findOverflowCell(pPage, iCell); 651 return ptrmapPutOvflPtr(pPage, pCell); 652 } 653 #endif 654 655 656 /* 657 ** Defragment the page given. All Cells are moved to the 658 ** end of the page and all free space is collected into one 659 ** big FreeBlk that occurs in between the header and cell 660 ** pointer array and the cell content area. 661 */ 662 static int defragmentPage(MemPage *pPage){ 663 int i; /* Loop counter */ 664 int pc; /* Address of a i-th cell */ 665 int addr; /* Offset of first byte after cell pointer array */ 666 int hdr; /* Offset to the page header */ 667 int size; /* Size of a cell */ 668 int usableSize; /* Number of usable bytes on a page */ 669 int cellOffset; /* Offset to the cell pointer array */ 670 int brk; /* Offset to the cell content area */ 671 int nCell; /* Number of cells on the page */ 672 unsigned char *data; /* The page data */ 673 unsigned char *temp; /* Temp area for cell content */ 674 675 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 676 assert( pPage->pBt!=0 ); 677 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); 678 assert( pPage->nOverflow==0 ); 679 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 680 temp = sqlite3PagerTempSpace(pPage->pBt->pPager); 681 data = pPage->aData; 682 hdr = pPage->hdrOffset; 683 cellOffset = pPage->cellOffset; 684 nCell = pPage->nCell; 685 assert( nCell==get2byte(&data[hdr+3]) ); 686 usableSize = pPage->pBt->usableSize; 687 brk = get2byte(&data[hdr+5]); 688 memcpy(&temp[brk], &data[brk], usableSize - brk); 689 brk = usableSize; 690 for(i=0; i<nCell; i++){ 691 u8 *pAddr; /* The i-th cell pointer */ 692 pAddr = &data[cellOffset + i*2]; 693 pc = get2byte(pAddr); 694 assert( pc<pPage->pBt->usableSize ); 695 size = cellSizePtr(pPage, &temp[pc]); 696 brk -= size; 697 memcpy(&data[brk], &temp[pc], size); 698 put2byte(pAddr, brk); 699 } 700 assert( brk>=cellOffset+2*nCell ); 701 put2byte(&data[hdr+5], brk); 702 data[hdr+1] = 0; 703 data[hdr+2] = 0; 704 data[hdr+7] = 0; 705 addr = cellOffset+2*nCell; 706 memset(&data[addr], 0, brk-addr); 707 return SQLITE_OK; 708 } 709 710 /* 711 ** Allocate nByte bytes of space on a page. 712 ** 713 ** Return the index into pPage->aData[] of the first byte of 714 ** the new allocation. Or return 0 if there is not enough free 715 ** space on the page to satisfy the allocation request. 716 ** 717 ** If the page contains nBytes of free space but does not contain 718 ** nBytes of contiguous free space, then this routine automatically 719 ** calls defragementPage() to consolidate all free space before 720 ** allocating the new chunk. 721 */ 722 static int allocateSpace(MemPage *pPage, int nByte){ 723 int addr, pc, hdr; 724 int size; 725 int nFrag; 726 int top; 727 int nCell; 728 int cellOffset; 729 unsigned char *data; 730 731 data = pPage->aData; 732 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 733 assert( pPage->pBt ); 734 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 735 if( nByte<4 ) nByte = 4; 736 if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0; 737 pPage->nFree -= nByte; 738 hdr = pPage->hdrOffset; 739 740 nFrag = data[hdr+7]; 741 if( nFrag<60 ){ 742 /* Search the freelist looking for a slot big enough to satisfy the 743 ** space request. */ 744 addr = hdr+1; 745 while( (pc = get2byte(&data[addr]))>0 ){ 746 size = get2byte(&data[pc+2]); 747 if( size>=nByte ){ 748 if( size<nByte+4 ){ 749 memcpy(&data[addr], &data[pc], 2); 750 data[hdr+7] = nFrag + size - nByte; 751 return pc; 752 }else{ 753 put2byte(&data[pc+2], size-nByte); 754 return pc + size - nByte; 755 } 756 } 757 addr = pc; 758 } 759 } 760 761 /* Allocate memory from the gap in between the cell pointer array 762 ** and the cell content area. 763 */ 764 top = get2byte(&data[hdr+5]); 765 nCell = get2byte(&data[hdr+3]); 766 cellOffset = pPage->cellOffset; 767 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){ 768 if( defragmentPage(pPage) ) return 0; 769 top = get2byte(&data[hdr+5]); 770 } 771 top -= nByte; 772 assert( cellOffset + 2*nCell <= top ); 773 put2byte(&data[hdr+5], top); 774 return top; 775 } 776 777 /* 778 ** Return a section of the pPage->aData to the freelist. 779 ** The first byte of the new free block is pPage->aDisk[start] 780 ** and the size of the block is "size" bytes. 781 ** 782 ** Most of the effort here is involved in coalesing adjacent 783 ** free blocks into a single big free block. 784 */ 785 static void freeSpace(MemPage *pPage, int start, int size){ 786 int addr, pbegin, hdr; 787 unsigned char *data = pPage->aData; 788 789 assert( pPage->pBt!=0 ); 790 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 791 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) ); 792 assert( (start + size)<=pPage->pBt->usableSize ); 793 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 794 if( size<4 ) size = 4; 795 796 #ifdef SQLITE_SECURE_DELETE 797 /* Overwrite deleted information with zeros when the SECURE_DELETE 798 ** option is enabled at compile-time */ 799 memset(&data[start], 0, size); 800 #endif 801 802 /* Add the space back into the linked list of freeblocks */ 803 hdr = pPage->hdrOffset; 804 addr = hdr + 1; 805 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){ 806 assert( pbegin<=pPage->pBt->usableSize-4 ); 807 assert( pbegin>addr ); 808 addr = pbegin; 809 } 810 assert( pbegin<=pPage->pBt->usableSize-4 ); 811 assert( pbegin>addr || pbegin==0 ); 812 put2byte(&data[addr], start); 813 put2byte(&data[start], pbegin); 814 put2byte(&data[start+2], size); 815 pPage->nFree += size; 816 817 /* Coalesce adjacent free blocks */ 818 addr = pPage->hdrOffset + 1; 819 while( (pbegin = get2byte(&data[addr]))>0 ){ 820 int pnext, psize; 821 assert( pbegin>addr ); 822 assert( pbegin<=pPage->pBt->usableSize-4 ); 823 pnext = get2byte(&data[pbegin]); 824 psize = get2byte(&data[pbegin+2]); 825 if( pbegin + psize + 3 >= pnext && pnext>0 ){ 826 int frag = pnext - (pbegin+psize); 827 assert( frag<=data[pPage->hdrOffset+7] ); 828 data[pPage->hdrOffset+7] -= frag; 829 put2byte(&data[pbegin], get2byte(&data[pnext])); 830 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin); 831 }else{ 832 addr = pbegin; 833 } 834 } 835 836 /* If the cell content area begins with a freeblock, remove it. */ 837 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){ 838 int top; 839 pbegin = get2byte(&data[hdr+1]); 840 memcpy(&data[hdr+1], &data[pbegin], 2); 841 top = get2byte(&data[hdr+5]); 842 put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2])); 843 } 844 } 845 846 /* 847 ** Decode the flags byte (the first byte of the header) for a page 848 ** and initialize fields of the MemPage structure accordingly. 849 */ 850 static void decodeFlags(MemPage *pPage, int flagByte){ 851 BtShared *pBt; /* A copy of pPage->pBt */ 852 853 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) ); 854 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 855 pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0; 856 pPage->zeroData = (flagByte & PTF_ZERODATA)!=0; 857 pPage->leaf = (flagByte & PTF_LEAF)!=0; 858 pPage->childPtrSize = 4*(pPage->leaf==0); 859 pBt = pPage->pBt; 860 if( flagByte & PTF_LEAFDATA ){ 861 pPage->leafData = 1; 862 pPage->maxLocal = pBt->maxLeaf; 863 pPage->minLocal = pBt->minLeaf; 864 }else{ 865 pPage->leafData = 0; 866 pPage->maxLocal = pBt->maxLocal; 867 pPage->minLocal = pBt->minLocal; 868 } 869 pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData)); 870 } 871 872 /* 873 ** Initialize the auxiliary information for a disk block. 874 ** 875 ** The pParent parameter must be a pointer to the MemPage which 876 ** is the parent of the page being initialized. The root of a 877 ** BTree has no parent and so for that page, pParent==NULL. 878 ** 879 ** Return SQLITE_OK on success. If we see that the page does 880 ** not contain a well-formed database page, then return 881 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 882 ** guarantee that the page is well-formed. It only shows that 883 ** we failed to detect any corruption. 884 */ 885 int sqlite3BtreeInitPage( 886 MemPage *pPage, /* The page to be initialized */ 887 MemPage *pParent /* The parent. Might be NULL */ 888 ){ 889 int pc; /* Address of a freeblock within pPage->aData[] */ 890 int hdr; /* Offset to beginning of page header */ 891 u8 *data; /* Equal to pPage->aData */ 892 BtShared *pBt; /* The main btree structure */ 893 int usableSize; /* Amount of usable space on each page */ 894 int cellOffset; /* Offset from start of page to first cell pointer */ 895 int nFree; /* Number of unused bytes on the page */ 896 int top; /* First byte of the cell content area */ 897 898 pBt = pPage->pBt; 899 assert( pBt!=0 ); 900 assert( pParent==0 || pParent->pBt==pBt ); 901 assert( sqlite3_mutex_held(pBt->mutex) ); 902 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 903 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 904 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 905 if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){ 906 /* The parent page should never change unless the file is corrupt */ 907 return SQLITE_CORRUPT_BKPT; 908 } 909 if( pPage->isInit ) return SQLITE_OK; 910 if( pPage->pParent==0 && pParent!=0 ){ 911 pPage->pParent = pParent; 912 sqlite3PagerRef(pParent->pDbPage); 913 } 914 hdr = pPage->hdrOffset; 915 data = pPage->aData; 916 decodeFlags(pPage, data[hdr]); 917 pPage->nOverflow = 0; 918 pPage->idxShift = 0; 919 usableSize = pBt->usableSize; 920 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; 921 top = get2byte(&data[hdr+5]); 922 pPage->nCell = get2byte(&data[hdr+3]); 923 if( pPage->nCell>MX_CELL(pBt) ){ 924 /* To many cells for a single page. The page must be corrupt */ 925 return SQLITE_CORRUPT_BKPT; 926 } 927 if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){ 928 /* All pages must have at least one cell, except for root pages */ 929 return SQLITE_CORRUPT_BKPT; 930 } 931 932 /* Compute the total free space on the page */ 933 pc = get2byte(&data[hdr+1]); 934 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell); 935 while( pc>0 ){ 936 int next, size; 937 if( pc>usableSize-4 ){ 938 /* Free block is off the page */ 939 return SQLITE_CORRUPT_BKPT; 940 } 941 next = get2byte(&data[pc]); 942 size = get2byte(&data[pc+2]); 943 if( next>0 && next<=pc+size+3 ){ 944 /* Free blocks must be in accending order */ 945 return SQLITE_CORRUPT_BKPT; 946 } 947 nFree += size; 948 pc = next; 949 } 950 pPage->nFree = nFree; 951 if( nFree>=usableSize ){ 952 /* Free space cannot exceed total page size */ 953 return SQLITE_CORRUPT_BKPT; 954 } 955 956 pPage->isInit = 1; 957 return SQLITE_OK; 958 } 959 960 /* 961 ** Set up a raw page so that it looks like a database page holding 962 ** no entries. 963 */ 964 static void zeroPage(MemPage *pPage, int flags){ 965 unsigned char *data = pPage->aData; 966 BtShared *pBt = pPage->pBt; 967 int hdr = pPage->hdrOffset; 968 int first; 969 970 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); 971 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 972 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 973 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 974 assert( sqlite3_mutex_held(pBt->mutex) ); 975 memset(&data[hdr], 0, pBt->usableSize - hdr); 976 data[hdr] = flags; 977 first = hdr + 8 + 4*((flags&PTF_LEAF)==0); 978 memset(&data[hdr+1], 0, 4); 979 data[hdr+7] = 0; 980 put2byte(&data[hdr+5], pBt->usableSize); 981 pPage->nFree = pBt->usableSize - first; 982 decodeFlags(pPage, flags); 983 pPage->hdrOffset = hdr; 984 pPage->cellOffset = first; 985 pPage->nOverflow = 0; 986 pPage->idxShift = 0; 987 pPage->nCell = 0; 988 pPage->isInit = 1; 989 } 990 991 /* 992 ** Get a page from the pager. Initialize the MemPage.pBt and 993 ** MemPage.aData elements if needed. 994 ** 995 ** If the noContent flag is set, it means that we do not care about 996 ** the content of the page at this time. So do not go to the disk 997 ** to fetch the content. Just fill in the content with zeros for now. 998 ** If in the future we call sqlite3PagerWrite() on this page, that 999 ** means we have started to be concerned about content and the disk 1000 ** read should occur at that point. 1001 */ 1002 int sqlite3BtreeGetPage( 1003 BtShared *pBt, /* The btree */ 1004 Pgno pgno, /* Number of the page to fetch */ 1005 MemPage **ppPage, /* Return the page in this parameter */ 1006 int noContent /* Do not load page content if true */ 1007 ){ 1008 int rc; 1009 MemPage *pPage; 1010 DbPage *pDbPage; 1011 1012 assert( sqlite3_mutex_held(pBt->mutex) ); 1013 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent); 1014 if( rc ) return rc; 1015 pPage = (MemPage *)sqlite3PagerGetExtra(pDbPage); 1016 pPage->aData = sqlite3PagerGetData(pDbPage); 1017 pPage->pDbPage = pDbPage; 1018 pPage->pBt = pBt; 1019 pPage->pgno = pgno; 1020 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; 1021 *ppPage = pPage; 1022 return SQLITE_OK; 1023 } 1024 1025 /* 1026 ** Get a page from the pager and initialize it. This routine 1027 ** is just a convenience wrapper around separate calls to 1028 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage(). 1029 */ 1030 static int getAndInitPage( 1031 BtShared *pBt, /* The database file */ 1032 Pgno pgno, /* Number of the page to get */ 1033 MemPage **ppPage, /* Write the page pointer here */ 1034 MemPage *pParent /* Parent of the page */ 1035 ){ 1036 int rc; 1037 assert( sqlite3_mutex_held(pBt->mutex) ); 1038 if( pgno==0 ){ 1039 return SQLITE_CORRUPT_BKPT; 1040 } 1041 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0); 1042 if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){ 1043 rc = sqlite3BtreeInitPage(*ppPage, pParent); 1044 } 1045 return rc; 1046 } 1047 1048 /* 1049 ** Release a MemPage. This should be called once for each prior 1050 ** call to sqlite3BtreeGetPage. 1051 */ 1052 static void releasePage(MemPage *pPage){ 1053 if( pPage ){ 1054 assert( pPage->aData ); 1055 assert( pPage->pBt ); 1056 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1057 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 1058 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1059 sqlite3PagerUnref(pPage->pDbPage); 1060 } 1061 } 1062 1063 /* 1064 ** This routine is called when the reference count for a page 1065 ** reaches zero. We need to unref the pParent pointer when that 1066 ** happens. 1067 */ 1068 static void pageDestructor(DbPage *pData, int pageSize){ 1069 MemPage *pPage; 1070 assert( (pageSize & 7)==0 ); 1071 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 1072 assert( pPage->isInit==0 || sqlite3_mutex_held(pPage->pBt->mutex) ); 1073 if( pPage->pParent ){ 1074 MemPage *pParent = pPage->pParent; 1075 assert( pParent->pBt==pPage->pBt ); 1076 pPage->pParent = 0; 1077 releasePage(pParent); 1078 } 1079 pPage->isInit = 0; 1080 } 1081 1082 /* 1083 ** During a rollback, when the pager reloads information into the cache 1084 ** so that the cache is restored to its original state at the start of 1085 ** the transaction, for each page restored this routine is called. 1086 ** 1087 ** This routine needs to reset the extra data section at the end of the 1088 ** page to agree with the restored data. 1089 */ 1090 static void pageReinit(DbPage *pData, int pageSize){ 1091 MemPage *pPage; 1092 assert( (pageSize & 7)==0 ); 1093 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 1094 if( pPage->isInit ){ 1095 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1096 pPage->isInit = 0; 1097 sqlite3BtreeInitPage(pPage, pPage->pParent); 1098 } 1099 } 1100 1101 /* 1102 ** Invoke the busy handler for a btree. 1103 */ 1104 static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){ 1105 BtShared *pBt = (BtShared*)pArg; 1106 assert( pBt->db ); 1107 assert( sqlite3_mutex_held(pBt->db->mutex) ); 1108 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); 1109 } 1110 1111 /* 1112 ** Open a database file. 1113 ** 1114 ** zFilename is the name of the database file. If zFilename is NULL 1115 ** a new database with a random name is created. This randomly named 1116 ** database file will be deleted when sqlite3BtreeClose() is called. 1117 ** If zFilename is ":memory:" then an in-memory database is created 1118 ** that is automatically destroyed when it is closed. 1119 */ 1120 int sqlite3BtreeOpen( 1121 const char *zFilename, /* Name of the file containing the BTree database */ 1122 sqlite3 *db, /* Associated database handle */ 1123 Btree **ppBtree, /* Pointer to new Btree object written here */ 1124 int flags, /* Options */ 1125 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ 1126 ){ 1127 sqlite3_vfs *pVfs; /* The VFS to use for this btree */ 1128 BtShared *pBt = 0; /* Shared part of btree structure */ 1129 Btree *p; /* Handle to return */ 1130 int rc = SQLITE_OK; 1131 int nReserve; 1132 unsigned char zDbHeader[100]; 1133 1134 /* Set the variable isMemdb to true for an in-memory database, or 1135 ** false for a file-based database. This symbol is only required if 1136 ** either of the shared-data or autovacuum features are compiled 1137 ** into the library. 1138 */ 1139 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM) 1140 #ifdef SQLITE_OMIT_MEMORYDB 1141 const int isMemdb = 0; 1142 #else 1143 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:"); 1144 #endif 1145 #endif 1146 1147 assert( db!=0 ); 1148 assert( sqlite3_mutex_held(db->mutex) ); 1149 1150 pVfs = db->pVfs; 1151 p = sqlite3MallocZero(sizeof(Btree)); 1152 if( !p ){ 1153 return SQLITE_NOMEM; 1154 } 1155 p->inTrans = TRANS_NONE; 1156 p->db = db; 1157 1158 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1159 /* 1160 ** If this Btree is a candidate for shared cache, try to find an 1161 ** existing BtShared object that we can share with 1162 */ 1163 if( (flags & BTREE_PRIVATE)==0 1164 && isMemdb==0 1165 && (db->flags & SQLITE_Vtab)==0 1166 && zFilename && zFilename[0] 1167 ){ 1168 if( sqlite3SharedCacheEnabled ){ 1169 int nFullPathname = pVfs->mxPathname+1; 1170 char *zFullPathname = (char *)sqlite3_malloc(nFullPathname); 1171 sqlite3_mutex *mutexShared; 1172 p->sharable = 1; 1173 if( db ){ 1174 db->flags |= SQLITE_SharedCache; 1175 } 1176 if( !zFullPathname ){ 1177 sqlite3_free(p); 1178 return SQLITE_NOMEM; 1179 } 1180 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname); 1181 mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER); 1182 sqlite3_mutex_enter(mutexShared); 1183 for(pBt=sqlite3SharedCacheList; pBt; pBt=pBt->pNext){ 1184 assert( pBt->nRef>0 ); 1185 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager)) 1186 && sqlite3PagerVfs(pBt->pPager)==pVfs ){ 1187 p->pBt = pBt; 1188 pBt->nRef++; 1189 break; 1190 } 1191 } 1192 sqlite3_mutex_leave(mutexShared); 1193 sqlite3_free(zFullPathname); 1194 } 1195 #ifdef SQLITE_DEBUG 1196 else{ 1197 /* In debug mode, we mark all persistent databases as sharable 1198 ** even when they are not. This exercises the locking code and 1199 ** gives more opportunity for asserts(sqlite3_mutex_held()) 1200 ** statements to find locking problems. 1201 */ 1202 p->sharable = 1; 1203 } 1204 #endif 1205 } 1206 #endif 1207 if( pBt==0 ){ 1208 /* 1209 ** The following asserts make sure that structures used by the btree are 1210 ** the right size. This is to guard against size changes that result 1211 ** when compiling on a different architecture. 1212 */ 1213 assert( sizeof(i64)==8 || sizeof(i64)==4 ); 1214 assert( sizeof(u64)==8 || sizeof(u64)==4 ); 1215 assert( sizeof(u32)==4 ); 1216 assert( sizeof(u16)==2 ); 1217 assert( sizeof(Pgno)==4 ); 1218 1219 pBt = sqlite3MallocZero( sizeof(*pBt) ); 1220 if( pBt==0 ){ 1221 rc = SQLITE_NOMEM; 1222 goto btree_open_out; 1223 } 1224 pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler; 1225 pBt->busyHdr.pArg = pBt; 1226 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, 1227 EXTRA_SIZE, flags, vfsFlags); 1228 if( rc==SQLITE_OK ){ 1229 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); 1230 } 1231 if( rc!=SQLITE_OK ){ 1232 goto btree_open_out; 1233 } 1234 sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr); 1235 p->pBt = pBt; 1236 1237 sqlite3PagerSetDestructor(pBt->pPager, pageDestructor); 1238 sqlite3PagerSetReiniter(pBt->pPager, pageReinit); 1239 pBt->pCursor = 0; 1240 pBt->pPage1 = 0; 1241 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager); 1242 pBt->pageSize = get2byte(&zDbHeader[16]); 1243 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE 1244 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ 1245 pBt->pageSize = 0; 1246 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 1247 pBt->maxEmbedFrac = 64; /* 25% */ 1248 pBt->minEmbedFrac = 32; /* 12.5% */ 1249 pBt->minLeafFrac = 32; /* 12.5% */ 1250 #ifndef SQLITE_OMIT_AUTOVACUUM 1251 /* If the magic name ":memory:" will create an in-memory database, then 1252 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if 1253 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if 1254 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a 1255 ** regular file-name. In this case the auto-vacuum applies as per normal. 1256 */ 1257 if( zFilename && !isMemdb ){ 1258 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); 1259 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); 1260 } 1261 #endif 1262 nReserve = 0; 1263 }else{ 1264 nReserve = zDbHeader[20]; 1265 pBt->maxEmbedFrac = zDbHeader[21]; 1266 pBt->minEmbedFrac = zDbHeader[22]; 1267 pBt->minLeafFrac = zDbHeader[23]; 1268 pBt->pageSizeFixed = 1; 1269 #ifndef SQLITE_OMIT_AUTOVACUUM 1270 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); 1271 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); 1272 #endif 1273 } 1274 pBt->usableSize = pBt->pageSize - nReserve; 1275 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ 1276 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 1277 1278 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1279 /* Add the new BtShared object to the linked list sharable BtShareds. 1280 */ 1281 if( p->sharable ){ 1282 sqlite3_mutex *mutexShared; 1283 pBt->nRef = 1; 1284 mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER); 1285 if( SQLITE_THREADSAFE ){ 1286 pBt->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST); 1287 if( pBt->mutex==0 ){ 1288 rc = SQLITE_NOMEM; 1289 db->mallocFailed = 0; 1290 goto btree_open_out; 1291 } 1292 } 1293 sqlite3_mutex_enter(mutexShared); 1294 pBt->pNext = sqlite3SharedCacheList; 1295 sqlite3SharedCacheList = pBt; 1296 sqlite3_mutex_leave(mutexShared); 1297 } 1298 #endif 1299 } 1300 1301 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1302 /* If the new Btree uses a sharable pBtShared, then link the new 1303 ** Btree into the list of all sharable Btrees for the same connection. 1304 ** The list is kept in ascending order by pBt address. 1305 */ 1306 if( p->sharable ){ 1307 int i; 1308 Btree *pSib; 1309 for(i=0; i<db->nDb; i++){ 1310 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ 1311 while( pSib->pPrev ){ pSib = pSib->pPrev; } 1312 if( p->pBt<pSib->pBt ){ 1313 p->pNext = pSib; 1314 p->pPrev = 0; 1315 pSib->pPrev = p; 1316 }else{ 1317 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){ 1318 pSib = pSib->pNext; 1319 } 1320 p->pNext = pSib->pNext; 1321 p->pPrev = pSib; 1322 if( p->pNext ){ 1323 p->pNext->pPrev = p; 1324 } 1325 pSib->pNext = p; 1326 } 1327 break; 1328 } 1329 } 1330 } 1331 #endif 1332 *ppBtree = p; 1333 1334 btree_open_out: 1335 if( rc!=SQLITE_OK ){ 1336 if( pBt && pBt->pPager ){ 1337 sqlite3PagerClose(pBt->pPager); 1338 } 1339 sqlite3_free(pBt); 1340 sqlite3_free(p); 1341 *ppBtree = 0; 1342 } 1343 return rc; 1344 } 1345 1346 /* 1347 ** Decrement the BtShared.nRef counter. When it reaches zero, 1348 ** remove the BtShared structure from the sharing list. Return 1349 ** true if the BtShared.nRef counter reaches zero and return 1350 ** false if it is still positive. 1351 */ 1352 static int removeFromSharingList(BtShared *pBt){ 1353 #ifndef SQLITE_OMIT_SHARED_CACHE 1354 sqlite3_mutex *pMaster; 1355 BtShared *pList; 1356 int removed = 0; 1357 1358 assert( sqlite3_mutex_notheld(pBt->mutex) ); 1359 pMaster = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER); 1360 sqlite3_mutex_enter(pMaster); 1361 pBt->nRef--; 1362 if( pBt->nRef<=0 ){ 1363 if( sqlite3SharedCacheList==pBt ){ 1364 sqlite3SharedCacheList = pBt->pNext; 1365 }else{ 1366 pList = sqlite3SharedCacheList; 1367 while( pList && pList->pNext!=pBt ){ 1368 pList=pList->pNext; 1369 } 1370 if( pList ){ 1371 pList->pNext = pBt->pNext; 1372 } 1373 } 1374 if( SQLITE_THREADSAFE ){ 1375 sqlite3_mutex_free(pBt->mutex); 1376 } 1377 removed = 1; 1378 } 1379 sqlite3_mutex_leave(pMaster); 1380 return removed; 1381 #else 1382 return 1; 1383 #endif 1384 } 1385 1386 /* 1387 ** Close an open database and invalidate all cursors. 1388 */ 1389 int sqlite3BtreeClose(Btree *p){ 1390 BtShared *pBt = p->pBt; 1391 BtCursor *pCur; 1392 1393 /* Close all cursors opened via this handle. */ 1394 assert( sqlite3_mutex_held(p->db->mutex) ); 1395 sqlite3BtreeEnter(p); 1396 pBt->db = p->db; 1397 pCur = pBt->pCursor; 1398 while( pCur ){ 1399 BtCursor *pTmp = pCur; 1400 pCur = pCur->pNext; 1401 if( pTmp->pBtree==p ){ 1402 sqlite3BtreeCloseCursor(pTmp); 1403 } 1404 } 1405 1406 /* Rollback any active transaction and free the handle structure. 1407 ** The call to sqlite3BtreeRollback() drops any table-locks held by 1408 ** this handle. 1409 */ 1410 sqlite3BtreeRollback(p); 1411 sqlite3BtreeLeave(p); 1412 1413 /* If there are still other outstanding references to the shared-btree 1414 ** structure, return now. The remainder of this procedure cleans 1415 ** up the shared-btree. 1416 */ 1417 assert( p->wantToLock==0 && p->locked==0 ); 1418 if( !p->sharable || removeFromSharingList(pBt) ){ 1419 /* The pBt is no longer on the sharing list, so we can access 1420 ** it without having to hold the mutex. 1421 ** 1422 ** Clean out and delete the BtShared object. 1423 */ 1424 assert( !pBt->pCursor ); 1425 sqlite3PagerClose(pBt->pPager); 1426 if( pBt->xFreeSchema && pBt->pSchema ){ 1427 pBt->xFreeSchema(pBt->pSchema); 1428 } 1429 sqlite3_free(pBt->pSchema); 1430 sqlite3_free(pBt); 1431 } 1432 1433 #ifndef SQLITE_OMIT_SHARED_CACHE 1434 assert( p->wantToLock==0 ); 1435 assert( p->locked==0 ); 1436 if( p->pPrev ) p->pPrev->pNext = p->pNext; 1437 if( p->pNext ) p->pNext->pPrev = p->pPrev; 1438 #endif 1439 1440 sqlite3_free(p); 1441 return SQLITE_OK; 1442 } 1443 1444 /* 1445 ** Change the limit on the number of pages allowed in the cache. 1446 ** 1447 ** The maximum number of cache pages is set to the absolute 1448 ** value of mxPage. If mxPage is negative, the pager will 1449 ** operate asynchronously - it will not stop to do fsync()s 1450 ** to insure data is written to the disk surface before 1451 ** continuing. Transactions still work if synchronous is off, 1452 ** and the database cannot be corrupted if this program 1453 ** crashes. But if the operating system crashes or there is 1454 ** an abrupt power failure when synchronous is off, the database 1455 ** could be left in an inconsistent and unrecoverable state. 1456 ** Synchronous is on by default so database corruption is not 1457 ** normally a worry. 1458 */ 1459 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 1460 BtShared *pBt = p->pBt; 1461 assert( sqlite3_mutex_held(p->db->mutex) ); 1462 sqlite3BtreeEnter(p); 1463 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 1464 sqlite3BtreeLeave(p); 1465 return SQLITE_OK; 1466 } 1467 1468 /* 1469 ** Change the way data is synced to disk in order to increase or decrease 1470 ** how well the database resists damage due to OS crashes and power 1471 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 1472 ** there is a high probability of damage) Level 2 is the default. There 1473 ** is a very low but non-zero probability of damage. Level 3 reduces the 1474 ** probability of damage to near zero but with a write performance reduction. 1475 */ 1476 #ifndef SQLITE_OMIT_PAGER_PRAGMAS 1477 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){ 1478 BtShared *pBt = p->pBt; 1479 assert( sqlite3_mutex_held(p->db->mutex) ); 1480 sqlite3BtreeEnter(p); 1481 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync); 1482 sqlite3BtreeLeave(p); 1483 return SQLITE_OK; 1484 } 1485 #endif 1486 1487 /* 1488 ** Return TRUE if the given btree is set to safety level 1. In other 1489 ** words, return TRUE if no sync() occurs on the disk files. 1490 */ 1491 int sqlite3BtreeSyncDisabled(Btree *p){ 1492 BtShared *pBt = p->pBt; 1493 int rc; 1494 assert( sqlite3_mutex_held(p->db->mutex) ); 1495 sqlite3BtreeEnter(p); 1496 assert( pBt && pBt->pPager ); 1497 rc = sqlite3PagerNosync(pBt->pPager); 1498 sqlite3BtreeLeave(p); 1499 return rc; 1500 } 1501 1502 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) 1503 /* 1504 ** Change the default pages size and the number of reserved bytes per page. 1505 ** 1506 ** The page size must be a power of 2 between 512 and 65536. If the page 1507 ** size supplied does not meet this constraint then the page size is not 1508 ** changed. 1509 ** 1510 ** Page sizes are constrained to be a power of two so that the region 1511 ** of the database file used for locking (beginning at PENDING_BYTE, 1512 ** the first byte past the 1GB boundary, 0x40000000) needs to occur 1513 ** at the beginning of a page. 1514 ** 1515 ** If parameter nReserve is less than zero, then the number of reserved 1516 ** bytes per page is left unchanged. 1517 */ 1518 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){ 1519 int rc = SQLITE_OK; 1520 BtShared *pBt = p->pBt; 1521 sqlite3BtreeEnter(p); 1522 if( pBt->pageSizeFixed ){ 1523 sqlite3BtreeLeave(p); 1524 return SQLITE_READONLY; 1525 } 1526 if( nReserve<0 ){ 1527 nReserve = pBt->pageSize - pBt->usableSize; 1528 } 1529 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 1530 ((pageSize-1)&pageSize)==0 ){ 1531 assert( (pageSize & 7)==0 ); 1532 assert( !pBt->pPage1 && !pBt->pCursor ); 1533 pBt->pageSize = pageSize; 1534 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 1535 } 1536 pBt->usableSize = pBt->pageSize - nReserve; 1537 sqlite3BtreeLeave(p); 1538 return rc; 1539 } 1540 1541 /* 1542 ** Return the currently defined page size 1543 */ 1544 int sqlite3BtreeGetPageSize(Btree *p){ 1545 return p->pBt->pageSize; 1546 } 1547 int sqlite3BtreeGetReserve(Btree *p){ 1548 int n; 1549 sqlite3BtreeEnter(p); 1550 n = p->pBt->pageSize - p->pBt->usableSize; 1551 sqlite3BtreeLeave(p); 1552 return n; 1553 } 1554 1555 /* 1556 ** Set the maximum page count for a database if mxPage is positive. 1557 ** No changes are made if mxPage is 0 or negative. 1558 ** Regardless of the value of mxPage, return the maximum page count. 1559 */ 1560 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){ 1561 int n; 1562 sqlite3BtreeEnter(p); 1563 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 1564 sqlite3BtreeLeave(p); 1565 return n; 1566 } 1567 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */ 1568 1569 /* 1570 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 1571 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 1572 ** is disabled. The default value for the auto-vacuum property is 1573 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 1574 */ 1575 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 1576 #ifdef SQLITE_OMIT_AUTOVACUUM 1577 return SQLITE_READONLY; 1578 #else 1579 BtShared *pBt = p->pBt; 1580 int rc = SQLITE_OK; 1581 int av = (autoVacuum?1:0); 1582 1583 sqlite3BtreeEnter(p); 1584 if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){ 1585 rc = SQLITE_READONLY; 1586 }else{ 1587 pBt->autoVacuum = av; 1588 } 1589 sqlite3BtreeLeave(p); 1590 return rc; 1591 #endif 1592 } 1593 1594 /* 1595 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 1596 ** enabled 1 is returned. Otherwise 0. 1597 */ 1598 int sqlite3BtreeGetAutoVacuum(Btree *p){ 1599 #ifdef SQLITE_OMIT_AUTOVACUUM 1600 return BTREE_AUTOVACUUM_NONE; 1601 #else 1602 int rc; 1603 sqlite3BtreeEnter(p); 1604 rc = ( 1605 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 1606 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 1607 BTREE_AUTOVACUUM_INCR 1608 ); 1609 sqlite3BtreeLeave(p); 1610 return rc; 1611 #endif 1612 } 1613 1614 1615 /* 1616 ** Get a reference to pPage1 of the database file. This will 1617 ** also acquire a readlock on that file. 1618 ** 1619 ** SQLITE_OK is returned on success. If the file is not a 1620 ** well-formed database file, then SQLITE_CORRUPT is returned. 1621 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 1622 ** is returned if we run out of memory. 1623 */ 1624 static int lockBtree(BtShared *pBt){ 1625 int rc, pageSize; 1626 MemPage *pPage1; 1627 1628 assert( sqlite3_mutex_held(pBt->mutex) ); 1629 if( pBt->pPage1 ) return SQLITE_OK; 1630 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0); 1631 if( rc!=SQLITE_OK ) return rc; 1632 1633 1634 /* Do some checking to help insure the file we opened really is 1635 ** a valid database file. 1636 */ 1637 rc = SQLITE_NOTADB; 1638 if( sqlite3PagerPagecount(pBt->pPager)>0 ){ 1639 u8 *page1 = pPage1->aData; 1640 if( memcmp(page1, zMagicHeader, 16)!=0 ){ 1641 goto page1_init_failed; 1642 } 1643 if( page1[18]>1 ){ 1644 pBt->readOnly = 1; 1645 } 1646 if( page1[19]>1 ){ 1647 goto page1_init_failed; 1648 } 1649 pageSize = get2byte(&page1[16]); 1650 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 || 1651 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE) 1652 ){ 1653 goto page1_init_failed; 1654 } 1655 assert( (pageSize & 7)==0 ); 1656 pBt->pageSize = pageSize; 1657 pBt->usableSize = pageSize - page1[20]; 1658 if( pBt->usableSize<500 ){ 1659 goto page1_init_failed; 1660 } 1661 pBt->maxEmbedFrac = page1[21]; 1662 pBt->minEmbedFrac = page1[22]; 1663 pBt->minLeafFrac = page1[23]; 1664 #ifndef SQLITE_OMIT_AUTOVACUUM 1665 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); 1666 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); 1667 #endif 1668 } 1669 1670 /* maxLocal is the maximum amount of payload to store locally for 1671 ** a cell. Make sure it is small enough so that at least minFanout 1672 ** cells can will fit on one page. We assume a 10-byte page header. 1673 ** Besides the payload, the cell must store: 1674 ** 2-byte pointer to the cell 1675 ** 4-byte child pointer 1676 ** 9-byte nKey value 1677 ** 4-byte nData value 1678 ** 4-byte overflow page pointer 1679 ** So a cell consists of a 2-byte poiner, a header which is as much as 1680 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow 1681 ** page pointer. 1682 */ 1683 pBt->maxLocal = (pBt->usableSize-12)*pBt->maxEmbedFrac/255 - 23; 1684 pBt->minLocal = (pBt->usableSize-12)*pBt->minEmbedFrac/255 - 23; 1685 pBt->maxLeaf = pBt->usableSize - 35; 1686 pBt->minLeaf = (pBt->usableSize-12)*pBt->minLeafFrac/255 - 23; 1687 if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){ 1688 goto page1_init_failed; 1689 } 1690 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); 1691 pBt->pPage1 = pPage1; 1692 return SQLITE_OK; 1693 1694 page1_init_failed: 1695 releasePage(pPage1); 1696 pBt->pPage1 = 0; 1697 return rc; 1698 } 1699 1700 /* 1701 ** This routine works like lockBtree() except that it also invokes the 1702 ** busy callback if there is lock contention. 1703 */ 1704 static int lockBtreeWithRetry(Btree *pRef){ 1705 int rc = SQLITE_OK; 1706 1707 assert( sqlite3BtreeHoldsMutex(pRef) ); 1708 if( pRef->inTrans==TRANS_NONE ){ 1709 u8 inTransaction = pRef->pBt->inTransaction; 1710 btreeIntegrity(pRef); 1711 rc = sqlite3BtreeBeginTrans(pRef, 0); 1712 pRef->pBt->inTransaction = inTransaction; 1713 pRef->inTrans = TRANS_NONE; 1714 if( rc==SQLITE_OK ){ 1715 pRef->pBt->nTransaction--; 1716 } 1717 btreeIntegrity(pRef); 1718 } 1719 return rc; 1720 } 1721 1722 1723 /* 1724 ** If there are no outstanding cursors and we are not in the middle 1725 ** of a transaction but there is a read lock on the database, then 1726 ** this routine unrefs the first page of the database file which 1727 ** has the effect of releasing the read lock. 1728 ** 1729 ** If there are any outstanding cursors, this routine is a no-op. 1730 ** 1731 ** If there is a transaction in progress, this routine is a no-op. 1732 */ 1733 static void unlockBtreeIfUnused(BtShared *pBt){ 1734 assert( sqlite3_mutex_held(pBt->mutex) ); 1735 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){ 1736 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){ 1737 if( pBt->pPage1->aData==0 ){ 1738 MemPage *pPage = pBt->pPage1; 1739 pPage->aData = sqlite3PagerGetData(pPage->pDbPage); 1740 pPage->pBt = pBt; 1741 pPage->pgno = 1; 1742 } 1743 releasePage(pBt->pPage1); 1744 } 1745 pBt->pPage1 = 0; 1746 pBt->inStmt = 0; 1747 } 1748 } 1749 1750 /* 1751 ** Create a new database by initializing the first page of the 1752 ** file. 1753 */ 1754 static int newDatabase(BtShared *pBt){ 1755 MemPage *pP1; 1756 unsigned char *data; 1757 int rc; 1758 1759 assert( sqlite3_mutex_held(pBt->mutex) ); 1760 if( sqlite3PagerPagecount(pBt->pPager)>0 ) return SQLITE_OK; 1761 pP1 = pBt->pPage1; 1762 assert( pP1!=0 ); 1763 data = pP1->aData; 1764 rc = sqlite3PagerWrite(pP1->pDbPage); 1765 if( rc ) return rc; 1766 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 1767 assert( sizeof(zMagicHeader)==16 ); 1768 put2byte(&data[16], pBt->pageSize); 1769 data[18] = 1; 1770 data[19] = 1; 1771 data[20] = pBt->pageSize - pBt->usableSize; 1772 data[21] = pBt->maxEmbedFrac; 1773 data[22] = pBt->minEmbedFrac; 1774 data[23] = pBt->minLeafFrac; 1775 memset(&data[24], 0, 100-24); 1776 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 1777 pBt->pageSizeFixed = 1; 1778 #ifndef SQLITE_OMIT_AUTOVACUUM 1779 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 1780 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 1781 put4byte(&data[36 + 4*4], pBt->autoVacuum); 1782 put4byte(&data[36 + 7*4], pBt->incrVacuum); 1783 #endif 1784 return SQLITE_OK; 1785 } 1786 1787 /* 1788 ** Attempt to start a new transaction. A write-transaction 1789 ** is started if the second argument is nonzero, otherwise a read- 1790 ** transaction. If the second argument is 2 or more and exclusive 1791 ** transaction is started, meaning that no other process is allowed 1792 ** to access the database. A preexisting transaction may not be 1793 ** upgraded to exclusive by calling this routine a second time - the 1794 ** exclusivity flag only works for a new transaction. 1795 ** 1796 ** A write-transaction must be started before attempting any 1797 ** changes to the database. None of the following routines 1798 ** will work unless a transaction is started first: 1799 ** 1800 ** sqlite3BtreeCreateTable() 1801 ** sqlite3BtreeCreateIndex() 1802 ** sqlite3BtreeClearTable() 1803 ** sqlite3BtreeDropTable() 1804 ** sqlite3BtreeInsert() 1805 ** sqlite3BtreeDelete() 1806 ** sqlite3BtreeUpdateMeta() 1807 ** 1808 ** If an initial attempt to acquire the lock fails because of lock contention 1809 ** and the database was previously unlocked, then invoke the busy handler 1810 ** if there is one. But if there was previously a read-lock, do not 1811 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is 1812 ** returned when there is already a read-lock in order to avoid a deadlock. 1813 ** 1814 ** Suppose there are two processes A and B. A has a read lock and B has 1815 ** a reserved lock. B tries to promote to exclusive but is blocked because 1816 ** of A's read lock. A tries to promote to reserved but is blocked by B. 1817 ** One or the other of the two processes must give way or there can be 1818 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback 1819 ** when A already has a read lock, we encourage A to give up and let B 1820 ** proceed. 1821 */ 1822 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ 1823 BtShared *pBt = p->pBt; 1824 int rc = SQLITE_OK; 1825 1826 sqlite3BtreeEnter(p); 1827 pBt->db = p->db; 1828 btreeIntegrity(p); 1829 1830 /* If the btree is already in a write-transaction, or it 1831 ** is already in a read-transaction and a read-transaction 1832 ** is requested, this is a no-op. 1833 */ 1834 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 1835 goto trans_begun; 1836 } 1837 1838 /* Write transactions are not possible on a read-only database */ 1839 if( pBt->readOnly && wrflag ){ 1840 rc = SQLITE_READONLY; 1841 goto trans_begun; 1842 } 1843 1844 /* If another database handle has already opened a write transaction 1845 ** on this shared-btree structure and a second write transaction is 1846 ** requested, return SQLITE_BUSY. 1847 */ 1848 if( pBt->inTransaction==TRANS_WRITE && wrflag ){ 1849 rc = SQLITE_BUSY; 1850 goto trans_begun; 1851 } 1852 1853 do { 1854 if( pBt->pPage1==0 ){ 1855 rc = lockBtree(pBt); 1856 } 1857 1858 if( rc==SQLITE_OK && wrflag ){ 1859 if( pBt->readOnly ){ 1860 rc = SQLITE_READONLY; 1861 }else{ 1862 rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1); 1863 if( rc==SQLITE_OK ){ 1864 rc = newDatabase(pBt); 1865 } 1866 } 1867 } 1868 1869 if( rc==SQLITE_OK ){ 1870 if( wrflag ) pBt->inStmt = 0; 1871 }else{ 1872 unlockBtreeIfUnused(pBt); 1873 } 1874 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 1875 sqlite3BtreeInvokeBusyHandler(pBt, 0) ); 1876 1877 if( rc==SQLITE_OK ){ 1878 if( p->inTrans==TRANS_NONE ){ 1879 pBt->nTransaction++; 1880 } 1881 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 1882 if( p->inTrans>pBt->inTransaction ){ 1883 pBt->inTransaction = p->inTrans; 1884 } 1885 } 1886 1887 1888 trans_begun: 1889 btreeIntegrity(p); 1890 sqlite3BtreeLeave(p); 1891 return rc; 1892 } 1893 1894 #ifndef SQLITE_OMIT_AUTOVACUUM 1895 1896 /* 1897 ** Set the pointer-map entries for all children of page pPage. Also, if 1898 ** pPage contains cells that point to overflow pages, set the pointer 1899 ** map entries for the overflow pages as well. 1900 */ 1901 static int setChildPtrmaps(MemPage *pPage){ 1902 int i; /* Counter variable */ 1903 int nCell; /* Number of cells in page pPage */ 1904 int rc; /* Return code */ 1905 BtShared *pBt = pPage->pBt; 1906 int isInitOrig = pPage->isInit; 1907 Pgno pgno = pPage->pgno; 1908 1909 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1910 rc = sqlite3BtreeInitPage(pPage, pPage->pParent); 1911 if( rc!=SQLITE_OK ){ 1912 goto set_child_ptrmaps_out; 1913 } 1914 nCell = pPage->nCell; 1915 1916 for(i=0; i<nCell; i++){ 1917 u8 *pCell = findCell(pPage, i); 1918 1919 rc = ptrmapPutOvflPtr(pPage, pCell); 1920 if( rc!=SQLITE_OK ){ 1921 goto set_child_ptrmaps_out; 1922 } 1923 1924 if( !pPage->leaf ){ 1925 Pgno childPgno = get4byte(pCell); 1926 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno); 1927 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out; 1928 } 1929 } 1930 1931 if( !pPage->leaf ){ 1932 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 1933 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno); 1934 } 1935 1936 set_child_ptrmaps_out: 1937 pPage->isInit = isInitOrig; 1938 return rc; 1939 } 1940 1941 /* 1942 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow 1943 ** page, is a pointer to page iFrom. Modify this pointer so that it points to 1944 ** iTo. Parameter eType describes the type of pointer to be modified, as 1945 ** follows: 1946 ** 1947 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 1948 ** page of pPage. 1949 ** 1950 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 1951 ** page pointed to by one of the cells on pPage. 1952 ** 1953 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 1954 ** overflow page in the list. 1955 */ 1956 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 1957 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1958 if( eType==PTRMAP_OVERFLOW2 ){ 1959 /* The pointer is always the first 4 bytes of the page in this case. */ 1960 if( get4byte(pPage->aData)!=iFrom ){ 1961 return SQLITE_CORRUPT_BKPT; 1962 } 1963 put4byte(pPage->aData, iTo); 1964 }else{ 1965 int isInitOrig = pPage->isInit; 1966 int i; 1967 int nCell; 1968 1969 sqlite3BtreeInitPage(pPage, 0); 1970 nCell = pPage->nCell; 1971 1972 for(i=0; i<nCell; i++){ 1973 u8 *pCell = findCell(pPage, i); 1974 if( eType==PTRMAP_OVERFLOW1 ){ 1975 CellInfo info; 1976 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 1977 if( info.iOverflow ){ 1978 if( iFrom==get4byte(&pCell[info.iOverflow]) ){ 1979 put4byte(&pCell[info.iOverflow], iTo); 1980 break; 1981 } 1982 } 1983 }else{ 1984 if( get4byte(pCell)==iFrom ){ 1985 put4byte(pCell, iTo); 1986 break; 1987 } 1988 } 1989 } 1990 1991 if( i==nCell ){ 1992 if( eType!=PTRMAP_BTREE || 1993 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 1994 return SQLITE_CORRUPT_BKPT; 1995 } 1996 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 1997 } 1998 1999 pPage->isInit = isInitOrig; 2000 } 2001 return SQLITE_OK; 2002 } 2003 2004 2005 /* 2006 ** Move the open database page pDbPage to location iFreePage in the 2007 ** database. The pDbPage reference remains valid. 2008 */ 2009 static int relocatePage( 2010 BtShared *pBt, /* Btree */ 2011 MemPage *pDbPage, /* Open page to move */ 2012 u8 eType, /* Pointer map 'type' entry for pDbPage */ 2013 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 2014 Pgno iFreePage /* The location to move pDbPage to */ 2015 ){ 2016 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 2017 Pgno iDbPage = pDbPage->pgno; 2018 Pager *pPager = pBt->pPager; 2019 int rc; 2020 2021 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 2022 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 2023 assert( sqlite3_mutex_held(pBt->mutex) ); 2024 assert( pDbPage->pBt==pBt ); 2025 2026 /* Move page iDbPage from its current location to page number iFreePage */ 2027 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 2028 iDbPage, iFreePage, iPtrPage, eType)); 2029 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage); 2030 if( rc!=SQLITE_OK ){ 2031 return rc; 2032 } 2033 pDbPage->pgno = iFreePage; 2034 2035 /* If pDbPage was a btree-page, then it may have child pages and/or cells 2036 ** that point to overflow pages. The pointer map entries for all these 2037 ** pages need to be changed. 2038 ** 2039 ** If pDbPage is an overflow page, then the first 4 bytes may store a 2040 ** pointer to a subsequent overflow page. If this is the case, then 2041 ** the pointer map needs to be updated for the subsequent overflow page. 2042 */ 2043 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 2044 rc = setChildPtrmaps(pDbPage); 2045 if( rc!=SQLITE_OK ){ 2046 return rc; 2047 } 2048 }else{ 2049 Pgno nextOvfl = get4byte(pDbPage->aData); 2050 if( nextOvfl!=0 ){ 2051 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage); 2052 if( rc!=SQLITE_OK ){ 2053 return rc; 2054 } 2055 } 2056 } 2057 2058 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 2059 ** that it points at iFreePage. Also fix the pointer map entry for 2060 ** iPtrPage. 2061 */ 2062 if( eType!=PTRMAP_ROOTPAGE ){ 2063 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 2064 if( rc!=SQLITE_OK ){ 2065 return rc; 2066 } 2067 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 2068 if( rc!=SQLITE_OK ){ 2069 releasePage(pPtrPage); 2070 return rc; 2071 } 2072 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 2073 releasePage(pPtrPage); 2074 if( rc==SQLITE_OK ){ 2075 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage); 2076 } 2077 } 2078 return rc; 2079 } 2080 2081 /* Forward declaration required by incrVacuumStep(). */ 2082 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 2083 2084 /* 2085 ** Perform a single step of an incremental-vacuum. If successful, 2086 ** return SQLITE_OK. If there is no work to do (and therefore no 2087 ** point in calling this function again), return SQLITE_DONE. 2088 ** 2089 ** More specificly, this function attempts to re-organize the 2090 ** database so that the last page of the file currently in use 2091 ** is no longer in use. 2092 ** 2093 ** If the nFin parameter is non-zero, the implementation assumes 2094 ** that the caller will keep calling incrVacuumStep() until 2095 ** it returns SQLITE_DONE or an error, and that nFin is the 2096 ** number of pages the database file will contain after this 2097 ** process is complete. 2098 */ 2099 static int incrVacuumStep(BtShared *pBt, Pgno nFin){ 2100 Pgno iLastPg; /* Last page in the database */ 2101 Pgno nFreeList; /* Number of pages still on the free-list */ 2102 2103 assert( sqlite3_mutex_held(pBt->mutex) ); 2104 iLastPg = pBt->nTrunc; 2105 if( iLastPg==0 ){ 2106 iLastPg = sqlite3PagerPagecount(pBt->pPager); 2107 } 2108 2109 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ 2110 int rc; 2111 u8 eType; 2112 Pgno iPtrPage; 2113 2114 nFreeList = get4byte(&pBt->pPage1->aData[36]); 2115 if( nFreeList==0 || nFin==iLastPg ){ 2116 return SQLITE_DONE; 2117 } 2118 2119 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); 2120 if( rc!=SQLITE_OK ){ 2121 return rc; 2122 } 2123 if( eType==PTRMAP_ROOTPAGE ){ 2124 return SQLITE_CORRUPT_BKPT; 2125 } 2126 2127 if( eType==PTRMAP_FREEPAGE ){ 2128 if( nFin==0 ){ 2129 /* Remove the page from the files free-list. This is not required 2130 ** if nFin is non-zero. In that case, the free-list will be 2131 ** truncated to zero after this function returns, so it doesn't 2132 ** matter if it still contains some garbage entries. 2133 */ 2134 Pgno iFreePg; 2135 MemPage *pFreePg; 2136 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1); 2137 if( rc!=SQLITE_OK ){ 2138 return rc; 2139 } 2140 assert( iFreePg==iLastPg ); 2141 releasePage(pFreePg); 2142 } 2143 } else { 2144 Pgno iFreePg; /* Index of free page to move pLastPg to */ 2145 MemPage *pLastPg; 2146 2147 rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0); 2148 if( rc!=SQLITE_OK ){ 2149 return rc; 2150 } 2151 2152 /* If nFin is zero, this loop runs exactly once and page pLastPg 2153 ** is swapped with the first free page pulled off the free list. 2154 ** 2155 ** On the other hand, if nFin is greater than zero, then keep 2156 ** looping until a free-page located within the first nFin pages 2157 ** of the file is found. 2158 */ 2159 do { 2160 MemPage *pFreePg; 2161 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0); 2162 if( rc!=SQLITE_OK ){ 2163 releasePage(pLastPg); 2164 return rc; 2165 } 2166 releasePage(pFreePg); 2167 }while( nFin!=0 && iFreePg>nFin ); 2168 assert( iFreePg<iLastPg ); 2169 2170 rc = sqlite3PagerWrite(pLastPg->pDbPage); 2171 if( rc==SQLITE_OK ){ 2172 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg); 2173 } 2174 releasePage(pLastPg); 2175 if( rc!=SQLITE_OK ){ 2176 return rc; 2177 } 2178 } 2179 } 2180 2181 pBt->nTrunc = iLastPg - 1; 2182 while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){ 2183 pBt->nTrunc--; 2184 } 2185 return SQLITE_OK; 2186 } 2187 2188 /* 2189 ** A write-transaction must be opened before calling this function. 2190 ** It performs a single unit of work towards an incremental vacuum. 2191 ** 2192 ** If the incremental vacuum is finished after this function has run, 2193 ** SQLITE_DONE is returned. If it is not finished, but no error occured, 2194 ** SQLITE_OK is returned. Otherwise an SQLite error code. 2195 */ 2196 int sqlite3BtreeIncrVacuum(Btree *p){ 2197 int rc; 2198 BtShared *pBt = p->pBt; 2199 2200 sqlite3BtreeEnter(p); 2201 pBt->db = p->db; 2202 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); 2203 if( !pBt->autoVacuum ){ 2204 rc = SQLITE_DONE; 2205 }else{ 2206 invalidateAllOverflowCache(pBt); 2207 rc = incrVacuumStep(pBt, 0); 2208 } 2209 sqlite3BtreeLeave(p); 2210 return rc; 2211 } 2212 2213 /* 2214 ** This routine is called prior to sqlite3PagerCommit when a transaction 2215 ** is commited for an auto-vacuum database. 2216 ** 2217 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages 2218 ** the database file should be truncated to during the commit process. 2219 ** i.e. the database has been reorganized so that only the first *pnTrunc 2220 ** pages are in use. 2221 */ 2222 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){ 2223 int rc = SQLITE_OK; 2224 Pager *pPager = pBt->pPager; 2225 #ifndef NDEBUG 2226 int nRef = sqlite3PagerRefcount(pPager); 2227 #endif 2228 2229 assert( sqlite3_mutex_held(pBt->mutex) ); 2230 invalidateAllOverflowCache(pBt); 2231 assert(pBt->autoVacuum); 2232 if( !pBt->incrVacuum ){ 2233 Pgno nFin = 0; 2234 2235 if( pBt->nTrunc==0 ){ 2236 Pgno nFree; 2237 Pgno nPtrmap; 2238 const int pgsz = pBt->pageSize; 2239 Pgno nOrig = sqlite3PagerPagecount(pBt->pPager); 2240 2241 if( PTRMAP_ISPAGE(pBt, nOrig) ){ 2242 return SQLITE_CORRUPT_BKPT; 2243 } 2244 if( nOrig==PENDING_BYTE_PAGE(pBt) ){ 2245 nOrig--; 2246 } 2247 nFree = get4byte(&pBt->pPage1->aData[36]); 2248 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5); 2249 nFin = nOrig - nFree - nPtrmap; 2250 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){ 2251 nFin--; 2252 } 2253 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 2254 nFin--; 2255 } 2256 } 2257 2258 while( rc==SQLITE_OK ){ 2259 rc = incrVacuumStep(pBt, nFin); 2260 } 2261 if( rc==SQLITE_DONE ){ 2262 assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc); 2263 rc = SQLITE_OK; 2264 if( pBt->nTrunc ){ 2265 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 2266 put4byte(&pBt->pPage1->aData[32], 0); 2267 put4byte(&pBt->pPage1->aData[36], 0); 2268 pBt->nTrunc = nFin; 2269 } 2270 } 2271 if( rc!=SQLITE_OK ){ 2272 sqlite3PagerRollback(pPager); 2273 } 2274 } 2275 2276 if( rc==SQLITE_OK ){ 2277 *pnTrunc = pBt->nTrunc; 2278 pBt->nTrunc = 0; 2279 } 2280 assert( nRef==sqlite3PagerRefcount(pPager) ); 2281 return rc; 2282 } 2283 2284 #endif 2285 2286 /* 2287 ** This routine does the first phase of a two-phase commit. This routine 2288 ** causes a rollback journal to be created (if it does not already exist) 2289 ** and populated with enough information so that if a power loss occurs 2290 ** the database can be restored to its original state by playing back 2291 ** the journal. Then the contents of the journal are flushed out to 2292 ** the disk. After the journal is safely on oxide, the changes to the 2293 ** database are written into the database file and flushed to oxide. 2294 ** At the end of this call, the rollback journal still exists on the 2295 ** disk and we are still holding all locks, so the transaction has not 2296 ** committed. See sqlite3BtreeCommit() for the second phase of the 2297 ** commit process. 2298 ** 2299 ** This call is a no-op if no write-transaction is currently active on pBt. 2300 ** 2301 ** Otherwise, sync the database file for the btree pBt. zMaster points to 2302 ** the name of a master journal file that should be written into the 2303 ** individual journal file, or is NULL, indicating no master journal file 2304 ** (single database transaction). 2305 ** 2306 ** When this is called, the master journal should already have been 2307 ** created, populated with this journal pointer and synced to disk. 2308 ** 2309 ** Once this is routine has returned, the only thing required to commit 2310 ** the write-transaction for this database file is to delete the journal. 2311 */ 2312 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){ 2313 int rc = SQLITE_OK; 2314 if( p->inTrans==TRANS_WRITE ){ 2315 BtShared *pBt = p->pBt; 2316 Pgno nTrunc = 0; 2317 sqlite3BtreeEnter(p); 2318 pBt->db = p->db; 2319 #ifndef SQLITE_OMIT_AUTOVACUUM 2320 if( pBt->autoVacuum ){ 2321 rc = autoVacuumCommit(pBt, &nTrunc); 2322 if( rc!=SQLITE_OK ){ 2323 sqlite3BtreeLeave(p); 2324 return rc; 2325 } 2326 } 2327 #endif 2328 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc); 2329 sqlite3BtreeLeave(p); 2330 } 2331 return rc; 2332 } 2333 2334 /* 2335 ** Commit the transaction currently in progress. 2336 ** 2337 ** This routine implements the second phase of a 2-phase commit. The 2338 ** sqlite3BtreeSync() routine does the first phase and should be invoked 2339 ** prior to calling this routine. The sqlite3BtreeSync() routine did 2340 ** all the work of writing information out to disk and flushing the 2341 ** contents so that they are written onto the disk platter. All this 2342 ** routine has to do is delete or truncate the rollback journal 2343 ** (which causes the transaction to commit) and drop locks. 2344 ** 2345 ** This will release the write lock on the database file. If there 2346 ** are no active cursors, it also releases the read lock. 2347 */ 2348 int sqlite3BtreeCommitPhaseTwo(Btree *p){ 2349 BtShared *pBt = p->pBt; 2350 2351 sqlite3BtreeEnter(p); 2352 pBt->db = p->db; 2353 btreeIntegrity(p); 2354 2355 /* If the handle has a write-transaction open, commit the shared-btrees 2356 ** transaction and set the shared state to TRANS_READ. 2357 */ 2358 if( p->inTrans==TRANS_WRITE ){ 2359 int rc; 2360 assert( pBt->inTransaction==TRANS_WRITE ); 2361 assert( pBt->nTransaction>0 ); 2362 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 2363 if( rc!=SQLITE_OK ){ 2364 sqlite3BtreeLeave(p); 2365 return rc; 2366 } 2367 pBt->inTransaction = TRANS_READ; 2368 pBt->inStmt = 0; 2369 } 2370 unlockAllTables(p); 2371 2372 /* If the handle has any kind of transaction open, decrement the transaction 2373 ** count of the shared btree. If the transaction count reaches 0, set 2374 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below 2375 ** will unlock the pager. 2376 */ 2377 if( p->inTrans!=TRANS_NONE ){ 2378 pBt->nTransaction--; 2379 if( 0==pBt->nTransaction ){ 2380 pBt->inTransaction = TRANS_NONE; 2381 } 2382 } 2383 2384 /* Set the handles current transaction state to TRANS_NONE and unlock 2385 ** the pager if this call closed the only read or write transaction. 2386 */ 2387 p->inTrans = TRANS_NONE; 2388 unlockBtreeIfUnused(pBt); 2389 2390 btreeIntegrity(p); 2391 sqlite3BtreeLeave(p); 2392 return SQLITE_OK; 2393 } 2394 2395 /* 2396 ** Do both phases of a commit. 2397 */ 2398 int sqlite3BtreeCommit(Btree *p){ 2399 int rc; 2400 sqlite3BtreeEnter(p); 2401 rc = sqlite3BtreeCommitPhaseOne(p, 0); 2402 if( rc==SQLITE_OK ){ 2403 rc = sqlite3BtreeCommitPhaseTwo(p); 2404 } 2405 sqlite3BtreeLeave(p); 2406 return rc; 2407 } 2408 2409 #ifndef NDEBUG 2410 /* 2411 ** Return the number of write-cursors open on this handle. This is for use 2412 ** in assert() expressions, so it is only compiled if NDEBUG is not 2413 ** defined. 2414 ** 2415 ** For the purposes of this routine, a write-cursor is any cursor that 2416 ** is capable of writing to the databse. That means the cursor was 2417 ** originally opened for writing and the cursor has not be disabled 2418 ** by having its state changed to CURSOR_FAULT. 2419 */ 2420 static int countWriteCursors(BtShared *pBt){ 2421 BtCursor *pCur; 2422 int r = 0; 2423 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 2424 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; 2425 } 2426 return r; 2427 } 2428 #endif 2429 2430 /* 2431 ** This routine sets the state to CURSOR_FAULT and the error 2432 ** code to errCode for every cursor on BtShared that pBtree 2433 ** references. 2434 ** 2435 ** Every cursor is tripped, including cursors that belong 2436 ** to other database connections that happen to be sharing 2437 ** the cache with pBtree. 2438 ** 2439 ** This routine gets called when a rollback occurs. 2440 ** All cursors using the same cache must be tripped 2441 ** to prevent them from trying to use the btree after 2442 ** the rollback. The rollback may have deleted tables 2443 ** or moved root pages, so it is not sufficient to 2444 ** save the state of the cursor. The cursor must be 2445 ** invalidated. 2446 */ 2447 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){ 2448 BtCursor *p; 2449 sqlite3BtreeEnter(pBtree); 2450 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 2451 clearCursorPosition(p); 2452 p->eState = CURSOR_FAULT; 2453 p->skip = errCode; 2454 } 2455 sqlite3BtreeLeave(pBtree); 2456 } 2457 2458 /* 2459 ** Rollback the transaction in progress. All cursors will be 2460 ** invalided by this operation. Any attempt to use a cursor 2461 ** that was open at the beginning of this operation will result 2462 ** in an error. 2463 ** 2464 ** This will release the write lock on the database file. If there 2465 ** are no active cursors, it also releases the read lock. 2466 */ 2467 int sqlite3BtreeRollback(Btree *p){ 2468 int rc; 2469 BtShared *pBt = p->pBt; 2470 MemPage *pPage1; 2471 2472 sqlite3BtreeEnter(p); 2473 pBt->db = p->db; 2474 rc = saveAllCursors(pBt, 0, 0); 2475 #ifndef SQLITE_OMIT_SHARED_CACHE 2476 if( rc!=SQLITE_OK ){ 2477 /* This is a horrible situation. An IO or malloc() error occured whilst 2478 ** trying to save cursor positions. If this is an automatic rollback (as 2479 ** the result of a constraint, malloc() failure or IO error) then 2480 ** the cache may be internally inconsistent (not contain valid trees) so 2481 ** we cannot simply return the error to the caller. Instead, abort 2482 ** all queries that may be using any of the cursors that failed to save. 2483 */ 2484 sqlite3BtreeTripAllCursors(p, rc); 2485 } 2486 #endif 2487 btreeIntegrity(p); 2488 unlockAllTables(p); 2489 2490 if( p->inTrans==TRANS_WRITE ){ 2491 int rc2; 2492 2493 #ifndef SQLITE_OMIT_AUTOVACUUM 2494 pBt->nTrunc = 0; 2495 #endif 2496 2497 assert( TRANS_WRITE==pBt->inTransaction ); 2498 rc2 = sqlite3PagerRollback(pBt->pPager); 2499 if( rc2!=SQLITE_OK ){ 2500 rc = rc2; 2501 } 2502 2503 /* The rollback may have destroyed the pPage1->aData value. So 2504 ** call sqlite3BtreeGetPage() on page 1 again to make 2505 ** sure pPage1->aData is set correctly. */ 2506 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 2507 releasePage(pPage1); 2508 } 2509 assert( countWriteCursors(pBt)==0 ); 2510 pBt->inTransaction = TRANS_READ; 2511 } 2512 2513 if( p->inTrans!=TRANS_NONE ){ 2514 assert( pBt->nTransaction>0 ); 2515 pBt->nTransaction--; 2516 if( 0==pBt->nTransaction ){ 2517 pBt->inTransaction = TRANS_NONE; 2518 } 2519 } 2520 2521 p->inTrans = TRANS_NONE; 2522 pBt->inStmt = 0; 2523 unlockBtreeIfUnused(pBt); 2524 2525 btreeIntegrity(p); 2526 sqlite3BtreeLeave(p); 2527 return rc; 2528 } 2529 2530 /* 2531 ** Start a statement subtransaction. The subtransaction can 2532 ** can be rolled back independently of the main transaction. 2533 ** You must start a transaction before starting a subtransaction. 2534 ** The subtransaction is ended automatically if the main transaction 2535 ** commits or rolls back. 2536 ** 2537 ** Only one subtransaction may be active at a time. It is an error to try 2538 ** to start a new subtransaction if another subtransaction is already active. 2539 ** 2540 ** Statement subtransactions are used around individual SQL statements 2541 ** that are contained within a BEGIN...COMMIT block. If a constraint 2542 ** error occurs within the statement, the effect of that one statement 2543 ** can be rolled back without having to rollback the entire transaction. 2544 */ 2545 int sqlite3BtreeBeginStmt(Btree *p){ 2546 int rc; 2547 BtShared *pBt = p->pBt; 2548 sqlite3BtreeEnter(p); 2549 pBt->db = p->db; 2550 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){ 2551 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2552 }else{ 2553 assert( pBt->inTransaction==TRANS_WRITE ); 2554 rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager); 2555 pBt->inStmt = 1; 2556 } 2557 sqlite3BtreeLeave(p); 2558 return rc; 2559 } 2560 2561 2562 /* 2563 ** Commit the statment subtransaction currently in progress. If no 2564 ** subtransaction is active, this is a no-op. 2565 */ 2566 int sqlite3BtreeCommitStmt(Btree *p){ 2567 int rc; 2568 BtShared *pBt = p->pBt; 2569 sqlite3BtreeEnter(p); 2570 pBt->db = p->db; 2571 if( pBt->inStmt && !pBt->readOnly ){ 2572 rc = sqlite3PagerStmtCommit(pBt->pPager); 2573 }else{ 2574 rc = SQLITE_OK; 2575 } 2576 pBt->inStmt = 0; 2577 sqlite3BtreeLeave(p); 2578 return rc; 2579 } 2580 2581 /* 2582 ** Rollback the active statement subtransaction. If no subtransaction 2583 ** is active this routine is a no-op. 2584 ** 2585 ** All cursors will be invalidated by this operation. Any attempt 2586 ** to use a cursor that was open at the beginning of this operation 2587 ** will result in an error. 2588 */ 2589 int sqlite3BtreeRollbackStmt(Btree *p){ 2590 int rc = SQLITE_OK; 2591 BtShared *pBt = p->pBt; 2592 sqlite3BtreeEnter(p); 2593 pBt->db = p->db; 2594 if( pBt->inStmt && !pBt->readOnly ){ 2595 rc = sqlite3PagerStmtRollback(pBt->pPager); 2596 assert( countWriteCursors(pBt)==0 ); 2597 pBt->inStmt = 0; 2598 } 2599 sqlite3BtreeLeave(p); 2600 return rc; 2601 } 2602 2603 /* 2604 ** Default key comparison function to be used if no comparison function 2605 ** is specified on the sqlite3BtreeCursor() call. 2606 */ 2607 static int dfltCompare( 2608 void *NotUsed, /* User data is not used */ 2609 int n1, const void *p1, /* First key to compare */ 2610 int n2, const void *p2 /* Second key to compare */ 2611 ){ 2612 int c; 2613 c = memcmp(p1, p2, n1<n2 ? n1 : n2); 2614 if( c==0 ){ 2615 c = n1 - n2; 2616 } 2617 return c; 2618 } 2619 2620 /* 2621 ** Create a new cursor for the BTree whose root is on the page 2622 ** iTable. The act of acquiring a cursor gets a read lock on 2623 ** the database file. 2624 ** 2625 ** If wrFlag==0, then the cursor can only be used for reading. 2626 ** If wrFlag==1, then the cursor can be used for reading or for 2627 ** writing if other conditions for writing are also met. These 2628 ** are the conditions that must be met in order for writing to 2629 ** be allowed: 2630 ** 2631 ** 1: The cursor must have been opened with wrFlag==1 2632 ** 2633 ** 2: Other database connections that share the same pager cache 2634 ** but which are not in the READ_UNCOMMITTED state may not have 2635 ** cursors open with wrFlag==0 on the same table. Otherwise 2636 ** the changes made by this write cursor would be visible to 2637 ** the read cursors in the other database connection. 2638 ** 2639 ** 3: The database must be writable (not on read-only media) 2640 ** 2641 ** 4: There must be an active transaction. 2642 ** 2643 ** No checking is done to make sure that page iTable really is the 2644 ** root page of a b-tree. If it is not, then the cursor acquired 2645 ** will not work correctly. 2646 ** 2647 ** The comparison function must be logically the same for every cursor 2648 ** on a particular table. Changing the comparison function will result 2649 ** in incorrect operations. If the comparison function is NULL, a 2650 ** default comparison function is used. The comparison function is 2651 ** always ignored for INTKEY tables. 2652 */ 2653 static int btreeCursor( 2654 Btree *p, /* The btree */ 2655 int iTable, /* Root page of table to open */ 2656 int wrFlag, /* 1 to write. 0 read-only */ 2657 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */ 2658 void *pArg, /* First arg to xCompare() */ 2659 BtCursor **ppCur /* Write new cursor here */ 2660 ){ 2661 int rc; 2662 BtCursor *pCur; 2663 BtShared *pBt = p->pBt; 2664 2665 assert( sqlite3BtreeHoldsMutex(p) ); 2666 *ppCur = 0; 2667 if( wrFlag ){ 2668 if( pBt->readOnly ){ 2669 return SQLITE_READONLY; 2670 } 2671 if( checkReadLocks(p, iTable, 0) ){ 2672 return SQLITE_LOCKED; 2673 } 2674 } 2675 2676 if( pBt->pPage1==0 ){ 2677 rc = lockBtreeWithRetry(p); 2678 if( rc!=SQLITE_OK ){ 2679 return rc; 2680 } 2681 if( pBt->readOnly && wrFlag ){ 2682 return SQLITE_READONLY; 2683 } 2684 } 2685 pCur = sqlite3MallocZero( sizeof(*pCur) ); 2686 if( pCur==0 ){ 2687 rc = SQLITE_NOMEM; 2688 goto create_cursor_exception; 2689 } 2690 pCur->pgnoRoot = (Pgno)iTable; 2691 if( iTable==1 && sqlite3PagerPagecount(pBt->pPager)==0 ){ 2692 rc = SQLITE_EMPTY; 2693 goto create_cursor_exception; 2694 } 2695 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0); 2696 if( rc!=SQLITE_OK ){ 2697 goto create_cursor_exception; 2698 } 2699 2700 /* Now that no other errors can occur, finish filling in the BtCursor 2701 ** variables, link the cursor into the BtShared list and set *ppCur (the 2702 ** output argument to this function). 2703 */ 2704 pCur->xCompare = xCmp ? xCmp : dfltCompare; 2705 pCur->pArg = pArg; 2706 pCur->pBtree = p; 2707 pCur->pBt = pBt; 2708 pCur->wrFlag = wrFlag; 2709 pCur->pNext = pBt->pCursor; 2710 if( pCur->pNext ){ 2711 pCur->pNext->pPrev = pCur; 2712 } 2713 pBt->pCursor = pCur; 2714 pCur->eState = CURSOR_INVALID; 2715 *ppCur = pCur; 2716 2717 return SQLITE_OK; 2718 2719 create_cursor_exception: 2720 if( pCur ){ 2721 releasePage(pCur->pPage); 2722 sqlite3_free(pCur); 2723 } 2724 unlockBtreeIfUnused(pBt); 2725 return rc; 2726 } 2727 int sqlite3BtreeCursor( 2728 Btree *p, /* The btree */ 2729 int iTable, /* Root page of table to open */ 2730 int wrFlag, /* 1 to write. 0 read-only */ 2731 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */ 2732 void *pArg, /* First arg to xCompare() */ 2733 BtCursor **ppCur /* Write new cursor here */ 2734 ){ 2735 int rc; 2736 sqlite3BtreeEnter(p); 2737 p->pBt->db = p->db; 2738 rc = btreeCursor(p, iTable, wrFlag, xCmp, pArg, ppCur); 2739 sqlite3BtreeLeave(p); 2740 return rc; 2741 } 2742 2743 2744 /* 2745 ** Close a cursor. The read lock on the database file is released 2746 ** when the last cursor is closed. 2747 */ 2748 int sqlite3BtreeCloseCursor(BtCursor *pCur){ 2749 BtShared *pBt = pCur->pBt; 2750 Btree *pBtree = pCur->pBtree; 2751 2752 sqlite3BtreeEnter(pBtree); 2753 pBt->db = pBtree->db; 2754 clearCursorPosition(pCur); 2755 if( pCur->pPrev ){ 2756 pCur->pPrev->pNext = pCur->pNext; 2757 }else{ 2758 pBt->pCursor = pCur->pNext; 2759 } 2760 if( pCur->pNext ){ 2761 pCur->pNext->pPrev = pCur->pPrev; 2762 } 2763 releasePage(pCur->pPage); 2764 unlockBtreeIfUnused(pBt); 2765 invalidateOverflowCache(pCur); 2766 sqlite3_free(pCur); 2767 sqlite3BtreeLeave(pBtree); 2768 return SQLITE_OK; 2769 } 2770 2771 /* 2772 ** Make a temporary cursor by filling in the fields of pTempCur. 2773 ** The temporary cursor is not on the cursor list for the Btree. 2774 */ 2775 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){ 2776 assert( cursorHoldsMutex(pCur) ); 2777 memcpy(pTempCur, pCur, sizeof(*pCur)); 2778 pTempCur->pNext = 0; 2779 pTempCur->pPrev = 0; 2780 if( pTempCur->pPage ){ 2781 sqlite3PagerRef(pTempCur->pPage->pDbPage); 2782 } 2783 } 2784 2785 /* 2786 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor() 2787 ** function above. 2788 */ 2789 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){ 2790 assert( cursorHoldsMutex(pCur) ); 2791 if( pCur->pPage ){ 2792 sqlite3PagerUnref(pCur->pPage->pDbPage); 2793 } 2794 } 2795 2796 /* 2797 ** Make sure the BtCursor* given in the argument has a valid 2798 ** BtCursor.info structure. If it is not already valid, call 2799 ** sqlite3BtreeParseCell() to fill it in. 2800 ** 2801 ** BtCursor.info is a cache of the information in the current cell. 2802 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell(). 2803 ** 2804 ** 2007-06-25: There is a bug in some versions of MSVC that cause the 2805 ** compiler to crash when getCellInfo() is implemented as a macro. 2806 ** But there is a measureable speed advantage to using the macro on gcc 2807 ** (when less compiler optimizations like -Os or -O0 are used and the 2808 ** compiler is not doing agressive inlining.) So we use a real function 2809 ** for MSVC and a macro for everything else. Ticket #2457. 2810 */ 2811 #ifndef NDEBUG 2812 static void assertCellInfo(BtCursor *pCur){ 2813 CellInfo info; 2814 memset(&info, 0, sizeof(info)); 2815 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &info); 2816 assert( memcmp(&info, &pCur->info, sizeof(info))==0 ); 2817 } 2818 #else 2819 #define assertCellInfo(x) 2820 #endif 2821 #ifdef _MSC_VER 2822 /* Use a real function in MSVC to work around bugs in that compiler. */ 2823 static void getCellInfo(BtCursor *pCur){ 2824 if( pCur->info.nSize==0 ){ 2825 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); 2826 }else{ 2827 assertCellInfo(pCur); 2828 } 2829 } 2830 #else /* if not _MSC_VER */ 2831 /* Use a macro in all other compilers so that the function is inlined */ 2832 #define getCellInfo(pCur) \ 2833 if( pCur->info.nSize==0 ){ \ 2834 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); \ 2835 }else{ \ 2836 assertCellInfo(pCur); \ 2837 } 2838 #endif /* _MSC_VER */ 2839 2840 /* 2841 ** Set *pSize to the size of the buffer needed to hold the value of 2842 ** the key for the current entry. If the cursor is not pointing 2843 ** to a valid entry, *pSize is set to 0. 2844 ** 2845 ** For a table with the INTKEY flag set, this routine returns the key 2846 ** itself, not the number of bytes in the key. 2847 */ 2848 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ 2849 int rc; 2850 2851 assert( cursorHoldsMutex(pCur) ); 2852 rc = restoreOrClearCursorPosition(pCur); 2853 if( rc==SQLITE_OK ){ 2854 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); 2855 if( pCur->eState==CURSOR_INVALID ){ 2856 *pSize = 0; 2857 }else{ 2858 getCellInfo(pCur); 2859 *pSize = pCur->info.nKey; 2860 } 2861 } 2862 return rc; 2863 } 2864 2865 /* 2866 ** Set *pSize to the number of bytes of data in the entry the 2867 ** cursor currently points to. Always return SQLITE_OK. 2868 ** Failure is not possible. If the cursor is not currently 2869 ** pointing to an entry (which can happen, for example, if 2870 ** the database is empty) then *pSize is set to 0. 2871 */ 2872 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ 2873 int rc; 2874 2875 assert( cursorHoldsMutex(pCur) ); 2876 rc = restoreOrClearCursorPosition(pCur); 2877 if( rc==SQLITE_OK ){ 2878 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); 2879 if( pCur->eState==CURSOR_INVALID ){ 2880 /* Not pointing at a valid entry - set *pSize to 0. */ 2881 *pSize = 0; 2882 }else{ 2883 getCellInfo(pCur); 2884 *pSize = pCur->info.nData; 2885 } 2886 } 2887 return rc; 2888 } 2889 2890 /* 2891 ** Given the page number of an overflow page in the database (parameter 2892 ** ovfl), this function finds the page number of the next page in the 2893 ** linked list of overflow pages. If possible, it uses the auto-vacuum 2894 ** pointer-map data instead of reading the content of page ovfl to do so. 2895 ** 2896 ** If an error occurs an SQLite error code is returned. Otherwise: 2897 ** 2898 ** Unless pPgnoNext is NULL, the page number of the next overflow 2899 ** page in the linked list is written to *pPgnoNext. If page ovfl 2900 ** is the last page in its linked list, *pPgnoNext is set to zero. 2901 ** 2902 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle 2903 ** for page ovfl. The underlying pager page may have been requested 2904 ** with the noContent flag set, so the page data accessable via 2905 ** this handle may not be trusted. 2906 */ 2907 static int getOverflowPage( 2908 BtShared *pBt, 2909 Pgno ovfl, /* Overflow page */ 2910 MemPage **ppPage, /* OUT: MemPage handle */ 2911 Pgno *pPgnoNext /* OUT: Next overflow page number */ 2912 ){ 2913 Pgno next = 0; 2914 int rc; 2915 2916 assert( sqlite3_mutex_held(pBt->mutex) ); 2917 /* One of these must not be NULL. Otherwise, why call this function? */ 2918 assert(ppPage || pPgnoNext); 2919 2920 /* If pPgnoNext is NULL, then this function is being called to obtain 2921 ** a MemPage* reference only. No page-data is required in this case. 2922 */ 2923 if( !pPgnoNext ){ 2924 return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1); 2925 } 2926 2927 #ifndef SQLITE_OMIT_AUTOVACUUM 2928 /* Try to find the next page in the overflow list using the 2929 ** autovacuum pointer-map pages. Guess that the next page in 2930 ** the overflow list is page number (ovfl+1). If that guess turns 2931 ** out to be wrong, fall back to loading the data of page 2932 ** number ovfl to determine the next page number. 2933 */ 2934 if( pBt->autoVacuum ){ 2935 Pgno pgno; 2936 Pgno iGuess = ovfl+1; 2937 u8 eType; 2938 2939 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 2940 iGuess++; 2941 } 2942 2943 if( iGuess<=sqlite3PagerPagecount(pBt->pPager) ){ 2944 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 2945 if( rc!=SQLITE_OK ){ 2946 return rc; 2947 } 2948 if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 2949 next = iGuess; 2950 } 2951 } 2952 } 2953 #endif 2954 2955 if( next==0 || ppPage ){ 2956 MemPage *pPage = 0; 2957 2958 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0); 2959 assert(rc==SQLITE_OK || pPage==0); 2960 if( next==0 && rc==SQLITE_OK ){ 2961 next = get4byte(pPage->aData); 2962 } 2963 2964 if( ppPage ){ 2965 *ppPage = pPage; 2966 }else{ 2967 releasePage(pPage); 2968 } 2969 } 2970 *pPgnoNext = next; 2971 2972 return rc; 2973 } 2974 2975 /* 2976 ** Copy data from a buffer to a page, or from a page to a buffer. 2977 ** 2978 ** pPayload is a pointer to data stored on database page pDbPage. 2979 ** If argument eOp is false, then nByte bytes of data are copied 2980 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, 2981 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 2982 ** of data are copied from the buffer pBuf to pPayload. 2983 ** 2984 ** SQLITE_OK is returned on success, otherwise an error code. 2985 */ 2986 static int copyPayload( 2987 void *pPayload, /* Pointer to page data */ 2988 void *pBuf, /* Pointer to buffer */ 2989 int nByte, /* Number of bytes to copy */ 2990 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 2991 DbPage *pDbPage /* Page containing pPayload */ 2992 ){ 2993 if( eOp ){ 2994 /* Copy data from buffer to page (a write operation) */ 2995 int rc = sqlite3PagerWrite(pDbPage); 2996 if( rc!=SQLITE_OK ){ 2997 return rc; 2998 } 2999 memcpy(pPayload, pBuf, nByte); 3000 }else{ 3001 /* Copy data from page to buffer (a read operation) */ 3002 memcpy(pBuf, pPayload, nByte); 3003 } 3004 return SQLITE_OK; 3005 } 3006 3007 /* 3008 ** This function is used to read or overwrite payload information 3009 ** for the entry that the pCur cursor is pointing to. If the eOp 3010 ** parameter is 0, this is a read operation (data copied into 3011 ** buffer pBuf). If it is non-zero, a write (data copied from 3012 ** buffer pBuf). 3013 ** 3014 ** A total of "amt" bytes are read or written beginning at "offset". 3015 ** Data is read to or from the buffer pBuf. 3016 ** 3017 ** This routine does not make a distinction between key and data. 3018 ** It just reads or writes bytes from the payload area. Data might 3019 ** appear on the main page or be scattered out on multiple overflow 3020 ** pages. 3021 ** 3022 ** If the BtCursor.isIncrblobHandle flag is set, and the current 3023 ** cursor entry uses one or more overflow pages, this function 3024 ** allocates space for and lazily popluates the overflow page-list 3025 ** cache array (BtCursor.aOverflow). Subsequent calls use this 3026 ** cache to make seeking to the supplied offset more efficient. 3027 ** 3028 ** Once an overflow page-list cache has been allocated, it may be 3029 ** invalidated if some other cursor writes to the same table, or if 3030 ** the cursor is moved to a different row. Additionally, in auto-vacuum 3031 ** mode, the following events may invalidate an overflow page-list cache. 3032 ** 3033 ** * An incremental vacuum, 3034 ** * A commit in auto_vacuum="full" mode, 3035 ** * Creating a table (may require moving an overflow page). 3036 */ 3037 static int accessPayload( 3038 BtCursor *pCur, /* Cursor pointing to entry to read from */ 3039 int offset, /* Begin reading this far into payload */ 3040 int amt, /* Read this many bytes */ 3041 unsigned char *pBuf, /* Write the bytes into this buffer */ 3042 int skipKey, /* offset begins at data if this is true */ 3043 int eOp /* zero to read. non-zero to write. */ 3044 ){ 3045 unsigned char *aPayload; 3046 int rc = SQLITE_OK; 3047 u32 nKey; 3048 int iIdx = 0; 3049 MemPage *pPage = pCur->pPage; /* Btree page of current cursor entry */ 3050 BtShared *pBt; /* Btree this cursor belongs to */ 3051 3052 assert( pPage ); 3053 assert( pCur->eState==CURSOR_VALID ); 3054 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 3055 assert( offset>=0 ); 3056 assert( cursorHoldsMutex(pCur) ); 3057 3058 getCellInfo(pCur); 3059 aPayload = pCur->info.pCell + pCur->info.nHeader; 3060 nKey = (pPage->intKey ? 0 : pCur->info.nKey); 3061 3062 if( skipKey ){ 3063 offset += nKey; 3064 } 3065 if( offset+amt > nKey+pCur->info.nData ){ 3066 /* Trying to read or write past the end of the data is an error */ 3067 return SQLITE_ERROR; 3068 } 3069 3070 /* Check if data must be read/written to/from the btree page itself. */ 3071 if( offset<pCur->info.nLocal ){ 3072 int a = amt; 3073 if( a+offset>pCur->info.nLocal ){ 3074 a = pCur->info.nLocal - offset; 3075 } 3076 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); 3077 offset = 0; 3078 pBuf += a; 3079 amt -= a; 3080 }else{ 3081 offset -= pCur->info.nLocal; 3082 } 3083 3084 pBt = pCur->pBt; 3085 if( rc==SQLITE_OK && amt>0 ){ 3086 const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ 3087 Pgno nextPage; 3088 3089 nextPage = get4byte(&aPayload[pCur->info.nLocal]); 3090 3091 #ifndef SQLITE_OMIT_INCRBLOB 3092 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[] 3093 ** has not been allocated, allocate it now. The array is sized at 3094 ** one entry for each overflow page in the overflow chain. The 3095 ** page number of the first overflow page is stored in aOverflow[0], 3096 ** etc. A value of 0 in the aOverflow[] array means "not yet known" 3097 ** (the cache is lazily populated). 3098 */ 3099 if( pCur->isIncrblobHandle && !pCur->aOverflow ){ 3100 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; 3101 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl); 3102 if( nOvfl && !pCur->aOverflow ){ 3103 rc = SQLITE_NOMEM; 3104 } 3105 } 3106 3107 /* If the overflow page-list cache has been allocated and the 3108 ** entry for the first required overflow page is valid, skip 3109 ** directly to it. 3110 */ 3111 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){ 3112 iIdx = (offset/ovflSize); 3113 nextPage = pCur->aOverflow[iIdx]; 3114 offset = (offset%ovflSize); 3115 } 3116 #endif 3117 3118 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){ 3119 3120 #ifndef SQLITE_OMIT_INCRBLOB 3121 /* If required, populate the overflow page-list cache. */ 3122 if( pCur->aOverflow ){ 3123 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage); 3124 pCur->aOverflow[iIdx] = nextPage; 3125 } 3126 #endif 3127 3128 if( offset>=ovflSize ){ 3129 /* The only reason to read this page is to obtain the page 3130 ** number for the next page in the overflow chain. The page 3131 ** data is not required. So first try to lookup the overflow 3132 ** page-list cache, if any, then fall back to the getOverflowPage() 3133 ** function. 3134 */ 3135 #ifndef SQLITE_OMIT_INCRBLOB 3136 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){ 3137 nextPage = pCur->aOverflow[iIdx+1]; 3138 } else 3139 #endif 3140 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); 3141 offset -= ovflSize; 3142 }else{ 3143 /* Need to read this page properly. It contains some of the 3144 ** range of data that is being read (eOp==0) or written (eOp!=0). 3145 */ 3146 DbPage *pDbPage; 3147 int a = amt; 3148 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage); 3149 if( rc==SQLITE_OK ){ 3150 aPayload = sqlite3PagerGetData(pDbPage); 3151 nextPage = get4byte(aPayload); 3152 if( a + offset > ovflSize ){ 3153 a = ovflSize - offset; 3154 } 3155 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); 3156 sqlite3PagerUnref(pDbPage); 3157 offset = 0; 3158 amt -= a; 3159 pBuf += a; 3160 } 3161 } 3162 } 3163 } 3164 3165 if( rc==SQLITE_OK && amt>0 ){ 3166 return SQLITE_CORRUPT_BKPT; 3167 } 3168 return rc; 3169 } 3170 3171 /* 3172 ** Read part of the key associated with cursor pCur. Exactly 3173 ** "amt" bytes will be transfered into pBuf[]. The transfer 3174 ** begins at "offset". 3175 ** 3176 ** Return SQLITE_OK on success or an error code if anything goes 3177 ** wrong. An error is returned if "offset+amt" is larger than 3178 ** the available payload. 3179 */ 3180 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3181 int rc; 3182 3183 assert( cursorHoldsMutex(pCur) ); 3184 rc = restoreOrClearCursorPosition(pCur); 3185 if( rc==SQLITE_OK ){ 3186 assert( pCur->eState==CURSOR_VALID ); 3187 assert( pCur->pPage!=0 ); 3188 if( pCur->pPage->intKey ){ 3189 return SQLITE_CORRUPT_BKPT; 3190 } 3191 assert( pCur->pPage->intKey==0 ); 3192 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 3193 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0); 3194 } 3195 return rc; 3196 } 3197 3198 /* 3199 ** Read part of the data associated with cursor pCur. Exactly 3200 ** "amt" bytes will be transfered into pBuf[]. The transfer 3201 ** begins at "offset". 3202 ** 3203 ** Return SQLITE_OK on success or an error code if anything goes 3204 ** wrong. An error is returned if "offset+amt" is larger than 3205 ** the available payload. 3206 */ 3207 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3208 int rc; 3209 3210 assert( cursorHoldsMutex(pCur) ); 3211 rc = restoreOrClearCursorPosition(pCur); 3212 if( rc==SQLITE_OK ){ 3213 assert( pCur->eState==CURSOR_VALID ); 3214 assert( pCur->pPage!=0 ); 3215 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 3216 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0); 3217 } 3218 return rc; 3219 } 3220 3221 /* 3222 ** Return a pointer to payload information from the entry that the 3223 ** pCur cursor is pointing to. The pointer is to the beginning of 3224 ** the key if skipKey==0 and it points to the beginning of data if 3225 ** skipKey==1. The number of bytes of available key/data is written 3226 ** into *pAmt. If *pAmt==0, then the value returned will not be 3227 ** a valid pointer. 3228 ** 3229 ** This routine is an optimization. It is common for the entire key 3230 ** and data to fit on the local page and for there to be no overflow 3231 ** pages. When that is so, this routine can be used to access the 3232 ** key and data without making a copy. If the key and/or data spills 3233 ** onto overflow pages, then accessPayload() must be used to reassembly 3234 ** the key/data and copy it into a preallocated buffer. 3235 ** 3236 ** The pointer returned by this routine looks directly into the cached 3237 ** page of the database. The data might change or move the next time 3238 ** any btree routine is called. 3239 */ 3240 static const unsigned char *fetchPayload( 3241 BtCursor *pCur, /* Cursor pointing to entry to read from */ 3242 int *pAmt, /* Write the number of available bytes here */ 3243 int skipKey /* read beginning at data if this is true */ 3244 ){ 3245 unsigned char *aPayload; 3246 MemPage *pPage; 3247 u32 nKey; 3248 int nLocal; 3249 3250 assert( pCur!=0 && pCur->pPage!=0 ); 3251 assert( pCur->eState==CURSOR_VALID ); 3252 assert( cursorHoldsMutex(pCur) ); 3253 pPage = pCur->pPage; 3254 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 3255 getCellInfo(pCur); 3256 aPayload = pCur->info.pCell; 3257 aPayload += pCur->info.nHeader; 3258 if( pPage->intKey ){ 3259 nKey = 0; 3260 }else{ 3261 nKey = pCur->info.nKey; 3262 } 3263 if( skipKey ){ 3264 aPayload += nKey; 3265 nLocal = pCur->info.nLocal - nKey; 3266 }else{ 3267 nLocal = pCur->info.nLocal; 3268 if( nLocal>nKey ){ 3269 nLocal = nKey; 3270 } 3271 } 3272 *pAmt = nLocal; 3273 return aPayload; 3274 } 3275 3276 3277 /* 3278 ** For the entry that cursor pCur is point to, return as 3279 ** many bytes of the key or data as are available on the local 3280 ** b-tree page. Write the number of available bytes into *pAmt. 3281 ** 3282 ** The pointer returned is ephemeral. The key/data may move 3283 ** or be destroyed on the next call to any Btree routine, 3284 ** including calls from other threads against the same cache. 3285 ** Hence, a mutex on the BtShared should be held prior to calling 3286 ** this routine. 3287 ** 3288 ** These routines is used to get quick access to key and data 3289 ** in the common case where no overflow pages are used. 3290 */ 3291 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){ 3292 assert( cursorHoldsMutex(pCur) ); 3293 if( pCur->eState==CURSOR_VALID ){ 3294 return (const void*)fetchPayload(pCur, pAmt, 0); 3295 } 3296 return 0; 3297 } 3298 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){ 3299 assert( cursorHoldsMutex(pCur) ); 3300 if( pCur->eState==CURSOR_VALID ){ 3301 return (const void*)fetchPayload(pCur, pAmt, 1); 3302 } 3303 return 0; 3304 } 3305 3306 3307 /* 3308 ** Move the cursor down to a new child page. The newPgno argument is the 3309 ** page number of the child page to move to. 3310 */ 3311 static int moveToChild(BtCursor *pCur, u32 newPgno){ 3312 int rc; 3313 MemPage *pNewPage; 3314 MemPage *pOldPage; 3315 BtShared *pBt = pCur->pBt; 3316 3317 assert( cursorHoldsMutex(pCur) ); 3318 assert( pCur->eState==CURSOR_VALID ); 3319 rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage); 3320 if( rc ) return rc; 3321 pNewPage->idxParent = pCur->idx; 3322 pOldPage = pCur->pPage; 3323 pOldPage->idxShift = 0; 3324 releasePage(pOldPage); 3325 pCur->pPage = pNewPage; 3326 pCur->idx = 0; 3327 pCur->info.nSize = 0; 3328 if( pNewPage->nCell<1 ){ 3329 return SQLITE_CORRUPT_BKPT; 3330 } 3331 return SQLITE_OK; 3332 } 3333 3334 /* 3335 ** Return true if the page is the virtual root of its table. 3336 ** 3337 ** The virtual root page is the root page for most tables. But 3338 ** for the table rooted on page 1, sometime the real root page 3339 ** is empty except for the right-pointer. In such cases the 3340 ** virtual root page is the page that the right-pointer of page 3341 ** 1 is pointing to. 3342 */ 3343 int sqlite3BtreeIsRootPage(MemPage *pPage){ 3344 MemPage *pParent; 3345 3346 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 3347 pParent = pPage->pParent; 3348 if( pParent==0 ) return 1; 3349 if( pParent->pgno>1 ) return 0; 3350 if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1; 3351 return 0; 3352 } 3353 3354 /* 3355 ** Move the cursor up to the parent page. 3356 ** 3357 ** pCur->idx is set to the cell index that contains the pointer 3358 ** to the page we are coming from. If we are coming from the 3359 ** right-most child page then pCur->idx is set to one more than 3360 ** the largest cell index. 3361 */ 3362 void sqlite3BtreeMoveToParent(BtCursor *pCur){ 3363 MemPage *pParent; 3364 MemPage *pPage; 3365 int idxParent; 3366 3367 assert( cursorHoldsMutex(pCur) ); 3368 assert( pCur->eState==CURSOR_VALID ); 3369 pPage = pCur->pPage; 3370 assert( pPage!=0 ); 3371 assert( !sqlite3BtreeIsRootPage(pPage) ); 3372 pParent = pPage->pParent; 3373 assert( pParent!=0 ); 3374 idxParent = pPage->idxParent; 3375 sqlite3PagerRef(pParent->pDbPage); 3376 releasePage(pPage); 3377 pCur->pPage = pParent; 3378 pCur->info.nSize = 0; 3379 assert( pParent->idxShift==0 ); 3380 pCur->idx = idxParent; 3381 } 3382 3383 /* 3384 ** Move the cursor to the root page 3385 */ 3386 static int moveToRoot(BtCursor *pCur){ 3387 MemPage *pRoot; 3388 int rc = SQLITE_OK; 3389 Btree *p = pCur->pBtree; 3390 BtShared *pBt = p->pBt; 3391 3392 assert( cursorHoldsMutex(pCur) ); 3393 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 3394 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 3395 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 3396 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 3397 if( pCur->eState==CURSOR_FAULT ){ 3398 return pCur->skip; 3399 } 3400 clearCursorPosition(pCur); 3401 } 3402 pRoot = pCur->pPage; 3403 if( pRoot && pRoot->pgno==pCur->pgnoRoot ){ 3404 assert( pRoot->isInit ); 3405 }else{ 3406 if( 3407 SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0)) 3408 ){ 3409 pCur->eState = CURSOR_INVALID; 3410 return rc; 3411 } 3412 releasePage(pCur->pPage); 3413 pCur->pPage = pRoot; 3414 } 3415 pCur->idx = 0; 3416 pCur->info.nSize = 0; 3417 if( pRoot->nCell==0 && !pRoot->leaf ){ 3418 Pgno subpage; 3419 assert( pRoot->pgno==1 ); 3420 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 3421 assert( subpage>0 ); 3422 pCur->eState = CURSOR_VALID; 3423 rc = moveToChild(pCur, subpage); 3424 } 3425 pCur->eState = ((pCur->pPage->nCell>0)?CURSOR_VALID:CURSOR_INVALID); 3426 return rc; 3427 } 3428 3429 /* 3430 ** Move the cursor down to the left-most leaf entry beneath the 3431 ** entry to which it is currently pointing. 3432 ** 3433 ** The left-most leaf is the one with the smallest key - the first 3434 ** in ascending order. 3435 */ 3436 static int moveToLeftmost(BtCursor *pCur){ 3437 Pgno pgno; 3438 int rc = SQLITE_OK; 3439 MemPage *pPage; 3440 3441 assert( cursorHoldsMutex(pCur) ); 3442 assert( pCur->eState==CURSOR_VALID ); 3443 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ 3444 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 3445 pgno = get4byte(findCell(pPage, pCur->idx)); 3446 rc = moveToChild(pCur, pgno); 3447 } 3448 return rc; 3449 } 3450 3451 /* 3452 ** Move the cursor down to the right-most leaf entry beneath the 3453 ** page to which it is currently pointing. Notice the difference 3454 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 3455 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 3456 ** finds the right-most entry beneath the *page*. 3457 ** 3458 ** The right-most entry is the one with the largest key - the last 3459 ** key in ascending order. 3460 */ 3461 static int moveToRightmost(BtCursor *pCur){ 3462 Pgno pgno; 3463 int rc = SQLITE_OK; 3464 MemPage *pPage; 3465 3466 assert( cursorHoldsMutex(pCur) ); 3467 assert( pCur->eState==CURSOR_VALID ); 3468 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ 3469 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 3470 pCur->idx = pPage->nCell; 3471 rc = moveToChild(pCur, pgno); 3472 } 3473 if( rc==SQLITE_OK ){ 3474 pCur->idx = pPage->nCell - 1; 3475 pCur->info.nSize = 0; 3476 } 3477 return SQLITE_OK; 3478 } 3479 3480 /* Move the cursor to the first entry in the table. Return SQLITE_OK 3481 ** on success. Set *pRes to 0 if the cursor actually points to something 3482 ** or set *pRes to 1 if the table is empty. 3483 */ 3484 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 3485 int rc; 3486 3487 assert( cursorHoldsMutex(pCur) ); 3488 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 3489 rc = moveToRoot(pCur); 3490 if( rc==SQLITE_OK ){ 3491 if( pCur->eState==CURSOR_INVALID ){ 3492 assert( pCur->pPage->nCell==0 ); 3493 *pRes = 1; 3494 rc = SQLITE_OK; 3495 }else{ 3496 assert( pCur->pPage->nCell>0 ); 3497 *pRes = 0; 3498 rc = moveToLeftmost(pCur); 3499 } 3500 } 3501 return rc; 3502 } 3503 3504 /* Move the cursor to the last entry in the table. Return SQLITE_OK 3505 ** on success. Set *pRes to 0 if the cursor actually points to something 3506 ** or set *pRes to 1 if the table is empty. 3507 */ 3508 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 3509 int rc; 3510 3511 assert( cursorHoldsMutex(pCur) ); 3512 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 3513 rc = moveToRoot(pCur); 3514 if( rc==SQLITE_OK ){ 3515 if( CURSOR_INVALID==pCur->eState ){ 3516 assert( pCur->pPage->nCell==0 ); 3517 *pRes = 1; 3518 }else{ 3519 assert( pCur->eState==CURSOR_VALID ); 3520 *pRes = 0; 3521 rc = moveToRightmost(pCur); 3522 } 3523 } 3524 return rc; 3525 } 3526 3527 /* Move the cursor so that it points to an entry near pKey/nKey. 3528 ** Return a success code. 3529 ** 3530 ** For INTKEY tables, only the nKey parameter is used. pKey is 3531 ** ignored. For other tables, nKey is the number of bytes of data 3532 ** in pKey. The comparison function specified when the cursor was 3533 ** created is used to compare keys. 3534 ** 3535 ** If an exact match is not found, then the cursor is always 3536 ** left pointing at a leaf page which would hold the entry if it 3537 ** were present. The cursor might point to an entry that comes 3538 ** before or after the key. 3539 ** 3540 ** The result of comparing the key with the entry to which the 3541 ** cursor is written to *pRes if pRes!=NULL. The meaning of 3542 ** this value is as follows: 3543 ** 3544 ** *pRes<0 The cursor is left pointing at an entry that 3545 ** is smaller than pKey or if the table is empty 3546 ** and the cursor is therefore left point to nothing. 3547 ** 3548 ** *pRes==0 The cursor is left pointing at an entry that 3549 ** exactly matches pKey. 3550 ** 3551 ** *pRes>0 The cursor is left pointing at an entry that 3552 ** is larger than pKey. 3553 ** 3554 */ 3555 int sqlite3BtreeMoveto( 3556 BtCursor *pCur, /* The cursor to be moved */ 3557 const void *pKey, /* The key content for indices. Not used by tables */ 3558 i64 nKey, /* Size of pKey. Or the key for tables */ 3559 int biasRight, /* If true, bias the search to the high end */ 3560 int *pRes /* Search result flag */ 3561 ){ 3562 int rc; 3563 3564 assert( cursorHoldsMutex(pCur) ); 3565 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 3566 rc = moveToRoot(pCur); 3567 if( rc ){ 3568 return rc; 3569 } 3570 assert( pCur->pPage ); 3571 assert( pCur->pPage->isInit ); 3572 if( pCur->eState==CURSOR_INVALID ){ 3573 *pRes = -1; 3574 assert( pCur->pPage->nCell==0 ); 3575 return SQLITE_OK; 3576 } 3577 for(;;){ 3578 int lwr, upr; 3579 Pgno chldPg; 3580 MemPage *pPage = pCur->pPage; 3581 int c = -1; /* pRes return if table is empty must be -1 */ 3582 lwr = 0; 3583 upr = pPage->nCell-1; 3584 if( !pPage->intKey && pKey==0 ){ 3585 return SQLITE_CORRUPT_BKPT; 3586 } 3587 if( biasRight ){ 3588 pCur->idx = upr; 3589 }else{ 3590 pCur->idx = (upr+lwr)/2; 3591 } 3592 if( lwr<=upr ) for(;;){ 3593 void *pCellKey; 3594 i64 nCellKey; 3595 pCur->info.nSize = 0; 3596 if( pPage->intKey ){ 3597 u8 *pCell; 3598 pCell = findCell(pPage, pCur->idx) + pPage->childPtrSize; 3599 if( pPage->hasData ){ 3600 u32 dummy; 3601 pCell += getVarint32(pCell, &dummy); 3602 } 3603 getVarint(pCell, (u64 *)&nCellKey); 3604 if( nCellKey<nKey ){ 3605 c = -1; 3606 }else if( nCellKey>nKey ){ 3607 c = +1; 3608 }else{ 3609 c = 0; 3610 } 3611 }else{ 3612 int available; 3613 pCellKey = (void *)fetchPayload(pCur, &available, 0); 3614 nCellKey = pCur->info.nKey; 3615 if( available>=nCellKey ){ 3616 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey); 3617 }else{ 3618 pCellKey = sqlite3_malloc( nCellKey ); 3619 if( pCellKey==0 ) return SQLITE_NOMEM; 3620 rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey); 3621 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey); 3622 sqlite3_free(pCellKey); 3623 if( rc ){ 3624 return rc; 3625 } 3626 } 3627 } 3628 if( c==0 ){ 3629 if( pPage->leafData && !pPage->leaf ){ 3630 lwr = pCur->idx; 3631 upr = lwr - 1; 3632 break; 3633 }else{ 3634 if( pRes ) *pRes = 0; 3635 return SQLITE_OK; 3636 } 3637 } 3638 if( c<0 ){ 3639 lwr = pCur->idx+1; 3640 }else{ 3641 upr = pCur->idx-1; 3642 } 3643 if( lwr>upr ){ 3644 break; 3645 } 3646 pCur->idx = (lwr+upr)/2; 3647 } 3648 assert( lwr==upr+1 ); 3649 assert( pPage->isInit ); 3650 if( pPage->leaf ){ 3651 chldPg = 0; 3652 }else if( lwr>=pPage->nCell ){ 3653 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 3654 }else{ 3655 chldPg = get4byte(findCell(pPage, lwr)); 3656 } 3657 if( chldPg==0 ){ 3658 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 3659 if( pRes ) *pRes = c; 3660 return SQLITE_OK; 3661 } 3662 pCur->idx = lwr; 3663 pCur->info.nSize = 0; 3664 rc = moveToChild(pCur, chldPg); 3665 if( rc ){ 3666 return rc; 3667 } 3668 } 3669 /* NOT REACHED */ 3670 } 3671 3672 3673 /* 3674 ** Return TRUE if the cursor is not pointing at an entry of the table. 3675 ** 3676 ** TRUE will be returned after a call to sqlite3BtreeNext() moves 3677 ** past the last entry in the table or sqlite3BtreePrev() moves past 3678 ** the first entry. TRUE is also returned if the table is empty. 3679 */ 3680 int sqlite3BtreeEof(BtCursor *pCur){ 3681 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 3682 ** have been deleted? This API will need to change to return an error code 3683 ** as well as the boolean result value. 3684 */ 3685 return (CURSOR_VALID!=pCur->eState); 3686 } 3687 3688 /* 3689 ** Return the database connection handle for a cursor. 3690 */ 3691 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){ 3692 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 3693 return pCur->pBtree->db; 3694 } 3695 3696 /* 3697 ** Advance the cursor to the next entry in the database. If 3698 ** successful then set *pRes=0. If the cursor 3699 ** was already pointing to the last entry in the database before 3700 ** this routine was called, then set *pRes=1. 3701 */ 3702 static int btreeNext(BtCursor *pCur, int *pRes){ 3703 int rc; 3704 MemPage *pPage; 3705 3706 assert( cursorHoldsMutex(pCur) ); 3707 rc = restoreOrClearCursorPosition(pCur); 3708 if( rc!=SQLITE_OK ){ 3709 return rc; 3710 } 3711 assert( pRes!=0 ); 3712 pPage = pCur->pPage; 3713 if( CURSOR_INVALID==pCur->eState ){ 3714 *pRes = 1; 3715 return SQLITE_OK; 3716 } 3717 if( pCur->skip>0 ){ 3718 pCur->skip = 0; 3719 *pRes = 0; 3720 return SQLITE_OK; 3721 } 3722 pCur->skip = 0; 3723 3724 assert( pPage->isInit ); 3725 assert( pCur->idx<pPage->nCell ); 3726 3727 pCur->idx++; 3728 pCur->info.nSize = 0; 3729 if( pCur->idx>=pPage->nCell ){ 3730 if( !pPage->leaf ){ 3731 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 3732 if( rc ) return rc; 3733 rc = moveToLeftmost(pCur); 3734 *pRes = 0; 3735 return rc; 3736 } 3737 do{ 3738 if( sqlite3BtreeIsRootPage(pPage) ){ 3739 *pRes = 1; 3740 pCur->eState = CURSOR_INVALID; 3741 return SQLITE_OK; 3742 } 3743 sqlite3BtreeMoveToParent(pCur); 3744 pPage = pCur->pPage; 3745 }while( pCur->idx>=pPage->nCell ); 3746 *pRes = 0; 3747 if( pPage->leafData ){ 3748 rc = sqlite3BtreeNext(pCur, pRes); 3749 }else{ 3750 rc = SQLITE_OK; 3751 } 3752 return rc; 3753 } 3754 *pRes = 0; 3755 if( pPage->leaf ){ 3756 return SQLITE_OK; 3757 } 3758 rc = moveToLeftmost(pCur); 3759 return rc; 3760 } 3761 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ 3762 int rc; 3763 assert( cursorHoldsMutex(pCur) ); 3764 rc = btreeNext(pCur, pRes); 3765 return rc; 3766 } 3767 3768 3769 /* 3770 ** Step the cursor to the back to the previous entry in the database. If 3771 ** successful then set *pRes=0. If the cursor 3772 ** was already pointing to the first entry in the database before 3773 ** this routine was called, then set *pRes=1. 3774 */ 3775 static int btreePrevious(BtCursor *pCur, int *pRes){ 3776 int rc; 3777 Pgno pgno; 3778 MemPage *pPage; 3779 3780 assert( cursorHoldsMutex(pCur) ); 3781 rc = restoreOrClearCursorPosition(pCur); 3782 if( rc!=SQLITE_OK ){ 3783 return rc; 3784 } 3785 if( CURSOR_INVALID==pCur->eState ){ 3786 *pRes = 1; 3787 return SQLITE_OK; 3788 } 3789 if( pCur->skip<0 ){ 3790 pCur->skip = 0; 3791 *pRes = 0; 3792 return SQLITE_OK; 3793 } 3794 pCur->skip = 0; 3795 3796 pPage = pCur->pPage; 3797 assert( pPage->isInit ); 3798 assert( pCur->idx>=0 ); 3799 if( !pPage->leaf ){ 3800 pgno = get4byte( findCell(pPage, pCur->idx) ); 3801 rc = moveToChild(pCur, pgno); 3802 if( rc ){ 3803 return rc; 3804 } 3805 rc = moveToRightmost(pCur); 3806 }else{ 3807 while( pCur->idx==0 ){ 3808 if( sqlite3BtreeIsRootPage(pPage) ){ 3809 pCur->eState = CURSOR_INVALID; 3810 *pRes = 1; 3811 return SQLITE_OK; 3812 } 3813 sqlite3BtreeMoveToParent(pCur); 3814 pPage = pCur->pPage; 3815 } 3816 pCur->idx--; 3817 pCur->info.nSize = 0; 3818 if( pPage->leafData && !pPage->leaf ){ 3819 rc = sqlite3BtreePrevious(pCur, pRes); 3820 }else{ 3821 rc = SQLITE_OK; 3822 } 3823 } 3824 *pRes = 0; 3825 return rc; 3826 } 3827 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ 3828 int rc; 3829 assert( cursorHoldsMutex(pCur) ); 3830 rc = btreePrevious(pCur, pRes); 3831 return rc; 3832 } 3833 3834 /* 3835 ** Allocate a new page from the database file. 3836 ** 3837 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 3838 ** has already been called on the new page.) The new page has also 3839 ** been referenced and the calling routine is responsible for calling 3840 ** sqlite3PagerUnref() on the new page when it is done. 3841 ** 3842 ** SQLITE_OK is returned on success. Any other return value indicates 3843 ** an error. *ppPage and *pPgno are undefined in the event of an error. 3844 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned. 3845 ** 3846 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 3847 ** locate a page close to the page number "nearby". This can be used in an 3848 ** attempt to keep related pages close to each other in the database file, 3849 ** which in turn can make database access faster. 3850 ** 3851 ** If the "exact" parameter is not 0, and the page-number nearby exists 3852 ** anywhere on the free-list, then it is guarenteed to be returned. This 3853 ** is only used by auto-vacuum databases when allocating a new table. 3854 */ 3855 static int allocateBtreePage( 3856 BtShared *pBt, 3857 MemPage **ppPage, 3858 Pgno *pPgno, 3859 Pgno nearby, 3860 u8 exact 3861 ){ 3862 MemPage *pPage1; 3863 int rc; 3864 int n; /* Number of pages on the freelist */ 3865 int k; /* Number of leaves on the trunk of the freelist */ 3866 MemPage *pTrunk = 0; 3867 MemPage *pPrevTrunk = 0; 3868 3869 assert( sqlite3_mutex_held(pBt->mutex) ); 3870 pPage1 = pBt->pPage1; 3871 n = get4byte(&pPage1->aData[36]); 3872 if( n>0 ){ 3873 /* There are pages on the freelist. Reuse one of those pages. */ 3874 Pgno iTrunk; 3875 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 3876 3877 /* If the 'exact' parameter was true and a query of the pointer-map 3878 ** shows that the page 'nearby' is somewhere on the free-list, then 3879 ** the entire-list will be searched for that page. 3880 */ 3881 #ifndef SQLITE_OMIT_AUTOVACUUM 3882 if( exact && nearby<=sqlite3PagerPagecount(pBt->pPager) ){ 3883 u8 eType; 3884 assert( nearby>0 ); 3885 assert( pBt->autoVacuum ); 3886 rc = ptrmapGet(pBt, nearby, &eType, 0); 3887 if( rc ) return rc; 3888 if( eType==PTRMAP_FREEPAGE ){ 3889 searchList = 1; 3890 } 3891 *pPgno = nearby; 3892 } 3893 #endif 3894 3895 /* Decrement the free-list count by 1. Set iTrunk to the index of the 3896 ** first free-list trunk page. iPrevTrunk is initially 1. 3897 */ 3898 rc = sqlite3PagerWrite(pPage1->pDbPage); 3899 if( rc ) return rc; 3900 put4byte(&pPage1->aData[36], n-1); 3901 3902 /* The code within this loop is run only once if the 'searchList' variable 3903 ** is not true. Otherwise, it runs once for each trunk-page on the 3904 ** free-list until the page 'nearby' is located. 3905 */ 3906 do { 3907 pPrevTrunk = pTrunk; 3908 if( pPrevTrunk ){ 3909 iTrunk = get4byte(&pPrevTrunk->aData[0]); 3910 }else{ 3911 iTrunk = get4byte(&pPage1->aData[32]); 3912 } 3913 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0); 3914 if( rc ){ 3915 pTrunk = 0; 3916 goto end_allocate_page; 3917 } 3918 3919 k = get4byte(&pTrunk->aData[4]); 3920 if( k==0 && !searchList ){ 3921 /* The trunk has no leaves and the list is not being searched. 3922 ** So extract the trunk page itself and use it as the newly 3923 ** allocated page */ 3924 assert( pPrevTrunk==0 ); 3925 rc = sqlite3PagerWrite(pTrunk->pDbPage); 3926 if( rc ){ 3927 goto end_allocate_page; 3928 } 3929 *pPgno = iTrunk; 3930 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 3931 *ppPage = pTrunk; 3932 pTrunk = 0; 3933 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 3934 }else if( k>pBt->usableSize/4 - 8 ){ 3935 /* Value of k is out of range. Database corruption */ 3936 rc = SQLITE_CORRUPT_BKPT; 3937 goto end_allocate_page; 3938 #ifndef SQLITE_OMIT_AUTOVACUUM 3939 }else if( searchList && nearby==iTrunk ){ 3940 /* The list is being searched and this trunk page is the page 3941 ** to allocate, regardless of whether it has leaves. 3942 */ 3943 assert( *pPgno==iTrunk ); 3944 *ppPage = pTrunk; 3945 searchList = 0; 3946 rc = sqlite3PagerWrite(pTrunk->pDbPage); 3947 if( rc ){ 3948 goto end_allocate_page; 3949 } 3950 if( k==0 ){ 3951 if( !pPrevTrunk ){ 3952 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 3953 }else{ 3954 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 3955 } 3956 }else{ 3957 /* The trunk page is required by the caller but it contains 3958 ** pointers to free-list leaves. The first leaf becomes a trunk 3959 ** page in this case. 3960 */ 3961 MemPage *pNewTrunk; 3962 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 3963 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); 3964 if( rc!=SQLITE_OK ){ 3965 goto end_allocate_page; 3966 } 3967 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 3968 if( rc!=SQLITE_OK ){ 3969 releasePage(pNewTrunk); 3970 goto end_allocate_page; 3971 } 3972 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 3973 put4byte(&pNewTrunk->aData[4], k-1); 3974 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 3975 releasePage(pNewTrunk); 3976 if( !pPrevTrunk ){ 3977 put4byte(&pPage1->aData[32], iNewTrunk); 3978 }else{ 3979 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 3980 if( rc ){ 3981 goto end_allocate_page; 3982 } 3983 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 3984 } 3985 } 3986 pTrunk = 0; 3987 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 3988 #endif 3989 }else{ 3990 /* Extract a leaf from the trunk */ 3991 int closest; 3992 Pgno iPage; 3993 unsigned char *aData = pTrunk->aData; 3994 rc = sqlite3PagerWrite(pTrunk->pDbPage); 3995 if( rc ){ 3996 goto end_allocate_page; 3997 } 3998 if( nearby>0 ){ 3999 int i, dist; 4000 closest = 0; 4001 dist = get4byte(&aData[8]) - nearby; 4002 if( dist<0 ) dist = -dist; 4003 for(i=1; i<k; i++){ 4004 int d2 = get4byte(&aData[8+i*4]) - nearby; 4005 if( d2<0 ) d2 = -d2; 4006 if( d2<dist ){ 4007 closest = i; 4008 dist = d2; 4009 } 4010 } 4011 }else{ 4012 closest = 0; 4013 } 4014 4015 iPage = get4byte(&aData[8+closest*4]); 4016 if( !searchList || iPage==nearby ){ 4017 *pPgno = iPage; 4018 if( *pPgno>sqlite3PagerPagecount(pBt->pPager) ){ 4019 /* Free page off the end of the file */ 4020 return SQLITE_CORRUPT_BKPT; 4021 } 4022 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" 4023 ": %d more free pages\n", 4024 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 4025 if( closest<k-1 ){ 4026 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 4027 } 4028 put4byte(&aData[4], k-1); 4029 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1); 4030 if( rc==SQLITE_OK ){ 4031 sqlite3PagerDontRollback((*ppPage)->pDbPage); 4032 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4033 if( rc!=SQLITE_OK ){ 4034 releasePage(*ppPage); 4035 } 4036 } 4037 searchList = 0; 4038 } 4039 } 4040 releasePage(pPrevTrunk); 4041 pPrevTrunk = 0; 4042 }while( searchList ); 4043 }else{ 4044 /* There are no pages on the freelist, so create a new page at the 4045 ** end of the file */ 4046 *pPgno = sqlite3PagerPagecount(pBt->pPager) + 1; 4047 4048 #ifndef SQLITE_OMIT_AUTOVACUUM 4049 if( pBt->nTrunc ){ 4050 /* An incr-vacuum has already run within this transaction. So the 4051 ** page to allocate is not from the physical end of the file, but 4052 ** at pBt->nTrunc. 4053 */ 4054 *pPgno = pBt->nTrunc+1; 4055 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ 4056 (*pPgno)++; 4057 } 4058 } 4059 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){ 4060 /* If *pPgno refers to a pointer-map page, allocate two new pages 4061 ** at the end of the file instead of one. The first allocated page 4062 ** becomes a new pointer-map page, the second is used by the caller. 4063 */ 4064 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno)); 4065 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4066 (*pPgno)++; 4067 } 4068 if( pBt->nTrunc ){ 4069 pBt->nTrunc = *pPgno; 4070 } 4071 #endif 4072 4073 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4074 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0); 4075 if( rc ) return rc; 4076 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4077 if( rc!=SQLITE_OK ){ 4078 releasePage(*ppPage); 4079 } 4080 TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); 4081 } 4082 4083 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4084 4085 end_allocate_page: 4086 releasePage(pTrunk); 4087 releasePage(pPrevTrunk); 4088 return rc; 4089 } 4090 4091 /* 4092 ** Add a page of the database file to the freelist. 4093 ** 4094 ** sqlite3PagerUnref() is NOT called for pPage. 4095 */ 4096 static int freePage(MemPage *pPage){ 4097 BtShared *pBt = pPage->pBt; 4098 MemPage *pPage1 = pBt->pPage1; 4099 int rc, n, k; 4100 4101 /* Prepare the page for freeing */ 4102 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4103 assert( pPage->pgno>1 ); 4104 pPage->isInit = 0; 4105 releasePage(pPage->pParent); 4106 pPage->pParent = 0; 4107 4108 /* Increment the free page count on pPage1 */ 4109 rc = sqlite3PagerWrite(pPage1->pDbPage); 4110 if( rc ) return rc; 4111 n = get4byte(&pPage1->aData[36]); 4112 put4byte(&pPage1->aData[36], n+1); 4113 4114 #ifdef SQLITE_SECURE_DELETE 4115 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then 4116 ** always fully overwrite deleted information with zeros. 4117 */ 4118 rc = sqlite3PagerWrite(pPage->pDbPage); 4119 if( rc ) return rc; 4120 memset(pPage->aData, 0, pPage->pBt->pageSize); 4121 #endif 4122 4123 #ifndef SQLITE_OMIT_AUTOVACUUM 4124 /* If the database supports auto-vacuum, write an entry in the pointer-map 4125 ** to indicate that the page is free. 4126 */ 4127 if( pBt->autoVacuum ){ 4128 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0); 4129 if( rc ) return rc; 4130 } 4131 #endif 4132 4133 if( n==0 ){ 4134 /* This is the first free page */ 4135 rc = sqlite3PagerWrite(pPage->pDbPage); 4136 if( rc ) return rc; 4137 memset(pPage->aData, 0, 8); 4138 put4byte(&pPage1->aData[32], pPage->pgno); 4139 TRACE(("FREE-PAGE: %d first\n", pPage->pgno)); 4140 }else{ 4141 /* Other free pages already exist. Retrive the first trunk page 4142 ** of the freelist and find out how many leaves it has. */ 4143 MemPage *pTrunk; 4144 rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0); 4145 if( rc ) return rc; 4146 k = get4byte(&pTrunk->aData[4]); 4147 if( k>=pBt->usableSize/4 - 8 ){ 4148 /* The trunk is full. Turn the page being freed into a new 4149 ** trunk page with no leaves. */ 4150 rc = sqlite3PagerWrite(pPage->pDbPage); 4151 if( rc==SQLITE_OK ){ 4152 put4byte(pPage->aData, pTrunk->pgno); 4153 put4byte(&pPage->aData[4], 0); 4154 put4byte(&pPage1->aData[32], pPage->pgno); 4155 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", 4156 pPage->pgno, pTrunk->pgno)); 4157 } 4158 }else if( k<0 ){ 4159 rc = SQLITE_CORRUPT; 4160 }else{ 4161 /* Add the newly freed page as a leaf on the current trunk */ 4162 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4163 if( rc==SQLITE_OK ){ 4164 put4byte(&pTrunk->aData[4], k+1); 4165 put4byte(&pTrunk->aData[8+k*4], pPage->pgno); 4166 #ifndef SQLITE_SECURE_DELETE 4167 sqlite3PagerDontWrite(pPage->pDbPage); 4168 #endif 4169 } 4170 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); 4171 } 4172 releasePage(pTrunk); 4173 } 4174 return rc; 4175 } 4176 4177 /* 4178 ** Free any overflow pages associated with the given Cell. 4179 */ 4180 static int clearCell(MemPage *pPage, unsigned char *pCell){ 4181 BtShared *pBt = pPage->pBt; 4182 CellInfo info; 4183 Pgno ovflPgno; 4184 int rc; 4185 int nOvfl; 4186 int ovflPageSize; 4187 4188 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4189 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4190 if( info.iOverflow==0 ){ 4191 return SQLITE_OK; /* No overflow pages. Return without doing anything */ 4192 } 4193 ovflPgno = get4byte(&pCell[info.iOverflow]); 4194 ovflPageSize = pBt->usableSize - 4; 4195 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; 4196 assert( ovflPgno==0 || nOvfl>0 ); 4197 while( nOvfl-- ){ 4198 MemPage *pOvfl; 4199 if( ovflPgno==0 || ovflPgno>sqlite3PagerPagecount(pBt->pPager) ){ 4200 return SQLITE_CORRUPT_BKPT; 4201 } 4202 4203 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno); 4204 if( rc ) return rc; 4205 rc = freePage(pOvfl); 4206 sqlite3PagerUnref(pOvfl->pDbPage); 4207 if( rc ) return rc; 4208 } 4209 return SQLITE_OK; 4210 } 4211 4212 /* 4213 ** Create the byte sequence used to represent a cell on page pPage 4214 ** and write that byte sequence into pCell[]. Overflow pages are 4215 ** allocated and filled in as necessary. The calling procedure 4216 ** is responsible for making sure sufficient space has been allocated 4217 ** for pCell[]. 4218 ** 4219 ** Note that pCell does not necessary need to point to the pPage->aData 4220 ** area. pCell might point to some temporary storage. The cell will 4221 ** be constructed in this temporary area then copied into pPage->aData 4222 ** later. 4223 */ 4224 static int fillInCell( 4225 MemPage *pPage, /* The page that contains the cell */ 4226 unsigned char *pCell, /* Complete text of the cell */ 4227 const void *pKey, i64 nKey, /* The key */ 4228 const void *pData,int nData, /* The data */ 4229 int nZero, /* Extra zero bytes to append to pData */ 4230 int *pnSize /* Write cell size here */ 4231 ){ 4232 int nPayload; 4233 const u8 *pSrc; 4234 int nSrc, n, rc; 4235 int spaceLeft; 4236 MemPage *pOvfl = 0; 4237 MemPage *pToRelease = 0; 4238 unsigned char *pPrior; 4239 unsigned char *pPayload; 4240 BtShared *pBt = pPage->pBt; 4241 Pgno pgnoOvfl = 0; 4242 int nHeader; 4243 CellInfo info; 4244 4245 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4246 4247 /* Fill in the header. */ 4248 nHeader = 0; 4249 if( !pPage->leaf ){ 4250 nHeader += 4; 4251 } 4252 if( pPage->hasData ){ 4253 nHeader += putVarint(&pCell[nHeader], nData+nZero); 4254 }else{ 4255 nData = nZero = 0; 4256 } 4257 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey); 4258 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4259 assert( info.nHeader==nHeader ); 4260 assert( info.nKey==nKey ); 4261 assert( info.nData==nData+nZero ); 4262 4263 /* Fill in the payload */ 4264 nPayload = nData + nZero; 4265 if( pPage->intKey ){ 4266 pSrc = pData; 4267 nSrc = nData; 4268 nData = 0; 4269 }else{ 4270 nPayload += nKey; 4271 pSrc = pKey; 4272 nSrc = nKey; 4273 } 4274 *pnSize = info.nSize; 4275 spaceLeft = info.nLocal; 4276 pPayload = &pCell[nHeader]; 4277 pPrior = &pCell[info.iOverflow]; 4278 4279 while( nPayload>0 ){ 4280 if( spaceLeft==0 ){ 4281 int isExact = 0; 4282 #ifndef SQLITE_OMIT_AUTOVACUUM 4283 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ 4284 if( pBt->autoVacuum ){ 4285 do{ 4286 pgnoOvfl++; 4287 } while( 4288 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 4289 ); 4290 if( pgnoOvfl>1 ){ 4291 /* isExact = 1; */ 4292 } 4293 } 4294 #endif 4295 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact); 4296 #ifndef SQLITE_OMIT_AUTOVACUUM 4297 /* If the database supports auto-vacuum, and the second or subsequent 4298 ** overflow page is being allocated, add an entry to the pointer-map 4299 ** for that page now. 4300 ** 4301 ** If this is the first overflow page, then write a partial entry 4302 ** to the pointer-map. If we write nothing to this pointer-map slot, 4303 ** then the optimistic overflow chain processing in clearCell() 4304 ** may misinterpret the uninitialised values and delete the 4305 ** wrong pages from the database. 4306 */ 4307 if( pBt->autoVacuum && rc==SQLITE_OK ){ 4308 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); 4309 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap); 4310 if( rc ){ 4311 releasePage(pOvfl); 4312 } 4313 } 4314 #endif 4315 if( rc ){ 4316 releasePage(pToRelease); 4317 return rc; 4318 } 4319 put4byte(pPrior, pgnoOvfl); 4320 releasePage(pToRelease); 4321 pToRelease = pOvfl; 4322 pPrior = pOvfl->aData; 4323 put4byte(pPrior, 0); 4324 pPayload = &pOvfl->aData[4]; 4325 spaceLeft = pBt->usableSize - 4; 4326 } 4327 n = nPayload; 4328 if( n>spaceLeft ) n = spaceLeft; 4329 if( nSrc>0 ){ 4330 if( n>nSrc ) n = nSrc; 4331 assert( pSrc ); 4332 memcpy(pPayload, pSrc, n); 4333 }else{ 4334 memset(pPayload, 0, n); 4335 } 4336 nPayload -= n; 4337 pPayload += n; 4338 pSrc += n; 4339 nSrc -= n; 4340 spaceLeft -= n; 4341 if( nSrc==0 ){ 4342 nSrc = nData; 4343 pSrc = pData; 4344 } 4345 } 4346 releasePage(pToRelease); 4347 return SQLITE_OK; 4348 } 4349 4350 /* 4351 ** Change the MemPage.pParent pointer on the page whose number is 4352 ** given in the second argument so that MemPage.pParent holds the 4353 ** pointer in the third argument. 4354 */ 4355 static int reparentPage(BtShared *pBt, Pgno pgno, MemPage *pNewParent, int idx){ 4356 MemPage *pThis; 4357 DbPage *pDbPage; 4358 4359 assert( sqlite3_mutex_held(pBt->mutex) ); 4360 assert( pNewParent!=0 ); 4361 if( pgno==0 ) return SQLITE_OK; 4362 assert( pBt->pPager!=0 ); 4363 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 4364 if( pDbPage ){ 4365 pThis = (MemPage *)sqlite3PagerGetExtra(pDbPage); 4366 if( pThis->isInit ){ 4367 assert( pThis->aData==sqlite3PagerGetData(pDbPage) ); 4368 if( pThis->pParent!=pNewParent ){ 4369 if( pThis->pParent ) sqlite3PagerUnref(pThis->pParent->pDbPage); 4370 pThis->pParent = pNewParent; 4371 sqlite3PagerRef(pNewParent->pDbPage); 4372 } 4373 pThis->idxParent = idx; 4374 } 4375 sqlite3PagerUnref(pDbPage); 4376 } 4377 4378 #ifndef SQLITE_OMIT_AUTOVACUUM 4379 if( pBt->autoVacuum ){ 4380 return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno); 4381 } 4382 #endif 4383 return SQLITE_OK; 4384 } 4385 4386 4387 4388 /* 4389 ** Change the pParent pointer of all children of pPage to point back 4390 ** to pPage. 4391 ** 4392 ** In other words, for every child of pPage, invoke reparentPage() 4393 ** to make sure that each child knows that pPage is its parent. 4394 ** 4395 ** This routine gets called after you memcpy() one page into 4396 ** another. 4397 */ 4398 static int reparentChildPages(MemPage *pPage){ 4399 int i; 4400 BtShared *pBt = pPage->pBt; 4401 int rc = SQLITE_OK; 4402 4403 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4404 if( pPage->leaf ) return SQLITE_OK; 4405 4406 for(i=0; i<pPage->nCell; i++){ 4407 u8 *pCell = findCell(pPage, i); 4408 if( !pPage->leaf ){ 4409 rc = reparentPage(pBt, get4byte(pCell), pPage, i); 4410 if( rc!=SQLITE_OK ) return rc; 4411 } 4412 } 4413 if( !pPage->leaf ){ 4414 rc = reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]), 4415 pPage, i); 4416 pPage->idxShift = 0; 4417 } 4418 return rc; 4419 } 4420 4421 /* 4422 ** Remove the i-th cell from pPage. This routine effects pPage only. 4423 ** The cell content is not freed or deallocated. It is assumed that 4424 ** the cell content has been copied someplace else. This routine just 4425 ** removes the reference to the cell from pPage. 4426 ** 4427 ** "sz" must be the number of bytes in the cell. 4428 */ 4429 static void dropCell(MemPage *pPage, int idx, int sz){ 4430 int i; /* Loop counter */ 4431 int pc; /* Offset to cell content of cell being deleted */ 4432 u8 *data; /* pPage->aData */ 4433 u8 *ptr; /* Used to move bytes around within data[] */ 4434 4435 assert( idx>=0 && idx<pPage->nCell ); 4436 assert( sz==cellSize(pPage, idx) ); 4437 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 4438 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4439 data = pPage->aData; 4440 ptr = &data[pPage->cellOffset + 2*idx]; 4441 pc = get2byte(ptr); 4442 assert( pc>10 && pc+sz<=pPage->pBt->usableSize ); 4443 freeSpace(pPage, pc, sz); 4444 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){ 4445 ptr[0] = ptr[2]; 4446 ptr[1] = ptr[3]; 4447 } 4448 pPage->nCell--; 4449 put2byte(&data[pPage->hdrOffset+3], pPage->nCell); 4450 pPage->nFree += 2; 4451 pPage->idxShift = 1; 4452 } 4453 4454 /* 4455 ** Insert a new cell on pPage at cell index "i". pCell points to the 4456 ** content of the cell. 4457 ** 4458 ** If the cell content will fit on the page, then put it there. If it 4459 ** will not fit, then make a copy of the cell content into pTemp if 4460 ** pTemp is not null. Regardless of pTemp, allocate a new entry 4461 ** in pPage->aOvfl[] and make it point to the cell content (either 4462 ** in pTemp or the original pCell) and also record its index. 4463 ** Allocating a new entry in pPage->aCell[] implies that 4464 ** pPage->nOverflow is incremented. 4465 ** 4466 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the 4467 ** cell. The caller will overwrite them after this function returns. If 4468 ** nSkip is non-zero, then pCell may not point to an invalid memory location 4469 ** (but pCell+nSkip is always valid). 4470 */ 4471 static int insertCell( 4472 MemPage *pPage, /* Page into which we are copying */ 4473 int i, /* New cell becomes the i-th cell of the page */ 4474 u8 *pCell, /* Content of the new cell */ 4475 int sz, /* Bytes of content in pCell */ 4476 u8 *pTemp, /* Temp storage space for pCell, if needed */ 4477 u8 nSkip /* Do not write the first nSkip bytes of the cell */ 4478 ){ 4479 int idx; /* Where to write new cell content in data[] */ 4480 int j; /* Loop counter */ 4481 int top; /* First byte of content for any cell in data[] */ 4482 int end; /* First byte past the last cell pointer in data[] */ 4483 int ins; /* Index in data[] where new cell pointer is inserted */ 4484 int hdr; /* Offset into data[] of the page header */ 4485 int cellOffset; /* Address of first cell pointer in data[] */ 4486 u8 *data; /* The content of the whole page */ 4487 u8 *ptr; /* Used for moving information around in data[] */ 4488 4489 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 4490 assert( sz==cellSizePtr(pPage, pCell) ); 4491 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4492 if( pPage->nOverflow || sz+2>pPage->nFree ){ 4493 if( pTemp ){ 4494 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); 4495 pCell = pTemp; 4496 } 4497 j = pPage->nOverflow++; 4498 assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) ); 4499 pPage->aOvfl[j].pCell = pCell; 4500 pPage->aOvfl[j].idx = i; 4501 pPage->nFree = 0; 4502 }else{ 4503 int rc = sqlite3PagerWrite(pPage->pDbPage); 4504 if( rc!=SQLITE_OK ){ 4505 return rc; 4506 } 4507 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 4508 data = pPage->aData; 4509 hdr = pPage->hdrOffset; 4510 top = get2byte(&data[hdr+5]); 4511 cellOffset = pPage->cellOffset; 4512 end = cellOffset + 2*pPage->nCell + 2; 4513 ins = cellOffset + 2*i; 4514 if( end > top - sz ){ 4515 rc = defragmentPage(pPage); 4516 if( rc!=SQLITE_OK ) return rc; 4517 top = get2byte(&data[hdr+5]); 4518 assert( end + sz <= top ); 4519 } 4520 idx = allocateSpace(pPage, sz); 4521 assert( idx>0 ); 4522 assert( end <= get2byte(&data[hdr+5]) ); 4523 pPage->nCell++; 4524 pPage->nFree -= 2; 4525 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); 4526 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){ 4527 ptr[0] = ptr[-2]; 4528 ptr[1] = ptr[-1]; 4529 } 4530 put2byte(&data[ins], idx); 4531 put2byte(&data[hdr+3], pPage->nCell); 4532 pPage->idxShift = 1; 4533 #ifndef SQLITE_OMIT_AUTOVACUUM 4534 if( pPage->pBt->autoVacuum ){ 4535 /* The cell may contain a pointer to an overflow page. If so, write 4536 ** the entry for the overflow page into the pointer map. 4537 */ 4538 CellInfo info; 4539 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4540 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); 4541 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ 4542 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); 4543 rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno); 4544 if( rc!=SQLITE_OK ) return rc; 4545 } 4546 } 4547 #endif 4548 } 4549 4550 return SQLITE_OK; 4551 } 4552 4553 /* 4554 ** Add a list of cells to a page. The page should be initially empty. 4555 ** The cells are guaranteed to fit on the page. 4556 */ 4557 static void assemblePage( 4558 MemPage *pPage, /* The page to be assemblied */ 4559 int nCell, /* The number of cells to add to this page */ 4560 u8 **apCell, /* Pointers to cell bodies */ 4561 int *aSize /* Sizes of the cells */ 4562 ){ 4563 int i; /* Loop counter */ 4564 int totalSize; /* Total size of all cells */ 4565 int hdr; /* Index of page header */ 4566 int cellptr; /* Address of next cell pointer */ 4567 int cellbody; /* Address of next cell body */ 4568 u8 *data; /* Data for the page */ 4569 4570 assert( pPage->nOverflow==0 ); 4571 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4572 totalSize = 0; 4573 for(i=0; i<nCell; i++){ 4574 totalSize += aSize[i]; 4575 } 4576 assert( totalSize+2*nCell<=pPage->nFree ); 4577 assert( pPage->nCell==0 ); 4578 cellptr = pPage->cellOffset; 4579 data = pPage->aData; 4580 hdr = pPage->hdrOffset; 4581 put2byte(&data[hdr+3], nCell); 4582 if( nCell ){ 4583 cellbody = allocateSpace(pPage, totalSize); 4584 assert( cellbody>0 ); 4585 assert( pPage->nFree >= 2*nCell ); 4586 pPage->nFree -= 2*nCell; 4587 for(i=0; i<nCell; i++){ 4588 put2byte(&data[cellptr], cellbody); 4589 memcpy(&data[cellbody], apCell[i], aSize[i]); 4590 cellptr += 2; 4591 cellbody += aSize[i]; 4592 } 4593 assert( cellbody==pPage->pBt->usableSize ); 4594 } 4595 pPage->nCell = nCell; 4596 } 4597 4598 /* 4599 ** The following parameters determine how many adjacent pages get involved 4600 ** in a balancing operation. NN is the number of neighbors on either side 4601 ** of the page that participate in the balancing operation. NB is the 4602 ** total number of pages that participate, including the target page and 4603 ** NN neighbors on either side. 4604 ** 4605 ** The minimum value of NN is 1 (of course). Increasing NN above 1 4606 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 4607 ** in exchange for a larger degradation in INSERT and UPDATE performance. 4608 ** The value of NN appears to give the best results overall. 4609 */ 4610 #define NN 1 /* Number of neighbors on either side of pPage */ 4611 #define NB (NN*2+1) /* Total pages involved in the balance */ 4612 4613 /* Forward reference */ 4614 static int balance(MemPage*, int); 4615 4616 #ifndef SQLITE_OMIT_QUICKBALANCE 4617 /* 4618 ** This version of balance() handles the common special case where 4619 ** a new entry is being inserted on the extreme right-end of the 4620 ** tree, in other words, when the new entry will become the largest 4621 ** entry in the tree. 4622 ** 4623 ** Instead of trying balance the 3 right-most leaf pages, just add 4624 ** a new page to the right-hand side and put the one new entry in 4625 ** that page. This leaves the right side of the tree somewhat 4626 ** unbalanced. But odds are that we will be inserting new entries 4627 ** at the end soon afterwards so the nearly empty page will quickly 4628 ** fill up. On average. 4629 ** 4630 ** pPage is the leaf page which is the right-most page in the tree. 4631 ** pParent is its parent. pPage must have a single overflow entry 4632 ** which is also the right-most entry on the page. 4633 */ 4634 static int balance_quick(MemPage *pPage, MemPage *pParent){ 4635 int rc; 4636 MemPage *pNew; 4637 Pgno pgnoNew; 4638 u8 *pCell; 4639 int szCell; 4640 CellInfo info; 4641 BtShared *pBt = pPage->pBt; 4642 int parentIdx = pParent->nCell; /* pParent new divider cell index */ 4643 int parentSize; /* Size of new divider cell */ 4644 u8 parentCell[64]; /* Space for the new divider cell */ 4645 4646 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4647 4648 /* Allocate a new page. Insert the overflow cell from pPage 4649 ** into it. Then remove the overflow cell from pPage. 4650 */ 4651 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 4652 if( rc!=SQLITE_OK ){ 4653 return rc; 4654 } 4655 pCell = pPage->aOvfl[0].pCell; 4656 szCell = cellSizePtr(pPage, pCell); 4657 zeroPage(pNew, pPage->aData[0]); 4658 assemblePage(pNew, 1, &pCell, &szCell); 4659 pPage->nOverflow = 0; 4660 4661 /* Set the parent of the newly allocated page to pParent. */ 4662 pNew->pParent = pParent; 4663 sqlite3PagerRef(pParent->pDbPage); 4664 4665 /* pPage is currently the right-child of pParent. Change this 4666 ** so that the right-child is the new page allocated above and 4667 ** pPage is the next-to-right child. 4668 */ 4669 assert( pPage->nCell>0 ); 4670 pCell = findCell(pPage, pPage->nCell-1); 4671 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4672 rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize); 4673 if( rc!=SQLITE_OK ){ 4674 return rc; 4675 } 4676 assert( parentSize<64 ); 4677 rc = insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4); 4678 if( rc!=SQLITE_OK ){ 4679 return rc; 4680 } 4681 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno); 4682 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 4683 4684 #ifndef SQLITE_OMIT_AUTOVACUUM 4685 /* If this is an auto-vacuum database, update the pointer map 4686 ** with entries for the new page, and any pointer from the 4687 ** cell on the page to an overflow page. 4688 */ 4689 if( pBt->autoVacuum ){ 4690 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno); 4691 if( rc==SQLITE_OK ){ 4692 rc = ptrmapPutOvfl(pNew, 0); 4693 } 4694 if( rc!=SQLITE_OK ){ 4695 releasePage(pNew); 4696 return rc; 4697 } 4698 } 4699 #endif 4700 4701 /* Release the reference to the new page and balance the parent page, 4702 ** in case the divider cell inserted caused it to become overfull. 4703 */ 4704 releasePage(pNew); 4705 return balance(pParent, 0); 4706 } 4707 #endif /* SQLITE_OMIT_QUICKBALANCE */ 4708 4709 /* 4710 ** This routine redistributes Cells on pPage and up to NN*2 siblings 4711 ** of pPage so that all pages have about the same amount of free space. 4712 ** Usually NN siblings on either side of pPage is used in the balancing, 4713 ** though more siblings might come from one side if pPage is the first 4714 ** or last child of its parent. If pPage has fewer than 2*NN siblings 4715 ** (something which can only happen if pPage is the root page or a 4716 ** child of root) then all available siblings participate in the balancing. 4717 ** 4718 ** The number of siblings of pPage might be increased or decreased by one or 4719 ** two in an effort to keep pages nearly full but not over full. The root page 4720 ** is special and is allowed to be nearly empty. If pPage is 4721 ** the root page, then the depth of the tree might be increased 4722 ** or decreased by one, as necessary, to keep the root page from being 4723 ** overfull or completely empty. 4724 ** 4725 ** Note that when this routine is called, some of the Cells on pPage 4726 ** might not actually be stored in pPage->aData[]. This can happen 4727 ** if the page is overfull. Part of the job of this routine is to 4728 ** make sure all Cells for pPage once again fit in pPage->aData[]. 4729 ** 4730 ** In the course of balancing the siblings of pPage, the parent of pPage 4731 ** might become overfull or underfull. If that happens, then this routine 4732 ** is called recursively on the parent. 4733 ** 4734 ** If this routine fails for any reason, it might leave the database 4735 ** in a corrupted state. So if this routine fails, the database should 4736 ** be rolled back. 4737 */ 4738 static int balance_nonroot(MemPage *pPage){ 4739 MemPage *pParent; /* The parent of pPage */ 4740 BtShared *pBt; /* The whole database */ 4741 int nCell = 0; /* Number of cells in apCell[] */ 4742 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 4743 int nOld; /* Number of pages in apOld[] */ 4744 int nNew; /* Number of pages in apNew[] */ 4745 int nDiv; /* Number of cells in apDiv[] */ 4746 int i, j, k; /* Loop counters */ 4747 int idx; /* Index of pPage in pParent->aCell[] */ 4748 int nxDiv; /* Next divider slot in pParent->aCell[] */ 4749 int rc; /* The return code */ 4750 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */ 4751 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 4752 int usableSpace; /* Bytes in pPage beyond the header */ 4753 int pageFlags; /* Value of pPage->aData[0] */ 4754 int subtotal; /* Subtotal of bytes in cells on one page */ 4755 int iSpace = 0; /* First unused byte of aSpace[] */ 4756 MemPage *apOld[NB]; /* pPage and up to two siblings */ 4757 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */ 4758 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ 4759 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 4760 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */ 4761 u8 *apDiv[NB]; /* Divider cells in pParent */ 4762 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ 4763 int szNew[NB+2]; /* Combined size of cells place on i-th page */ 4764 u8 **apCell = 0; /* All cells begin balanced */ 4765 int *szCell; /* Local size of all cells in apCell[] */ 4766 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */ 4767 u8 *aSpace; /* Space to hold copies of dividers cells */ 4768 #ifndef SQLITE_OMIT_AUTOVACUUM 4769 u8 *aFrom = 0; 4770 #endif 4771 4772 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4773 4774 /* 4775 ** Find the parent page. 4776 */ 4777 assert( pPage->isInit ); 4778 assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 ); 4779 pBt = pPage->pBt; 4780 pParent = pPage->pParent; 4781 assert( pParent ); 4782 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){ 4783 return rc; 4784 } 4785 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); 4786 4787 #ifndef SQLITE_OMIT_QUICKBALANCE 4788 /* 4789 ** A special case: If a new entry has just been inserted into a 4790 ** table (that is, a btree with integer keys and all data at the leaves) 4791 ** and the new entry is the right-most entry in the tree (it has the 4792 ** largest key) then use the special balance_quick() routine for 4793 ** balancing. balance_quick() is much faster and results in a tighter 4794 ** packing of data in the common case. 4795 */ 4796 if( pPage->leaf && 4797 pPage->intKey && 4798 pPage->leafData && 4799 pPage->nOverflow==1 && 4800 pPage->aOvfl[0].idx==pPage->nCell && 4801 pPage->pParent->pgno!=1 && 4802 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno 4803 ){ 4804 /* 4805 ** TODO: Check the siblings to the left of pPage. It may be that 4806 ** they are not full and no new page is required. 4807 */ 4808 return balance_quick(pPage, pParent); 4809 } 4810 #endif 4811 4812 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){ 4813 return rc; 4814 } 4815 4816 /* 4817 ** Find the cell in the parent page whose left child points back 4818 ** to pPage. The "idx" variable is the index of that cell. If pPage 4819 ** is the rightmost child of pParent then set idx to pParent->nCell 4820 */ 4821 if( pParent->idxShift ){ 4822 Pgno pgno; 4823 pgno = pPage->pgno; 4824 assert( pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 4825 for(idx=0; idx<pParent->nCell; idx++){ 4826 if( get4byte(findCell(pParent, idx))==pgno ){ 4827 break; 4828 } 4829 } 4830 assert( idx<pParent->nCell 4831 || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno ); 4832 }else{ 4833 idx = pPage->idxParent; 4834 } 4835 4836 /* 4837 ** Initialize variables so that it will be safe to jump 4838 ** directly to balance_cleanup at any moment. 4839 */ 4840 nOld = nNew = 0; 4841 sqlite3PagerRef(pParent->pDbPage); 4842 4843 /* 4844 ** Find sibling pages to pPage and the cells in pParent that divide 4845 ** the siblings. An attempt is made to find NN siblings on either 4846 ** side of pPage. More siblings are taken from one side, however, if 4847 ** pPage there are fewer than NN siblings on the other side. If pParent 4848 ** has NB or fewer children then all children of pParent are taken. 4849 */ 4850 nxDiv = idx - NN; 4851 if( nxDiv + NB > pParent->nCell ){ 4852 nxDiv = pParent->nCell - NB + 1; 4853 } 4854 if( nxDiv<0 ){ 4855 nxDiv = 0; 4856 } 4857 nDiv = 0; 4858 for(i=0, k=nxDiv; i<NB; i++, k++){ 4859 if( k<pParent->nCell ){ 4860 apDiv[i] = findCell(pParent, k); 4861 nDiv++; 4862 assert( !pParent->leaf ); 4863 pgnoOld[i] = get4byte(apDiv[i]); 4864 }else if( k==pParent->nCell ){ 4865 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]); 4866 }else{ 4867 break; 4868 } 4869 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent); 4870 if( rc ) goto balance_cleanup; 4871 apOld[i]->idxParent = k; 4872 apCopy[i] = 0; 4873 assert( i==nOld ); 4874 nOld++; 4875 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; 4876 } 4877 4878 /* Make nMaxCells a multiple of 2 in order to preserve 8-byte 4879 ** alignment */ 4880 nMaxCells = (nMaxCells + 1)&~1; 4881 4882 /* 4883 ** Allocate space for memory structures 4884 */ 4885 apCell = sqlite3_malloc( 4886 nMaxCells*sizeof(u8*) /* apCell */ 4887 + nMaxCells*sizeof(int) /* szCell */ 4888 + ROUND8(sizeof(MemPage))*NB /* aCopy */ 4889 + pBt->pageSize*(5+NB) /* aSpace */ 4890 + (ISAUTOVACUUM ? nMaxCells : 0) /* aFrom */ 4891 ); 4892 if( apCell==0 ){ 4893 rc = SQLITE_NOMEM; 4894 goto balance_cleanup; 4895 } 4896 szCell = (int*)&apCell[nMaxCells]; 4897 aCopy[0] = (u8*)&szCell[nMaxCells]; 4898 assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */ 4899 for(i=1; i<NB; i++){ 4900 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))]; 4901 assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */ 4902 } 4903 aSpace = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))]; 4904 assert( ((aSpace - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */ 4905 #ifndef SQLITE_OMIT_AUTOVACUUM 4906 if( pBt->autoVacuum ){ 4907 aFrom = &aSpace[5*pBt->pageSize]; 4908 } 4909 #endif 4910 4911 /* 4912 ** Make copies of the content of pPage and its siblings into aOld[]. 4913 ** The rest of this function will use data from the copies rather 4914 ** that the original pages since the original pages will be in the 4915 ** process of being overwritten. 4916 */ 4917 for(i=0; i<nOld; i++){ 4918 MemPage *p = apCopy[i] = (MemPage*)aCopy[i]; 4919 memcpy(p, apOld[i], sizeof(MemPage)); 4920 p->aData = (void*)&p[1]; 4921 memcpy(p->aData, apOld[i]->aData, pBt->pageSize); 4922 } 4923 4924 /* 4925 ** Load pointers to all cells on sibling pages and the divider cells 4926 ** into the local apCell[] array. Make copies of the divider cells 4927 ** into space obtained form aSpace[] and remove the the divider Cells 4928 ** from pParent. 4929 ** 4930 ** If the siblings are on leaf pages, then the child pointers of the 4931 ** divider cells are stripped from the cells before they are copied 4932 ** into aSpace[]. In this way, all cells in apCell[] are without 4933 ** child pointers. If siblings are not leaves, then all cell in 4934 ** apCell[] include child pointers. Either way, all cells in apCell[] 4935 ** are alike. 4936 ** 4937 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 4938 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 4939 */ 4940 nCell = 0; 4941 leafCorrection = pPage->leaf*4; 4942 leafData = pPage->leafData && pPage->leaf; 4943 for(i=0; i<nOld; i++){ 4944 MemPage *pOld = apCopy[i]; 4945 int limit = pOld->nCell+pOld->nOverflow; 4946 for(j=0; j<limit; j++){ 4947 assert( nCell<nMaxCells ); 4948 apCell[nCell] = findOverflowCell(pOld, j); 4949 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); 4950 #ifndef SQLITE_OMIT_AUTOVACUUM 4951 if( pBt->autoVacuum ){ 4952 int a; 4953 aFrom[nCell] = i; 4954 for(a=0; a<pOld->nOverflow; a++){ 4955 if( pOld->aOvfl[a].pCell==apCell[nCell] ){ 4956 aFrom[nCell] = 0xFF; 4957 break; 4958 } 4959 } 4960 } 4961 #endif 4962 nCell++; 4963 } 4964 if( i<nOld-1 ){ 4965 int sz = cellSizePtr(pParent, apDiv[i]); 4966 if( leafData ){ 4967 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that 4968 ** are duplicates of keys on the child pages. We need to remove 4969 ** the divider cells from pParent, but the dividers cells are not 4970 ** added to apCell[] because they are duplicates of child cells. 4971 */ 4972 dropCell(pParent, nxDiv, sz); 4973 }else{ 4974 u8 *pTemp; 4975 assert( nCell<nMaxCells ); 4976 szCell[nCell] = sz; 4977 pTemp = &aSpace[iSpace]; 4978 iSpace += sz; 4979 assert( iSpace<=pBt->pageSize*5 ); 4980 memcpy(pTemp, apDiv[i], sz); 4981 apCell[nCell] = pTemp+leafCorrection; 4982 #ifndef SQLITE_OMIT_AUTOVACUUM 4983 if( pBt->autoVacuum ){ 4984 aFrom[nCell] = 0xFF; 4985 } 4986 #endif 4987 dropCell(pParent, nxDiv, sz); 4988 szCell[nCell] -= leafCorrection; 4989 assert( get4byte(pTemp)==pgnoOld[i] ); 4990 if( !pOld->leaf ){ 4991 assert( leafCorrection==0 ); 4992 /* The right pointer of the child page pOld becomes the left 4993 ** pointer of the divider cell */ 4994 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4); 4995 }else{ 4996 assert( leafCorrection==4 ); 4997 if( szCell[nCell]<4 ){ 4998 /* Do not allow any cells smaller than 4 bytes. */ 4999 szCell[nCell] = 4; 5000 } 5001 } 5002 nCell++; 5003 } 5004 } 5005 } 5006 5007 /* 5008 ** Figure out the number of pages needed to hold all nCell cells. 5009 ** Store this number in "k". Also compute szNew[] which is the total 5010 ** size of all cells on the i-th page and cntNew[] which is the index 5011 ** in apCell[] of the cell that divides page i from page i+1. 5012 ** cntNew[k] should equal nCell. 5013 ** 5014 ** Values computed by this block: 5015 ** 5016 ** k: The total number of sibling pages 5017 ** szNew[i]: Spaced used on the i-th sibling page. 5018 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to 5019 ** the right of the i-th sibling page. 5020 ** usableSpace: Number of bytes of space available on each sibling. 5021 ** 5022 */ 5023 usableSpace = pBt->usableSize - 12 + leafCorrection; 5024 for(subtotal=k=i=0; i<nCell; i++){ 5025 assert( i<nMaxCells ); 5026 subtotal += szCell[i] + 2; 5027 if( subtotal > usableSpace ){ 5028 szNew[k] = subtotal - szCell[i]; 5029 cntNew[k] = i; 5030 if( leafData ){ i--; } 5031 subtotal = 0; 5032 k++; 5033 } 5034 } 5035 szNew[k] = subtotal; 5036 cntNew[k] = nCell; 5037 k++; 5038 5039 /* 5040 ** The packing computed by the previous block is biased toward the siblings 5041 ** on the left side. The left siblings are always nearly full, while the 5042 ** right-most sibling might be nearly empty. This block of code attempts 5043 ** to adjust the packing of siblings to get a better balance. 5044 ** 5045 ** This adjustment is more than an optimization. The packing above might 5046 ** be so out of balance as to be illegal. For example, the right-most 5047 ** sibling might be completely empty. This adjustment is not optional. 5048 */ 5049 for(i=k-1; i>0; i--){ 5050 int szRight = szNew[i]; /* Size of sibling on the right */ 5051 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 5052 int r; /* Index of right-most cell in left sibling */ 5053 int d; /* Index of first cell to the left of right sibling */ 5054 5055 r = cntNew[i-1] - 1; 5056 d = r + 1 - leafData; 5057 assert( d<nMaxCells ); 5058 assert( r<nMaxCells ); 5059 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){ 5060 szRight += szCell[d] + 2; 5061 szLeft -= szCell[r] + 2; 5062 cntNew[i-1]--; 5063 r = cntNew[i-1] - 1; 5064 d = r + 1 - leafData; 5065 } 5066 szNew[i] = szRight; 5067 szNew[i-1] = szLeft; 5068 } 5069 5070 /* Either we found one or more cells (cntnew[0])>0) or we are the 5071 ** a virtual root page. A virtual root page is when the real root 5072 ** page is page 1 and we are the only child of that page. 5073 */ 5074 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); 5075 5076 /* 5077 ** Allocate k new pages. Reuse old pages where possible. 5078 */ 5079 assert( pPage->pgno>1 ); 5080 pageFlags = pPage->aData[0]; 5081 for(i=0; i<k; i++){ 5082 MemPage *pNew; 5083 if( i<nOld ){ 5084 pNew = apNew[i] = apOld[i]; 5085 pgnoNew[i] = pgnoOld[i]; 5086 apOld[i] = 0; 5087 rc = sqlite3PagerWrite(pNew->pDbPage); 5088 nNew++; 5089 if( rc ) goto balance_cleanup; 5090 }else{ 5091 assert( i>0 ); 5092 rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0); 5093 if( rc ) goto balance_cleanup; 5094 apNew[i] = pNew; 5095 nNew++; 5096 } 5097 zeroPage(pNew, pageFlags); 5098 } 5099 5100 /* Free any old pages that were not reused as new pages. 5101 */ 5102 while( i<nOld ){ 5103 rc = freePage(apOld[i]); 5104 if( rc ) goto balance_cleanup; 5105 releasePage(apOld[i]); 5106 apOld[i] = 0; 5107 i++; 5108 } 5109 5110 /* 5111 ** Put the new pages in accending order. This helps to 5112 ** keep entries in the disk file in order so that a scan 5113 ** of the table is a linear scan through the file. That 5114 ** in turn helps the operating system to deliver pages 5115 ** from the disk more rapidly. 5116 ** 5117 ** An O(n^2) insertion sort algorithm is used, but since 5118 ** n is never more than NB (a small constant), that should 5119 ** not be a problem. 5120 ** 5121 ** When NB==3, this one optimization makes the database 5122 ** about 25% faster for large insertions and deletions. 5123 */ 5124 for(i=0; i<k-1; i++){ 5125 int minV = pgnoNew[i]; 5126 int minI = i; 5127 for(j=i+1; j<k; j++){ 5128 if( pgnoNew[j]<(unsigned)minV ){ 5129 minI = j; 5130 minV = pgnoNew[j]; 5131 } 5132 } 5133 if( minI>i ){ 5134 int t; 5135 MemPage *pT; 5136 t = pgnoNew[i]; 5137 pT = apNew[i]; 5138 pgnoNew[i] = pgnoNew[minI]; 5139 apNew[i] = apNew[minI]; 5140 pgnoNew[minI] = t; 5141 apNew[minI] = pT; 5142 } 5143 } 5144 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", 5145 pgnoOld[0], 5146 nOld>=2 ? pgnoOld[1] : 0, 5147 nOld>=3 ? pgnoOld[2] : 0, 5148 pgnoNew[0], szNew[0], 5149 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0, 5150 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0, 5151 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0, 5152 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0)); 5153 5154 /* 5155 ** Evenly distribute the data in apCell[] across the new pages. 5156 ** Insert divider cells into pParent as necessary. 5157 */ 5158 j = 0; 5159 for(i=0; i<nNew; i++){ 5160 /* Assemble the new sibling page. */ 5161 MemPage *pNew = apNew[i]; 5162 assert( j<nMaxCells ); 5163 assert( pNew->pgno==pgnoNew[i] ); 5164 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); 5165 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); 5166 assert( pNew->nOverflow==0 ); 5167 5168 #ifndef SQLITE_OMIT_AUTOVACUUM 5169 /* If this is an auto-vacuum database, update the pointer map entries 5170 ** that point to the siblings that were rearranged. These can be: left 5171 ** children of cells, the right-child of the page, or overflow pages 5172 ** pointed to by cells. 5173 */ 5174 if( pBt->autoVacuum ){ 5175 for(k=j; k<cntNew[i]; k++){ 5176 assert( k<nMaxCells ); 5177 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){ 5178 rc = ptrmapPutOvfl(pNew, k-j); 5179 if( rc!=SQLITE_OK ){ 5180 goto balance_cleanup; 5181 } 5182 } 5183 } 5184 } 5185 #endif 5186 5187 j = cntNew[i]; 5188 5189 /* If the sibling page assembled above was not the right-most sibling, 5190 ** insert a divider cell into the parent page. 5191 */ 5192 if( i<nNew-1 && j<nCell ){ 5193 u8 *pCell; 5194 u8 *pTemp; 5195 int sz; 5196 5197 assert( j<nMaxCells ); 5198 pCell = apCell[j]; 5199 sz = szCell[j] + leafCorrection; 5200 if( !pNew->leaf ){ 5201 memcpy(&pNew->aData[8], pCell, 4); 5202 pTemp = 0; 5203 }else if( leafData ){ 5204 /* If the tree is a leaf-data tree, and the siblings are leaves, 5205 ** then there is no divider cell in apCell[]. Instead, the divider 5206 ** cell consists of the integer key for the right-most cell of 5207 ** the sibling-page assembled above only. 5208 */ 5209 CellInfo info; 5210 j--; 5211 sqlite3BtreeParseCellPtr(pNew, apCell[j], &info); 5212 pCell = &aSpace[iSpace]; 5213 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz); 5214 iSpace += sz; 5215 assert( iSpace<=pBt->pageSize*5 ); 5216 pTemp = 0; 5217 }else{ 5218 pCell -= 4; 5219 pTemp = &aSpace[iSpace]; 5220 iSpace += sz; 5221 assert( iSpace<=pBt->pageSize*5 ); 5222 /* Obscure case for non-leaf-data trees: If the cell at pCell was 5223 ** previously stored on a leaf node, and its reported size was 4 5224 ** bytes, then it may actually be smaller than this 5225 ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of 5226 ** any cell). But it is important to pass the correct size to 5227 ** insertCell(), so reparse the cell now. 5228 ** 5229 ** Note that this can never happen in an SQLite data file, as all 5230 ** cells are at least 4 bytes. It only happens in b-trees used 5231 ** to evaluate "IN (SELECT ...)" and similar clauses. 5232 */ 5233 if( szCell[j]==4 ){ 5234 assert(leafCorrection==4); 5235 sz = cellSizePtr(pParent, pCell); 5236 } 5237 } 5238 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4); 5239 if( rc!=SQLITE_OK ) goto balance_cleanup; 5240 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno); 5241 #ifndef SQLITE_OMIT_AUTOVACUUM 5242 /* If this is an auto-vacuum database, and not a leaf-data tree, 5243 ** then update the pointer map with an entry for the overflow page 5244 ** that the cell just inserted points to (if any). 5245 */ 5246 if( pBt->autoVacuum && !leafData ){ 5247 rc = ptrmapPutOvfl(pParent, nxDiv); 5248 if( rc!=SQLITE_OK ){ 5249 goto balance_cleanup; 5250 } 5251 } 5252 #endif 5253 j++; 5254 nxDiv++; 5255 } 5256 } 5257 assert( j==nCell ); 5258 assert( nOld>0 ); 5259 assert( nNew>0 ); 5260 if( (pageFlags & PTF_LEAF)==0 ){ 5261 memcpy(&apNew[nNew-1]->aData[8], &apCopy[nOld-1]->aData[8], 4); 5262 } 5263 if( nxDiv==pParent->nCell+pParent->nOverflow ){ 5264 /* Right-most sibling is the right-most child of pParent */ 5265 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]); 5266 }else{ 5267 /* Right-most sibling is the left child of the first entry in pParent 5268 ** past the right-most divider entry */ 5269 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]); 5270 } 5271 5272 /* 5273 ** Reparent children of all cells. 5274 */ 5275 for(i=0; i<nNew; i++){ 5276 rc = reparentChildPages(apNew[i]); 5277 if( rc!=SQLITE_OK ) goto balance_cleanup; 5278 } 5279 rc = reparentChildPages(pParent); 5280 if( rc!=SQLITE_OK ) goto balance_cleanup; 5281 5282 /* 5283 ** Balance the parent page. Note that the current page (pPage) might 5284 ** have been added to the freelist so it might no longer be initialized. 5285 ** But the parent page will always be initialized. 5286 */ 5287 assert( pParent->isInit ); 5288 rc = balance(pParent, 0); 5289 5290 /* 5291 ** Cleanup before returning. 5292 */ 5293 balance_cleanup: 5294 sqlite3_free(apCell); 5295 for(i=0; i<nOld; i++){ 5296 releasePage(apOld[i]); 5297 } 5298 for(i=0; i<nNew; i++){ 5299 releasePage(apNew[i]); 5300 } 5301 releasePage(pParent); 5302 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n", 5303 pPage->pgno, nOld, nNew, nCell)); 5304 return rc; 5305 } 5306 5307 /* 5308 ** This routine is called for the root page of a btree when the root 5309 ** page contains no cells. This is an opportunity to make the tree 5310 ** shallower by one level. 5311 */ 5312 static int balance_shallower(MemPage *pPage){ 5313 MemPage *pChild; /* The only child page of pPage */ 5314 Pgno pgnoChild; /* Page number for pChild */ 5315 int rc = SQLITE_OK; /* Return code from subprocedures */ 5316 BtShared *pBt; /* The main BTree structure */ 5317 int mxCellPerPage; /* Maximum number of cells per page */ 5318 u8 **apCell; /* All cells from pages being balanced */ 5319 int *szCell; /* Local size of all cells */ 5320 5321 assert( pPage->pParent==0 ); 5322 assert( pPage->nCell==0 ); 5323 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5324 pBt = pPage->pBt; 5325 mxCellPerPage = MX_CELL(pBt); 5326 apCell = sqlite3_malloc( mxCellPerPage*(sizeof(u8*)+sizeof(int)) ); 5327 if( apCell==0 ) return SQLITE_NOMEM; 5328 szCell = (int*)&apCell[mxCellPerPage]; 5329 if( pPage->leaf ){ 5330 /* The table is completely empty */ 5331 TRACE(("BALANCE: empty table %d\n", pPage->pgno)); 5332 }else{ 5333 /* The root page is empty but has one child. Transfer the 5334 ** information from that one child into the root page if it 5335 ** will fit. This reduces the depth of the tree by one. 5336 ** 5337 ** If the root page is page 1, it has less space available than 5338 ** its child (due to the 100 byte header that occurs at the beginning 5339 ** of the database fle), so it might not be able to hold all of the 5340 ** information currently contained in the child. If this is the 5341 ** case, then do not do the transfer. Leave page 1 empty except 5342 ** for the right-pointer to the child page. The child page becomes 5343 ** the virtual root of the tree. 5344 */ 5345 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]); 5346 assert( pgnoChild>0 ); 5347 assert( pgnoChild<=sqlite3PagerPagecount(pPage->pBt->pPager) ); 5348 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0); 5349 if( rc ) goto end_shallow_balance; 5350 if( pPage->pgno==1 ){ 5351 rc = sqlite3BtreeInitPage(pChild, pPage); 5352 if( rc ) goto end_shallow_balance; 5353 assert( pChild->nOverflow==0 ); 5354 if( pChild->nFree>=100 ){ 5355 /* The child information will fit on the root page, so do the 5356 ** copy */ 5357 int i; 5358 zeroPage(pPage, pChild->aData[0]); 5359 for(i=0; i<pChild->nCell; i++){ 5360 apCell[i] = findCell(pChild,i); 5361 szCell[i] = cellSizePtr(pChild, apCell[i]); 5362 } 5363 assemblePage(pPage, pChild->nCell, apCell, szCell); 5364 /* Copy the right-pointer of the child to the parent. */ 5365 put4byte(&pPage->aData[pPage->hdrOffset+8], 5366 get4byte(&pChild->aData[pChild->hdrOffset+8])); 5367 freePage(pChild); 5368 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno)); 5369 }else{ 5370 /* The child has more information that will fit on the root. 5371 ** The tree is already balanced. Do nothing. */ 5372 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno)); 5373 } 5374 }else{ 5375 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize); 5376 pPage->isInit = 0; 5377 pPage->pParent = 0; 5378 rc = sqlite3BtreeInitPage(pPage, 0); 5379 assert( rc==SQLITE_OK ); 5380 freePage(pChild); 5381 TRACE(("BALANCE: transfer child %d into root %d\n", 5382 pChild->pgno, pPage->pgno)); 5383 } 5384 rc = reparentChildPages(pPage); 5385 assert( pPage->nOverflow==0 ); 5386 #ifndef SQLITE_OMIT_AUTOVACUUM 5387 if( pBt->autoVacuum ){ 5388 int i; 5389 for(i=0; i<pPage->nCell; i++){ 5390 rc = ptrmapPutOvfl(pPage, i); 5391 if( rc!=SQLITE_OK ){ 5392 goto end_shallow_balance; 5393 } 5394 } 5395 } 5396 #endif 5397 releasePage(pChild); 5398 } 5399 end_shallow_balance: 5400 sqlite3_free(apCell); 5401 return rc; 5402 } 5403 5404 5405 /* 5406 ** The root page is overfull 5407 ** 5408 ** When this happens, Create a new child page and copy the 5409 ** contents of the root into the child. Then make the root 5410 ** page an empty page with rightChild pointing to the new 5411 ** child. Finally, call balance_internal() on the new child 5412 ** to cause it to split. 5413 */ 5414 static int balance_deeper(MemPage *pPage){ 5415 int rc; /* Return value from subprocedures */ 5416 MemPage *pChild; /* Pointer to a new child page */ 5417 Pgno pgnoChild; /* Page number of the new child page */ 5418 BtShared *pBt; /* The BTree */ 5419 int usableSize; /* Total usable size of a page */ 5420 u8 *data; /* Content of the parent page */ 5421 u8 *cdata; /* Content of the child page */ 5422 int hdr; /* Offset to page header in parent */ 5423 int brk; /* Offset to content of first cell in parent */ 5424 5425 assert( pPage->pParent==0 ); 5426 assert( pPage->nOverflow>0 ); 5427 pBt = pPage->pBt; 5428 assert( sqlite3_mutex_held(pBt->mutex) ); 5429 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0); 5430 if( rc ) return rc; 5431 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 5432 usableSize = pBt->usableSize; 5433 data = pPage->aData; 5434 hdr = pPage->hdrOffset; 5435 brk = get2byte(&data[hdr+5]); 5436 cdata = pChild->aData; 5437 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr); 5438 memcpy(&cdata[brk], &data[brk], usableSize-brk); 5439 assert( pChild->isInit==0 ); 5440 rc = sqlite3BtreeInitPage(pChild, pPage); 5441 if( rc ) goto balancedeeper_out; 5442 memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0])); 5443 pChild->nOverflow = pPage->nOverflow; 5444 if( pChild->nOverflow ){ 5445 pChild->nFree = 0; 5446 } 5447 assert( pChild->nCell==pPage->nCell ); 5448 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF); 5449 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild); 5450 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno)); 5451 #ifndef SQLITE_OMIT_AUTOVACUUM 5452 if( pBt->autoVacuum ){ 5453 int i; 5454 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno); 5455 if( rc ) goto balancedeeper_out; 5456 for(i=0; i<pChild->nCell; i++){ 5457 rc = ptrmapPutOvfl(pChild, i); 5458 if( rc!=SQLITE_OK ){ 5459 return rc; 5460 } 5461 } 5462 } 5463 #endif 5464 rc = balance_nonroot(pChild); 5465 5466 balancedeeper_out: 5467 releasePage(pChild); 5468 return rc; 5469 } 5470 5471 /* 5472 ** Decide if the page pPage needs to be balanced. If balancing is 5473 ** required, call the appropriate balancing routine. 5474 */ 5475 static int balance(MemPage *pPage, int insert){ 5476 int rc = SQLITE_OK; 5477 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5478 if( pPage->pParent==0 ){ 5479 rc = sqlite3PagerWrite(pPage->pDbPage); 5480 if( rc==SQLITE_OK && pPage->nOverflow>0 ){ 5481 rc = balance_deeper(pPage); 5482 } 5483 if( rc==SQLITE_OK && pPage->nCell==0 ){ 5484 rc = balance_shallower(pPage); 5485 } 5486 }else{ 5487 if( pPage->nOverflow>0 || 5488 (!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){ 5489 rc = balance_nonroot(pPage); 5490 } 5491 } 5492 return rc; 5493 } 5494 5495 /* 5496 ** This routine checks all cursors that point to table pgnoRoot. 5497 ** If any of those cursors were opened with wrFlag==0 in a different 5498 ** database connection (a database connection that shares the pager 5499 ** cache with the current connection) and that other connection 5500 ** is not in the ReadUncommmitted state, then this routine returns 5501 ** SQLITE_LOCKED. 5502 ** 5503 ** In addition to checking for read-locks (where a read-lock 5504 ** means a cursor opened with wrFlag==0) this routine also moves 5505 ** all write cursors so that they are pointing to the 5506 ** first Cell on the root page. This is necessary because an insert 5507 ** or delete might change the number of cells on a page or delete 5508 ** a page entirely and we do not want to leave any cursors 5509 ** pointing to non-existant pages or cells. 5510 */ 5511 static int checkReadLocks(Btree *pBtree, Pgno pgnoRoot, BtCursor *pExclude){ 5512 BtCursor *p; 5513 BtShared *pBt = pBtree->pBt; 5514 sqlite3 *db = pBtree->db; 5515 assert( sqlite3BtreeHoldsMutex(pBtree) ); 5516 for(p=pBt->pCursor; p; p=p->pNext){ 5517 if( p==pExclude ) continue; 5518 if( p->eState!=CURSOR_VALID ) continue; 5519 if( p->pgnoRoot!=pgnoRoot ) continue; 5520 if( p->wrFlag==0 ){ 5521 sqlite3 *dbOther = p->pBtree->db; 5522 if( dbOther==0 || 5523 (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){ 5524 return SQLITE_LOCKED; 5525 } 5526 }else if( p->pPage->pgno!=p->pgnoRoot ){ 5527 moveToRoot(p); 5528 } 5529 } 5530 return SQLITE_OK; 5531 } 5532 5533 /* 5534 ** Insert a new record into the BTree. The key is given by (pKey,nKey) 5535 ** and the data is given by (pData,nData). The cursor is used only to 5536 ** define what table the record should be inserted into. The cursor 5537 ** is left pointing at a random location. 5538 ** 5539 ** For an INTKEY table, only the nKey value of the key is used. pKey is 5540 ** ignored. For a ZERODATA table, the pData and nData are both ignored. 5541 */ 5542 int sqlite3BtreeInsert( 5543 BtCursor *pCur, /* Insert data into the table of this cursor */ 5544 const void *pKey, i64 nKey, /* The key of the new record */ 5545 const void *pData, int nData, /* The data of the new record */ 5546 int nZero, /* Number of extra 0 bytes to append to data */ 5547 int appendBias /* True if this is likely an append */ 5548 ){ 5549 int rc; 5550 int loc; 5551 int szNew; 5552 MemPage *pPage; 5553 Btree *p = pCur->pBtree; 5554 BtShared *pBt = p->pBt; 5555 unsigned char *oldCell; 5556 unsigned char *newCell = 0; 5557 5558 assert( cursorHoldsMutex(pCur) ); 5559 if( pBt->inTransaction!=TRANS_WRITE ){ 5560 /* Must start a transaction before doing an insert */ 5561 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 5562 return rc; 5563 } 5564 assert( !pBt->readOnly ); 5565 if( !pCur->wrFlag ){ 5566 return SQLITE_PERM; /* Cursor not open for writing */ 5567 } 5568 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){ 5569 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 5570 } 5571 if( pCur->eState==CURSOR_FAULT ){ 5572 return pCur->skip; 5573 } 5574 5575 /* Save the positions of any other cursors open on this table */ 5576 clearCursorPosition(pCur); 5577 if( 5578 SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) || 5579 SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc)) 5580 ){ 5581 return rc; 5582 } 5583 5584 pPage = pCur->pPage; 5585 assert( pPage->intKey || nKey>=0 ); 5586 assert( pPage->leaf || !pPage->leafData ); 5587 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", 5588 pCur->pgnoRoot, nKey, nData, pPage->pgno, 5589 loc==0 ? "overwrite" : "new entry")); 5590 assert( pPage->isInit ); 5591 newCell = sqlite3_malloc( MX_CELL_SIZE(pBt) ); 5592 if( newCell==0 ) return SQLITE_NOMEM; 5593 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); 5594 if( rc ) goto end_insert; 5595 assert( szNew==cellSizePtr(pPage, newCell) ); 5596 assert( szNew<=MX_CELL_SIZE(pBt) ); 5597 if( loc==0 && CURSOR_VALID==pCur->eState ){ 5598 int szOld; 5599 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 5600 rc = sqlite3PagerWrite(pPage->pDbPage); 5601 if( rc ){ 5602 goto end_insert; 5603 } 5604 oldCell = findCell(pPage, pCur->idx); 5605 if( !pPage->leaf ){ 5606 memcpy(newCell, oldCell, 4); 5607 } 5608 szOld = cellSizePtr(pPage, oldCell); 5609 rc = clearCell(pPage, oldCell); 5610 if( rc ) goto end_insert; 5611 dropCell(pPage, pCur->idx, szOld); 5612 }else if( loc<0 && pPage->nCell>0 ){ 5613 assert( pPage->leaf ); 5614 pCur->idx++; 5615 pCur->info.nSize = 0; 5616 }else{ 5617 assert( pPage->leaf ); 5618 } 5619 rc = insertCell(pPage, pCur->idx, newCell, szNew, 0, 0); 5620 if( rc!=SQLITE_OK ) goto end_insert; 5621 rc = balance(pPage, 1); 5622 /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */ 5623 /* fflush(stdout); */ 5624 if( rc==SQLITE_OK ){ 5625 moveToRoot(pCur); 5626 } 5627 end_insert: 5628 sqlite3_free(newCell); 5629 return rc; 5630 } 5631 5632 /* 5633 ** Delete the entry that the cursor is pointing to. The cursor 5634 ** is left pointing at a random location. 5635 */ 5636 int sqlite3BtreeDelete(BtCursor *pCur){ 5637 MemPage *pPage = pCur->pPage; 5638 unsigned char *pCell; 5639 int rc; 5640 Pgno pgnoChild = 0; 5641 Btree *p = pCur->pBtree; 5642 BtShared *pBt = p->pBt; 5643 5644 assert( cursorHoldsMutex(pCur) ); 5645 assert( pPage->isInit ); 5646 if( pBt->inTransaction!=TRANS_WRITE ){ 5647 /* Must start a transaction before doing a delete */ 5648 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 5649 return rc; 5650 } 5651 assert( !pBt->readOnly ); 5652 if( pCur->eState==CURSOR_FAULT ){ 5653 return pCur->skip; 5654 } 5655 if( pCur->idx >= pPage->nCell ){ 5656 return SQLITE_ERROR; /* The cursor is not pointing to anything */ 5657 } 5658 if( !pCur->wrFlag ){ 5659 return SQLITE_PERM; /* Did not open this cursor for writing */ 5660 } 5661 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){ 5662 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 5663 } 5664 5665 /* Restore the current cursor position (a no-op if the cursor is not in 5666 ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors 5667 ** open on the same table. Then call sqlite3PagerWrite() on the page 5668 ** that the entry will be deleted from. 5669 */ 5670 if( 5671 (rc = restoreOrClearCursorPosition(pCur))!=0 || 5672 (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 || 5673 (rc = sqlite3PagerWrite(pPage->pDbPage))!=0 5674 ){ 5675 return rc; 5676 } 5677 5678 /* Locate the cell within its page and leave pCell pointing to the 5679 ** data. The clearCell() call frees any overflow pages associated with the 5680 ** cell. The cell itself is still intact. 5681 */ 5682 pCell = findCell(pPage, pCur->idx); 5683 if( !pPage->leaf ){ 5684 pgnoChild = get4byte(pCell); 5685 } 5686 rc = clearCell(pPage, pCell); 5687 if( rc ){ 5688 return rc; 5689 } 5690 5691 if( !pPage->leaf ){ 5692 /* 5693 ** The entry we are about to delete is not a leaf so if we do not 5694 ** do something we will leave a hole on an internal page. 5695 ** We have to fill the hole by moving in a cell from a leaf. The 5696 ** next Cell after the one to be deleted is guaranteed to exist and 5697 ** to be a leaf so we can use it. 5698 */ 5699 BtCursor leafCur; 5700 unsigned char *pNext; 5701 int szNext; /* The compiler warning is wrong: szNext is always 5702 ** initialized before use. Adding an extra initialization 5703 ** to silence the compiler slows down the code. */ 5704 int notUsed; 5705 unsigned char *tempCell = 0; 5706 assert( !pPage->leafData ); 5707 sqlite3BtreeGetTempCursor(pCur, &leafCur); 5708 rc = sqlite3BtreeNext(&leafCur, ¬Used); 5709 if( rc==SQLITE_OK ){ 5710 rc = sqlite3PagerWrite(leafCur.pPage->pDbPage); 5711 } 5712 if( rc==SQLITE_OK ){ 5713 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n", 5714 pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno)); 5715 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell)); 5716 pNext = findCell(leafCur.pPage, leafCur.idx); 5717 szNext = cellSizePtr(leafCur.pPage, pNext); 5718 assert( MX_CELL_SIZE(pBt)>=szNext+4 ); 5719 tempCell = sqlite3_malloc( MX_CELL_SIZE(pBt) ); 5720 if( tempCell==0 ){ 5721 rc = SQLITE_NOMEM; 5722 } 5723 } 5724 if( rc==SQLITE_OK ){ 5725 rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0); 5726 } 5727 if( rc==SQLITE_OK ){ 5728 put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild); 5729 rc = balance(pPage, 0); 5730 } 5731 if( rc==SQLITE_OK ){ 5732 dropCell(leafCur.pPage, leafCur.idx, szNext); 5733 rc = balance(leafCur.pPage, 0); 5734 } 5735 sqlite3_free(tempCell); 5736 sqlite3BtreeReleaseTempCursor(&leafCur); 5737 }else{ 5738 TRACE(("DELETE: table=%d delete from leaf %d\n", 5739 pCur->pgnoRoot, pPage->pgno)); 5740 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell)); 5741 rc = balance(pPage, 0); 5742 } 5743 if( rc==SQLITE_OK ){ 5744 moveToRoot(pCur); 5745 } 5746 return rc; 5747 } 5748 5749 /* 5750 ** Create a new BTree table. Write into *piTable the page 5751 ** number for the root page of the new table. 5752 ** 5753 ** The type of type is determined by the flags parameter. Only the 5754 ** following values of flags are currently in use. Other values for 5755 ** flags might not work: 5756 ** 5757 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 5758 ** BTREE_ZERODATA Used for SQL indices 5759 */ 5760 static int btreeCreateTable(Btree *p, int *piTable, int flags){ 5761 BtShared *pBt = p->pBt; 5762 MemPage *pRoot; 5763 Pgno pgnoRoot; 5764 int rc; 5765 5766 assert( sqlite3BtreeHoldsMutex(p) ); 5767 if( pBt->inTransaction!=TRANS_WRITE ){ 5768 /* Must start a transaction first */ 5769 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 5770 return rc; 5771 } 5772 assert( !pBt->readOnly ); 5773 5774 #ifdef SQLITE_OMIT_AUTOVACUUM 5775 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 5776 if( rc ){ 5777 return rc; 5778 } 5779 #else 5780 if( pBt->autoVacuum ){ 5781 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 5782 MemPage *pPageMove; /* The page to move to. */ 5783 5784 /* Creating a new table may probably require moving an existing database 5785 ** to make room for the new tables root page. In case this page turns 5786 ** out to be an overflow page, delete all overflow page-map caches 5787 ** held by open cursors. 5788 */ 5789 invalidateAllOverflowCache(pBt); 5790 5791 /* Read the value of meta[3] from the database to determine where the 5792 ** root page of the new table should go. meta[3] is the largest root-page 5793 ** created so far, so the new root-page is (meta[3]+1). 5794 */ 5795 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot); 5796 if( rc!=SQLITE_OK ){ 5797 return rc; 5798 } 5799 pgnoRoot++; 5800 5801 /* The new root-page may not be allocated on a pointer-map page, or the 5802 ** PENDING_BYTE page. 5803 */ 5804 if( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 5805 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 5806 pgnoRoot++; 5807 } 5808 assert( pgnoRoot>=3 ); 5809 5810 /* Allocate a page. The page that currently resides at pgnoRoot will 5811 ** be moved to the allocated page (unless the allocated page happens 5812 ** to reside at pgnoRoot). 5813 */ 5814 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1); 5815 if( rc!=SQLITE_OK ){ 5816 return rc; 5817 } 5818 5819 if( pgnoMove!=pgnoRoot ){ 5820 /* pgnoRoot is the page that will be used for the root-page of 5821 ** the new table (assuming an error did not occur). But we were 5822 ** allocated pgnoMove. If required (i.e. if it was not allocated 5823 ** by extending the file), the current page at position pgnoMove 5824 ** is already journaled. 5825 */ 5826 u8 eType; 5827 Pgno iPtrPage; 5828 5829 releasePage(pPageMove); 5830 5831 /* Move the page currently at pgnoRoot to pgnoMove. */ 5832 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); 5833 if( rc!=SQLITE_OK ){ 5834 return rc; 5835 } 5836 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 5837 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 5838 releasePage(pRoot); 5839 return rc; 5840 } 5841 assert( eType!=PTRMAP_ROOTPAGE ); 5842 assert( eType!=PTRMAP_FREEPAGE ); 5843 rc = sqlite3PagerWrite(pRoot->pDbPage); 5844 if( rc!=SQLITE_OK ){ 5845 releasePage(pRoot); 5846 return rc; 5847 } 5848 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove); 5849 releasePage(pRoot); 5850 5851 /* Obtain the page at pgnoRoot */ 5852 if( rc!=SQLITE_OK ){ 5853 return rc; 5854 } 5855 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); 5856 if( rc!=SQLITE_OK ){ 5857 return rc; 5858 } 5859 rc = sqlite3PagerWrite(pRoot->pDbPage); 5860 if( rc!=SQLITE_OK ){ 5861 releasePage(pRoot); 5862 return rc; 5863 } 5864 }else{ 5865 pRoot = pPageMove; 5866 } 5867 5868 /* Update the pointer-map and meta-data with the new root-page number. */ 5869 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0); 5870 if( rc ){ 5871 releasePage(pRoot); 5872 return rc; 5873 } 5874 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 5875 if( rc ){ 5876 releasePage(pRoot); 5877 return rc; 5878 } 5879 5880 }else{ 5881 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 5882 if( rc ) return rc; 5883 } 5884 #endif 5885 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 5886 zeroPage(pRoot, flags | PTF_LEAF); 5887 sqlite3PagerUnref(pRoot->pDbPage); 5888 *piTable = (int)pgnoRoot; 5889 return SQLITE_OK; 5890 } 5891 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ 5892 int rc; 5893 sqlite3BtreeEnter(p); 5894 p->pBt->db = p->db; 5895 rc = btreeCreateTable(p, piTable, flags); 5896 sqlite3BtreeLeave(p); 5897 return rc; 5898 } 5899 5900 /* 5901 ** Erase the given database page and all its children. Return 5902 ** the page to the freelist. 5903 */ 5904 static int clearDatabasePage( 5905 BtShared *pBt, /* The BTree that contains the table */ 5906 Pgno pgno, /* Page number to clear */ 5907 MemPage *pParent, /* Parent page. NULL for the root */ 5908 int freePageFlag /* Deallocate page if true */ 5909 ){ 5910 MemPage *pPage = 0; 5911 int rc; 5912 unsigned char *pCell; 5913 int i; 5914 5915 assert( sqlite3_mutex_held(pBt->mutex) ); 5916 if( pgno>sqlite3PagerPagecount(pBt->pPager) ){ 5917 return SQLITE_CORRUPT_BKPT; 5918 } 5919 5920 rc = getAndInitPage(pBt, pgno, &pPage, pParent); 5921 if( rc ) goto cleardatabasepage_out; 5922 for(i=0; i<pPage->nCell; i++){ 5923 pCell = findCell(pPage, i); 5924 if( !pPage->leaf ){ 5925 rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1); 5926 if( rc ) goto cleardatabasepage_out; 5927 } 5928 rc = clearCell(pPage, pCell); 5929 if( rc ) goto cleardatabasepage_out; 5930 } 5931 if( !pPage->leaf ){ 5932 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1); 5933 if( rc ) goto cleardatabasepage_out; 5934 } 5935 if( freePageFlag ){ 5936 rc = freePage(pPage); 5937 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 5938 zeroPage(pPage, pPage->aData[0] | PTF_LEAF); 5939 } 5940 5941 cleardatabasepage_out: 5942 releasePage(pPage); 5943 return rc; 5944 } 5945 5946 /* 5947 ** Delete all information from a single table in the database. iTable is 5948 ** the page number of the root of the table. After this routine returns, 5949 ** the root page is empty, but still exists. 5950 ** 5951 ** This routine will fail with SQLITE_LOCKED if there are any open 5952 ** read cursors on the table. Open write cursors are moved to the 5953 ** root of the table. 5954 */ 5955 int sqlite3BtreeClearTable(Btree *p, int iTable){ 5956 int rc; 5957 BtShared *pBt = p->pBt; 5958 sqlite3BtreeEnter(p); 5959 pBt->db = p->db; 5960 if( p->inTrans!=TRANS_WRITE ){ 5961 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 5962 }else if( (rc = checkReadLocks(p, iTable, 0))!=SQLITE_OK ){ 5963 /* nothing to do */ 5964 }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){ 5965 /* nothing to do */ 5966 }else{ 5967 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0); 5968 } 5969 sqlite3BtreeLeave(p); 5970 return rc; 5971 } 5972 5973 /* 5974 ** Erase all information in a table and add the root of the table to 5975 ** the freelist. Except, the root of the principle table (the one on 5976 ** page 1) is never added to the freelist. 5977 ** 5978 ** This routine will fail with SQLITE_LOCKED if there are any open 5979 ** cursors on the table. 5980 ** 5981 ** If AUTOVACUUM is enabled and the page at iTable is not the last 5982 ** root page in the database file, then the last root page 5983 ** in the database file is moved into the slot formerly occupied by 5984 ** iTable and that last slot formerly occupied by the last root page 5985 ** is added to the freelist instead of iTable. In this say, all 5986 ** root pages are kept at the beginning of the database file, which 5987 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 5988 ** page number that used to be the last root page in the file before 5989 ** the move. If no page gets moved, *piMoved is set to 0. 5990 ** The last root page is recorded in meta[3] and the value of 5991 ** meta[3] is updated by this procedure. 5992 */ 5993 static int btreeDropTable(Btree *p, int iTable, int *piMoved){ 5994 int rc; 5995 MemPage *pPage = 0; 5996 BtShared *pBt = p->pBt; 5997 5998 assert( sqlite3BtreeHoldsMutex(p) ); 5999 if( p->inTrans!=TRANS_WRITE ){ 6000 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 6001 } 6002 6003 /* It is illegal to drop a table if any cursors are open on the 6004 ** database. This is because in auto-vacuum mode the backend may 6005 ** need to move another root-page to fill a gap left by the deleted 6006 ** root page. If an open cursor was using this page a problem would 6007 ** occur. 6008 */ 6009 if( pBt->pCursor ){ 6010 return SQLITE_LOCKED; 6011 } 6012 6013 rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 6014 if( rc ) return rc; 6015 rc = sqlite3BtreeClearTable(p, iTable); 6016 if( rc ){ 6017 releasePage(pPage); 6018 return rc; 6019 } 6020 6021 *piMoved = 0; 6022 6023 if( iTable>1 ){ 6024 #ifdef SQLITE_OMIT_AUTOVACUUM 6025 rc = freePage(pPage); 6026 releasePage(pPage); 6027 #else 6028 if( pBt->autoVacuum ){ 6029 Pgno maxRootPgno; 6030 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno); 6031 if( rc!=SQLITE_OK ){ 6032 releasePage(pPage); 6033 return rc; 6034 } 6035 6036 if( iTable==maxRootPgno ){ 6037 /* If the table being dropped is the table with the largest root-page 6038 ** number in the database, put the root page on the free list. 6039 */ 6040 rc = freePage(pPage); 6041 releasePage(pPage); 6042 if( rc!=SQLITE_OK ){ 6043 return rc; 6044 } 6045 }else{ 6046 /* The table being dropped does not have the largest root-page 6047 ** number in the database. So move the page that does into the 6048 ** gap left by the deleted root-page. 6049 */ 6050 MemPage *pMove; 6051 releasePage(pPage); 6052 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); 6053 if( rc!=SQLITE_OK ){ 6054 return rc; 6055 } 6056 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable); 6057 releasePage(pMove); 6058 if( rc!=SQLITE_OK ){ 6059 return rc; 6060 } 6061 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); 6062 if( rc!=SQLITE_OK ){ 6063 return rc; 6064 } 6065 rc = freePage(pMove); 6066 releasePage(pMove); 6067 if( rc!=SQLITE_OK ){ 6068 return rc; 6069 } 6070 *piMoved = maxRootPgno; 6071 } 6072 6073 /* Set the new 'max-root-page' value in the database header. This 6074 ** is the old value less one, less one more if that happens to 6075 ** be a root-page number, less one again if that is the 6076 ** PENDING_BYTE_PAGE. 6077 */ 6078 maxRootPgno--; 6079 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){ 6080 maxRootPgno--; 6081 } 6082 if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){ 6083 maxRootPgno--; 6084 } 6085 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 6086 6087 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 6088 }else{ 6089 rc = freePage(pPage); 6090 releasePage(pPage); 6091 } 6092 #endif 6093 }else{ 6094 /* If sqlite3BtreeDropTable was called on page 1. */ 6095 zeroPage(pPage, PTF_INTKEY|PTF_LEAF ); 6096 releasePage(pPage); 6097 } 6098 return rc; 6099 } 6100 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 6101 int rc; 6102 sqlite3BtreeEnter(p); 6103 p->pBt->db = p->db; 6104 rc = btreeDropTable(p, iTable, piMoved); 6105 sqlite3BtreeLeave(p); 6106 return rc; 6107 } 6108 6109 6110 /* 6111 ** Read the meta-information out of a database file. Meta[0] 6112 ** is the number of free pages currently in the database. Meta[1] 6113 ** through meta[15] are available for use by higher layers. Meta[0] 6114 ** is read-only, the others are read/write. 6115 ** 6116 ** The schema layer numbers meta values differently. At the schema 6117 ** layer (and the SetCookie and ReadCookie opcodes) the number of 6118 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 6119 */ 6120 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 6121 DbPage *pDbPage; 6122 int rc; 6123 unsigned char *pP1; 6124 BtShared *pBt = p->pBt; 6125 6126 sqlite3BtreeEnter(p); 6127 pBt->db = p->db; 6128 6129 /* Reading a meta-data value requires a read-lock on page 1 (and hence 6130 ** the sqlite_master table. We grab this lock regardless of whether or 6131 ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page 6132 ** 1 is treated as a special case by queryTableLock() and lockTable()). 6133 */ 6134 rc = queryTableLock(p, 1, READ_LOCK); 6135 if( rc!=SQLITE_OK ){ 6136 sqlite3BtreeLeave(p); 6137 return rc; 6138 } 6139 6140 assert( idx>=0 && idx<=15 ); 6141 rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage); 6142 if( rc ){ 6143 sqlite3BtreeLeave(p); 6144 return rc; 6145 } 6146 pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage); 6147 *pMeta = get4byte(&pP1[36 + idx*4]); 6148 sqlite3PagerUnref(pDbPage); 6149 6150 /* If autovacuumed is disabled in this build but we are trying to 6151 ** access an autovacuumed database, then make the database readonly. 6152 */ 6153 #ifdef SQLITE_OMIT_AUTOVACUUM 6154 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1; 6155 #endif 6156 6157 /* Grab the read-lock on page 1. */ 6158 rc = lockTable(p, 1, READ_LOCK); 6159 sqlite3BtreeLeave(p); 6160 return rc; 6161 } 6162 6163 /* 6164 ** Write meta-information back into the database. Meta[0] is 6165 ** read-only and may not be written. 6166 */ 6167 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 6168 BtShared *pBt = p->pBt; 6169 unsigned char *pP1; 6170 int rc; 6171 assert( idx>=1 && idx<=15 ); 6172 sqlite3BtreeEnter(p); 6173 pBt->db = p->db; 6174 if( p->inTrans!=TRANS_WRITE ){ 6175 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 6176 }else{ 6177 assert( pBt->pPage1!=0 ); 6178 pP1 = pBt->pPage1->aData; 6179 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 6180 if( rc==SQLITE_OK ){ 6181 put4byte(&pP1[36 + idx*4], iMeta); 6182 #ifndef SQLITE_OMIT_AUTOVACUUM 6183 if( idx==7 ){ 6184 assert( pBt->autoVacuum || iMeta==0 ); 6185 assert( iMeta==0 || iMeta==1 ); 6186 pBt->incrVacuum = iMeta; 6187 } 6188 #endif 6189 } 6190 } 6191 sqlite3BtreeLeave(p); 6192 return rc; 6193 } 6194 6195 /* 6196 ** Return the flag byte at the beginning of the page that the cursor 6197 ** is currently pointing to. 6198 */ 6199 int sqlite3BtreeFlags(BtCursor *pCur){ 6200 /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call 6201 ** restoreOrClearCursorPosition() here. 6202 */ 6203 MemPage *pPage = pCur->pPage; 6204 assert( cursorHoldsMutex(pCur) ); 6205 assert( pPage->pBt==pCur->pBt ); 6206 return pPage ? pPage->aData[pPage->hdrOffset] : 0; 6207 } 6208 6209 6210 /* 6211 ** Return the pager associated with a BTree. This routine is used for 6212 ** testing and debugging only. 6213 */ 6214 Pager *sqlite3BtreePager(Btree *p){ 6215 return p->pBt->pPager; 6216 } 6217 6218 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6219 /* 6220 ** Append a message to the error message string. 6221 */ 6222 static void checkAppendMsg( 6223 IntegrityCk *pCheck, 6224 char *zMsg1, 6225 const char *zFormat, 6226 ... 6227 ){ 6228 va_list ap; 6229 char *zMsg2; 6230 if( !pCheck->mxErr ) return; 6231 pCheck->mxErr--; 6232 pCheck->nErr++; 6233 va_start(ap, zFormat); 6234 zMsg2 = sqlite3VMPrintf(0, zFormat, ap); 6235 va_end(ap); 6236 if( zMsg1==0 ) zMsg1 = ""; 6237 if( pCheck->zErrMsg ){ 6238 char *zOld = pCheck->zErrMsg; 6239 pCheck->zErrMsg = 0; 6240 sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0); 6241 sqlite3_free(zOld); 6242 }else{ 6243 sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0); 6244 } 6245 sqlite3_free(zMsg2); 6246 } 6247 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6248 6249 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6250 /* 6251 ** Add 1 to the reference count for page iPage. If this is the second 6252 ** reference to the page, add an error message to pCheck->zErrMsg. 6253 ** Return 1 if there are 2 ore more references to the page and 0 if 6254 ** if this is the first reference to the page. 6255 ** 6256 ** Also check that the page number is in bounds. 6257 */ 6258 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){ 6259 if( iPage==0 ) return 1; 6260 if( iPage>pCheck->nPage || iPage<0 ){ 6261 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage); 6262 return 1; 6263 } 6264 if( pCheck->anRef[iPage]==1 ){ 6265 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage); 6266 return 1; 6267 } 6268 return (pCheck->anRef[iPage]++)>1; 6269 } 6270 6271 #ifndef SQLITE_OMIT_AUTOVACUUM 6272 /* 6273 ** Check that the entry in the pointer-map for page iChild maps to 6274 ** page iParent, pointer type ptrType. If not, append an error message 6275 ** to pCheck. 6276 */ 6277 static void checkPtrmap( 6278 IntegrityCk *pCheck, /* Integrity check context */ 6279 Pgno iChild, /* Child page number */ 6280 u8 eType, /* Expected pointer map type */ 6281 Pgno iParent, /* Expected pointer map parent page number */ 6282 char *zContext /* Context description (used for error msg) */ 6283 ){ 6284 int rc; 6285 u8 ePtrmapType; 6286 Pgno iPtrmapParent; 6287 6288 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 6289 if( rc!=SQLITE_OK ){ 6290 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild); 6291 return; 6292 } 6293 6294 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 6295 checkAppendMsg(pCheck, zContext, 6296 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 6297 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 6298 } 6299 } 6300 #endif 6301 6302 /* 6303 ** Check the integrity of the freelist or of an overflow page list. 6304 ** Verify that the number of pages on the list is N. 6305 */ 6306 static void checkList( 6307 IntegrityCk *pCheck, /* Integrity checking context */ 6308 int isFreeList, /* True for a freelist. False for overflow page list */ 6309 int iPage, /* Page number for first page in the list */ 6310 int N, /* Expected number of pages in the list */ 6311 char *zContext /* Context for error messages */ 6312 ){ 6313 int i; 6314 int expected = N; 6315 int iFirst = iPage; 6316 while( N-- > 0 && pCheck->mxErr ){ 6317 DbPage *pOvflPage; 6318 unsigned char *pOvflData; 6319 if( iPage<1 ){ 6320 checkAppendMsg(pCheck, zContext, 6321 "%d of %d pages missing from overflow list starting at %d", 6322 N+1, expected, iFirst); 6323 break; 6324 } 6325 if( checkRef(pCheck, iPage, zContext) ) break; 6326 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){ 6327 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage); 6328 break; 6329 } 6330 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 6331 if( isFreeList ){ 6332 int n = get4byte(&pOvflData[4]); 6333 #ifndef SQLITE_OMIT_AUTOVACUUM 6334 if( pCheck->pBt->autoVacuum ){ 6335 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext); 6336 } 6337 #endif 6338 if( n>pCheck->pBt->usableSize/4-8 ){ 6339 checkAppendMsg(pCheck, zContext, 6340 "freelist leaf count too big on page %d", iPage); 6341 N--; 6342 }else{ 6343 for(i=0; i<n; i++){ 6344 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 6345 #ifndef SQLITE_OMIT_AUTOVACUUM 6346 if( pCheck->pBt->autoVacuum ){ 6347 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext); 6348 } 6349 #endif 6350 checkRef(pCheck, iFreePage, zContext); 6351 } 6352 N -= n; 6353 } 6354 } 6355 #ifndef SQLITE_OMIT_AUTOVACUUM 6356 else{ 6357 /* If this database supports auto-vacuum and iPage is not the last 6358 ** page in this overflow list, check that the pointer-map entry for 6359 ** the following page matches iPage. 6360 */ 6361 if( pCheck->pBt->autoVacuum && N>0 ){ 6362 i = get4byte(pOvflData); 6363 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext); 6364 } 6365 } 6366 #endif 6367 iPage = get4byte(pOvflData); 6368 sqlite3PagerUnref(pOvflPage); 6369 } 6370 } 6371 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6372 6373 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6374 /* 6375 ** Do various sanity checks on a single page of a tree. Return 6376 ** the tree depth. Root pages return 0. Parents of root pages 6377 ** return 1, and so forth. 6378 ** 6379 ** These checks are done: 6380 ** 6381 ** 1. Make sure that cells and freeblocks do not overlap 6382 ** but combine to completely cover the page. 6383 ** NO 2. Make sure cell keys are in order. 6384 ** NO 3. Make sure no key is less than or equal to zLowerBound. 6385 ** NO 4. Make sure no key is greater than or equal to zUpperBound. 6386 ** 5. Check the integrity of overflow pages. 6387 ** 6. Recursively call checkTreePage on all children. 6388 ** 7. Verify that the depth of all children is the same. 6389 ** 8. Make sure this page is at least 33% full or else it is 6390 ** the root of the tree. 6391 */ 6392 static int checkTreePage( 6393 IntegrityCk *pCheck, /* Context for the sanity check */ 6394 int iPage, /* Page number of the page to check */ 6395 MemPage *pParent, /* Parent page */ 6396 char *zParentContext /* Parent context */ 6397 ){ 6398 MemPage *pPage; 6399 int i, rc, depth, d2, pgno, cnt; 6400 int hdr, cellStart; 6401 int nCell; 6402 u8 *data; 6403 BtShared *pBt; 6404 int usableSize; 6405 char zContext[100]; 6406 char *hit; 6407 6408 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage); 6409 6410 /* Check that the page exists 6411 */ 6412 pBt = pCheck->pBt; 6413 usableSize = pBt->usableSize; 6414 if( iPage==0 ) return 0; 6415 if( checkRef(pCheck, iPage, zParentContext) ) return 0; 6416 if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ 6417 checkAppendMsg(pCheck, zContext, 6418 "unable to get the page. error code=%d", rc); 6419 return 0; 6420 } 6421 if( (rc = sqlite3BtreeInitPage(pPage, pParent))!=0 ){ 6422 checkAppendMsg(pCheck, zContext, 6423 "sqlite3BtreeInitPage() returns error code %d", rc); 6424 releasePage(pPage); 6425 return 0; 6426 } 6427 6428 /* Check out all the cells. 6429 */ 6430 depth = 0; 6431 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){ 6432 u8 *pCell; 6433 int sz; 6434 CellInfo info; 6435 6436 /* Check payload overflow pages 6437 */ 6438 sqlite3_snprintf(sizeof(zContext), zContext, 6439 "On tree page %d cell %d: ", iPage, i); 6440 pCell = findCell(pPage,i); 6441 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 6442 sz = info.nData; 6443 if( !pPage->intKey ) sz += info.nKey; 6444 assert( sz==info.nPayload ); 6445 if( sz>info.nLocal ){ 6446 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); 6447 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); 6448 #ifndef SQLITE_OMIT_AUTOVACUUM 6449 if( pBt->autoVacuum ){ 6450 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext); 6451 } 6452 #endif 6453 checkList(pCheck, 0, pgnoOvfl, nPage, zContext); 6454 } 6455 6456 /* Check sanity of left child page. 6457 */ 6458 if( !pPage->leaf ){ 6459 pgno = get4byte(pCell); 6460 #ifndef SQLITE_OMIT_AUTOVACUUM 6461 if( pBt->autoVacuum ){ 6462 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); 6463 } 6464 #endif 6465 d2 = checkTreePage(pCheck,pgno,pPage,zContext); 6466 if( i>0 && d2!=depth ){ 6467 checkAppendMsg(pCheck, zContext, "Child page depth differs"); 6468 } 6469 depth = d2; 6470 } 6471 } 6472 if( !pPage->leaf ){ 6473 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 6474 sqlite3_snprintf(sizeof(zContext), zContext, 6475 "On page %d at right child: ", iPage); 6476 #ifndef SQLITE_OMIT_AUTOVACUUM 6477 if( pBt->autoVacuum ){ 6478 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0); 6479 } 6480 #endif 6481 checkTreePage(pCheck, pgno, pPage, zContext); 6482 } 6483 6484 /* Check for complete coverage of the page 6485 */ 6486 data = pPage->aData; 6487 hdr = pPage->hdrOffset; 6488 hit = sqlite3MallocZero( usableSize ); 6489 if( hit ){ 6490 memset(hit, 1, get2byte(&data[hdr+5])); 6491 nCell = get2byte(&data[hdr+3]); 6492 cellStart = hdr + 12 - 4*pPage->leaf; 6493 for(i=0; i<nCell; i++){ 6494 int pc = get2byte(&data[cellStart+i*2]); 6495 int size = cellSizePtr(pPage, &data[pc]); 6496 int j; 6497 if( (pc+size-1)>=usableSize || pc<0 ){ 6498 checkAppendMsg(pCheck, 0, 6499 "Corruption detected in cell %d on page %d",i,iPage,0); 6500 }else{ 6501 for(j=pc+size-1; j>=pc; j--) hit[j]++; 6502 } 6503 } 6504 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000; 6505 cnt++){ 6506 int size = get2byte(&data[i+2]); 6507 int j; 6508 if( (i+size-1)>=usableSize || i<0 ){ 6509 checkAppendMsg(pCheck, 0, 6510 "Corruption detected in cell %d on page %d",i,iPage,0); 6511 }else{ 6512 for(j=i+size-1; j>=i; j--) hit[j]++; 6513 } 6514 i = get2byte(&data[i]); 6515 } 6516 for(i=cnt=0; i<usableSize; i++){ 6517 if( hit[i]==0 ){ 6518 cnt++; 6519 }else if( hit[i]>1 ){ 6520 checkAppendMsg(pCheck, 0, 6521 "Multiple uses for byte %d of page %d", i, iPage); 6522 break; 6523 } 6524 } 6525 if( cnt!=data[hdr+7] ){ 6526 checkAppendMsg(pCheck, 0, 6527 "Fragmented space is %d byte reported as %d on page %d", 6528 cnt, data[hdr+7], iPage); 6529 } 6530 } 6531 sqlite3_free(hit); 6532 6533 releasePage(pPage); 6534 return depth+1; 6535 } 6536 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6537 6538 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6539 /* 6540 ** This routine does a complete check of the given BTree file. aRoot[] is 6541 ** an array of pages numbers were each page number is the root page of 6542 ** a table. nRoot is the number of entries in aRoot. 6543 ** 6544 ** If everything checks out, this routine returns NULL. If something is 6545 ** amiss, an error message is written into memory obtained from malloc() 6546 ** and a pointer to that error message is returned. The calling function 6547 ** is responsible for freeing the error message when it is done. 6548 */ 6549 char *sqlite3BtreeIntegrityCheck( 6550 Btree *p, /* The btree to be checked */ 6551 int *aRoot, /* An array of root pages numbers for individual trees */ 6552 int nRoot, /* Number of entries in aRoot[] */ 6553 int mxErr, /* Stop reporting errors after this many */ 6554 int *pnErr /* Write number of errors seen to this variable */ 6555 ){ 6556 int i; 6557 int nRef; 6558 IntegrityCk sCheck; 6559 BtShared *pBt = p->pBt; 6560 6561 sqlite3BtreeEnter(p); 6562 pBt->db = p->db; 6563 nRef = sqlite3PagerRefcount(pBt->pPager); 6564 if( lockBtreeWithRetry(p)!=SQLITE_OK ){ 6565 sqlite3BtreeLeave(p); 6566 return sqlite3StrDup("Unable to acquire a read lock on the database"); 6567 } 6568 sCheck.pBt = pBt; 6569 sCheck.pPager = pBt->pPager; 6570 sCheck.nPage = sqlite3PagerPagecount(sCheck.pPager); 6571 sCheck.mxErr = mxErr; 6572 sCheck.nErr = 0; 6573 *pnErr = 0; 6574 #ifndef SQLITE_OMIT_AUTOVACUUM 6575 if( pBt->nTrunc!=0 ){ 6576 sCheck.nPage = pBt->nTrunc; 6577 } 6578 #endif 6579 if( sCheck.nPage==0 ){ 6580 unlockBtreeIfUnused(pBt); 6581 sqlite3BtreeLeave(p); 6582 return 0; 6583 } 6584 sCheck.anRef = sqlite3_malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); 6585 if( !sCheck.anRef ){ 6586 unlockBtreeIfUnused(pBt); 6587 *pnErr = 1; 6588 sqlite3BtreeLeave(p); 6589 return sqlite3MPrintf(p->db, "Unable to malloc %d bytes", 6590 (sCheck.nPage+1)*sizeof(sCheck.anRef[0])); 6591 } 6592 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } 6593 i = PENDING_BYTE_PAGE(pBt); 6594 if( i<=sCheck.nPage ){ 6595 sCheck.anRef[i] = 1; 6596 } 6597 sCheck.zErrMsg = 0; 6598 6599 /* Check the integrity of the freelist 6600 */ 6601 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 6602 get4byte(&pBt->pPage1->aData[36]), "Main freelist: "); 6603 6604 /* Check all the tables. 6605 */ 6606 for(i=0; i<nRoot && sCheck.mxErr; i++){ 6607 if( aRoot[i]==0 ) continue; 6608 #ifndef SQLITE_OMIT_AUTOVACUUM 6609 if( pBt->autoVacuum && aRoot[i]>1 ){ 6610 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0); 6611 } 6612 #endif 6613 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: "); 6614 } 6615 6616 /* Make sure every page in the file is referenced 6617 */ 6618 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ 6619 #ifdef SQLITE_OMIT_AUTOVACUUM 6620 if( sCheck.anRef[i]==0 ){ 6621 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 6622 } 6623 #else 6624 /* If the database supports auto-vacuum, make sure no tables contain 6625 ** references to pointer-map pages. 6626 */ 6627 if( sCheck.anRef[i]==0 && 6628 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 6629 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 6630 } 6631 if( sCheck.anRef[i]!=0 && 6632 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 6633 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i); 6634 } 6635 #endif 6636 } 6637 6638 /* Make sure this analysis did not leave any unref() pages 6639 */ 6640 unlockBtreeIfUnused(pBt); 6641 if( nRef != sqlite3PagerRefcount(pBt->pPager) ){ 6642 checkAppendMsg(&sCheck, 0, 6643 "Outstanding page count goes from %d to %d during this analysis", 6644 nRef, sqlite3PagerRefcount(pBt->pPager) 6645 ); 6646 } 6647 6648 /* Clean up and report errors. 6649 */ 6650 sqlite3BtreeLeave(p); 6651 sqlite3_free(sCheck.anRef); 6652 *pnErr = sCheck.nErr; 6653 return sCheck.zErrMsg; 6654 } 6655 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6656 6657 /* 6658 ** Return the full pathname of the underlying database file. 6659 ** 6660 ** The pager filename is invariant as long as the pager is 6661 ** open so it is safe to access without the BtShared mutex. 6662 */ 6663 const char *sqlite3BtreeGetFilename(Btree *p){ 6664 assert( p->pBt->pPager!=0 ); 6665 return sqlite3PagerFilename(p->pBt->pPager); 6666 } 6667 6668 /* 6669 ** Return the pathname of the directory that contains the database file. 6670 ** 6671 ** The pager directory name is invariant as long as the pager is 6672 ** open so it is safe to access without the BtShared mutex. 6673 */ 6674 const char *sqlite3BtreeGetDirname(Btree *p){ 6675 assert( p->pBt->pPager!=0 ); 6676 return sqlite3PagerDirname(p->pBt->pPager); 6677 } 6678 6679 /* 6680 ** Return the pathname of the journal file for this database. The return 6681 ** value of this routine is the same regardless of whether the journal file 6682 ** has been created or not. 6683 ** 6684 ** The pager journal filename is invariant as long as the pager is 6685 ** open so it is safe to access without the BtShared mutex. 6686 */ 6687 const char *sqlite3BtreeGetJournalname(Btree *p){ 6688 assert( p->pBt->pPager!=0 ); 6689 return sqlite3PagerJournalname(p->pBt->pPager); 6690 } 6691 6692 #ifndef SQLITE_OMIT_VACUUM 6693 /* 6694 ** Copy the complete content of pBtFrom into pBtTo. A transaction 6695 ** must be active for both files. 6696 ** 6697 ** The size of file pBtFrom may be reduced by this operation. 6698 ** If anything goes wrong, the transaction on pBtFrom is rolled back. 6699 */ 6700 static int btreeCopyFile(Btree *pTo, Btree *pFrom){ 6701 int rc = SQLITE_OK; 6702 Pgno i, nPage, nToPage, iSkip; 6703 6704 BtShared *pBtTo = pTo->pBt; 6705 BtShared *pBtFrom = pFrom->pBt; 6706 pBtTo->db = pTo->db; 6707 pBtFrom->db = pFrom->db; 6708 6709 6710 if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){ 6711 return SQLITE_ERROR; 6712 } 6713 if( pBtTo->pCursor ) return SQLITE_BUSY; 6714 nToPage = sqlite3PagerPagecount(pBtTo->pPager); 6715 nPage = sqlite3PagerPagecount(pBtFrom->pPager); 6716 iSkip = PENDING_BYTE_PAGE(pBtTo); 6717 for(i=1; rc==SQLITE_OK && i<=nPage; i++){ 6718 DbPage *pDbPage; 6719 if( i==iSkip ) continue; 6720 rc = sqlite3PagerGet(pBtFrom->pPager, i, &pDbPage); 6721 if( rc ) break; 6722 rc = sqlite3PagerOverwrite(pBtTo->pPager, i, sqlite3PagerGetData(pDbPage)); 6723 sqlite3PagerUnref(pDbPage); 6724 } 6725 6726 /* If the file is shrinking, journal the pages that are being truncated 6727 ** so that they can be rolled back if the commit fails. 6728 */ 6729 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){ 6730 DbPage *pDbPage; 6731 if( i==iSkip ) continue; 6732 rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage); 6733 if( rc ) break; 6734 rc = sqlite3PagerWrite(pDbPage); 6735 sqlite3PagerDontWrite(pDbPage); 6736 /* Yeah. It seems wierd to call DontWrite() right after Write(). But 6737 ** that is because the names of those procedures do not exactly 6738 ** represent what they do. Write() really means "put this page in the 6739 ** rollback journal and mark it as dirty so that it will be written 6740 ** to the database file later." DontWrite() undoes the second part of 6741 ** that and prevents the page from being written to the database. The 6742 ** page is still on the rollback journal, though. And that is the whole 6743 ** point of this loop: to put pages on the rollback journal. */ 6744 sqlite3PagerUnref(pDbPage); 6745 } 6746 if( !rc && nPage<nToPage ){ 6747 rc = sqlite3PagerTruncate(pBtTo->pPager, nPage); 6748 } 6749 6750 if( rc ){ 6751 sqlite3BtreeRollback(pTo); 6752 } 6753 return rc; 6754 } 6755 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){ 6756 int rc; 6757 sqlite3BtreeEnter(pTo); 6758 sqlite3BtreeEnter(pFrom); 6759 rc = btreeCopyFile(pTo, pFrom); 6760 sqlite3BtreeLeave(pFrom); 6761 sqlite3BtreeLeave(pTo); 6762 return rc; 6763 } 6764 6765 #endif /* SQLITE_OMIT_VACUUM */ 6766 6767 /* 6768 ** Return non-zero if a transaction is active. 6769 */ 6770 int sqlite3BtreeIsInTrans(Btree *p){ 6771 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 6772 return (p && (p->inTrans==TRANS_WRITE)); 6773 } 6774 6775 /* 6776 ** Return non-zero if a statement transaction is active. 6777 */ 6778 int sqlite3BtreeIsInStmt(Btree *p){ 6779 assert( sqlite3BtreeHoldsMutex(p) ); 6780 return (p->pBt && p->pBt->inStmt); 6781 } 6782 6783 /* 6784 ** Return non-zero if a read (or write) transaction is active. 6785 */ 6786 int sqlite3BtreeIsInReadTrans(Btree *p){ 6787 assert( sqlite3_mutex_held(p->db->mutex) ); 6788 return (p && (p->inTrans!=TRANS_NONE)); 6789 } 6790 6791 /* 6792 ** This function returns a pointer to a blob of memory associated with 6793 ** a single shared-btree. The memory is used by client code for its own 6794 ** purposes (for example, to store a high-level schema associated with 6795 ** the shared-btree). The btree layer manages reference counting issues. 6796 ** 6797 ** The first time this is called on a shared-btree, nBytes bytes of memory 6798 ** are allocated, zeroed, and returned to the caller. For each subsequent 6799 ** call the nBytes parameter is ignored and a pointer to the same blob 6800 ** of memory returned. 6801 ** 6802 ** Just before the shared-btree is closed, the function passed as the 6803 ** xFree argument when the memory allocation was made is invoked on the 6804 ** blob of allocated memory. This function should not call sqlite3_free() 6805 ** on the memory, the btree layer does that. 6806 */ 6807 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 6808 BtShared *pBt = p->pBt; 6809 sqlite3BtreeEnter(p); 6810 if( !pBt->pSchema ){ 6811 pBt->pSchema = sqlite3MallocZero(nBytes); 6812 pBt->xFreeSchema = xFree; 6813 } 6814 sqlite3BtreeLeave(p); 6815 return pBt->pSchema; 6816 } 6817 6818 /* 6819 ** Return true if another user of the same shared btree as the argument 6820 ** handle holds an exclusive lock on the sqlite_master table. 6821 */ 6822 int sqlite3BtreeSchemaLocked(Btree *p){ 6823 int rc; 6824 assert( sqlite3_mutex_held(p->db->mutex) ); 6825 sqlite3BtreeEnter(p); 6826 rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK); 6827 sqlite3BtreeLeave(p); 6828 return rc; 6829 } 6830 6831 6832 #ifndef SQLITE_OMIT_SHARED_CACHE 6833 /* 6834 ** Obtain a lock on the table whose root page is iTab. The 6835 ** lock is a write lock if isWritelock is true or a read lock 6836 ** if it is false. 6837 */ 6838 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 6839 int rc = SQLITE_OK; 6840 u8 lockType = (isWriteLock?WRITE_LOCK:READ_LOCK); 6841 sqlite3BtreeEnter(p); 6842 rc = queryTableLock(p, iTab, lockType); 6843 if( rc==SQLITE_OK ){ 6844 rc = lockTable(p, iTab, lockType); 6845 } 6846 sqlite3BtreeLeave(p); 6847 return rc; 6848 } 6849 #endif 6850 6851 #ifndef SQLITE_OMIT_INCRBLOB 6852 /* 6853 ** Argument pCsr must be a cursor opened for writing on an 6854 ** INTKEY table currently pointing at a valid table entry. 6855 ** This function modifies the data stored as part of that entry. 6856 ** Only the data content may only be modified, it is not possible 6857 ** to change the length of the data stored. 6858 */ 6859 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 6860 assert( cursorHoldsMutex(pCsr) ); 6861 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 6862 assert(pCsr->isIncrblobHandle); 6863 if( pCsr->eState>=CURSOR_REQUIRESEEK ){ 6864 if( pCsr->eState==CURSOR_FAULT ){ 6865 return pCsr->skip; 6866 }else{ 6867 return SQLITE_ABORT; 6868 } 6869 } 6870 6871 /* Check some preconditions: 6872 ** (a) the cursor is open for writing, 6873 ** (b) there is no read-lock on the table being modified and 6874 ** (c) the cursor points at a valid row of an intKey table. 6875 */ 6876 if( !pCsr->wrFlag ){ 6877 return SQLITE_READONLY; 6878 } 6879 assert( !pCsr->pBt->readOnly 6880 && pCsr->pBt->inTransaction==TRANS_WRITE ); 6881 if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr) ){ 6882 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 6883 } 6884 if( pCsr->eState==CURSOR_INVALID || !pCsr->pPage->intKey ){ 6885 return SQLITE_ERROR; 6886 } 6887 6888 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1); 6889 } 6890 6891 /* 6892 ** Set a flag on this cursor to cache the locations of pages from the 6893 ** overflow list for the current row. This is used by cursors opened 6894 ** for incremental blob IO only. 6895 ** 6896 ** This function sets a flag only. The actual page location cache 6897 ** (stored in BtCursor.aOverflow[]) is allocated and used by function 6898 ** accessPayload() (the worker function for sqlite3BtreeData() and 6899 ** sqlite3BtreePutData()). 6900 */ 6901 void sqlite3BtreeCacheOverflow(BtCursor *pCur){ 6902 assert( cursorHoldsMutex(pCur) ); 6903 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 6904 assert(!pCur->isIncrblobHandle); 6905 assert(!pCur->aOverflow); 6906 pCur->isIncrblobHandle = 1; 6907 } 6908 #endif 6909