1 /* 2 ** 2004 April 6 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** $Id: btree.c,v 1.430 2007/11/05 15:30:13 danielk1977 Exp $ 13 ** 14 ** This file implements a external (disk-based) database using BTrees. 15 ** See the header comment on "btreeInt.h" for additional information. 16 ** Including a description of file format and an overview of operation. 17 */ 18 #include "btreeInt.h" 19 20 /* 21 ** The header string that appears at the beginning of every 22 ** SQLite database. 23 */ 24 static const char zMagicHeader[] = SQLITE_FILE_HEADER; 25 26 /* 27 ** Set this global variable to 1 to enable tracing using the TRACE 28 ** macro. 29 */ 30 #if SQLITE_TEST 31 int sqlite3_btree_trace=0; /* True to enable tracing */ 32 #endif 33 34 35 36 #ifndef SQLITE_OMIT_SHARED_CACHE 37 /* 38 ** A flag to indicate whether or not shared cache is enabled. Also, 39 ** a list of BtShared objects that are eligible for participation 40 ** in shared cache. The variables have file scope during normal builds, 41 ** but the test harness needs to access these variables so we make them 42 ** global for test builds. 43 */ 44 #ifdef SQLITE_TEST 45 BtShared *sqlite3SharedCacheList = 0; 46 int sqlite3SharedCacheEnabled = 0; 47 #else 48 static BtShared *sqlite3SharedCacheList = 0; 49 static int sqlite3SharedCacheEnabled = 0; 50 #endif 51 #endif /* SQLITE_OMIT_SHARED_CACHE */ 52 53 #ifndef SQLITE_OMIT_SHARED_CACHE 54 /* 55 ** Enable or disable the shared pager and schema features. 56 ** 57 ** This routine has no effect on existing database connections. 58 ** The shared cache setting effects only future calls to 59 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). 60 */ 61 int sqlite3_enable_shared_cache(int enable){ 62 sqlite3SharedCacheEnabled = enable; 63 return SQLITE_OK; 64 } 65 #endif 66 67 68 /* 69 ** Forward declaration 70 */ 71 static int checkReadLocks(Btree*,Pgno,BtCursor*); 72 73 74 #ifdef SQLITE_OMIT_SHARED_CACHE 75 /* 76 ** The functions queryTableLock(), lockTable() and unlockAllTables() 77 ** manipulate entries in the BtShared.pLock linked list used to store 78 ** shared-cache table level locks. If the library is compiled with the 79 ** shared-cache feature disabled, then there is only ever one user 80 ** of each BtShared structure and so this locking is not necessary. 81 ** So define the lock related functions as no-ops. 82 */ 83 #define queryTableLock(a,b,c) SQLITE_OK 84 #define lockTable(a,b,c) SQLITE_OK 85 #define unlockAllTables(a) 86 #endif 87 88 #ifndef SQLITE_OMIT_SHARED_CACHE 89 /* 90 ** Query to see if btree handle p may obtain a lock of type eLock 91 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return 92 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or 93 ** SQLITE_LOCKED if not. 94 */ 95 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){ 96 BtShared *pBt = p->pBt; 97 BtLock *pIter; 98 99 assert( sqlite3BtreeHoldsMutex(p) ); 100 101 /* This is a no-op if the shared-cache is not enabled */ 102 if( !p->sharable ){ 103 return SQLITE_OK; 104 } 105 106 /* This (along with lockTable()) is where the ReadUncommitted flag is 107 ** dealt with. If the caller is querying for a read-lock and the flag is 108 ** set, it is unconditionally granted - even if there are write-locks 109 ** on the table. If a write-lock is requested, the ReadUncommitted flag 110 ** is not considered. 111 ** 112 ** In function lockTable(), if a read-lock is demanded and the 113 ** ReadUncommitted flag is set, no entry is added to the locks list 114 ** (BtShared.pLock). 115 ** 116 ** To summarize: If the ReadUncommitted flag is set, then read cursors do 117 ** not create or respect table locks. The locking procedure for a 118 ** write-cursor does not change. 119 */ 120 if( 121 !p->pSqlite || 122 0==(p->pSqlite->flags&SQLITE_ReadUncommitted) || 123 eLock==WRITE_LOCK || 124 iTab==MASTER_ROOT 125 ){ 126 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 127 if( pIter->pBtree!=p && pIter->iTable==iTab && 128 (pIter->eLock!=eLock || eLock!=READ_LOCK) ){ 129 return SQLITE_LOCKED; 130 } 131 } 132 } 133 return SQLITE_OK; 134 } 135 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 136 137 #ifndef SQLITE_OMIT_SHARED_CACHE 138 /* 139 ** Add a lock on the table with root-page iTable to the shared-btree used 140 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 141 ** WRITE_LOCK. 142 ** 143 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and 144 ** SQLITE_NOMEM may also be returned. 145 */ 146 static int lockTable(Btree *p, Pgno iTable, u8 eLock){ 147 BtShared *pBt = p->pBt; 148 BtLock *pLock = 0; 149 BtLock *pIter; 150 151 assert( sqlite3BtreeHoldsMutex(p) ); 152 153 /* This is a no-op if the shared-cache is not enabled */ 154 if( !p->sharable ){ 155 return SQLITE_OK; 156 } 157 158 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) ); 159 160 /* If the read-uncommitted flag is set and a read-lock is requested, 161 ** return early without adding an entry to the BtShared.pLock list. See 162 ** comment in function queryTableLock() for more info on handling 163 ** the ReadUncommitted flag. 164 */ 165 if( 166 (p->pSqlite) && 167 (p->pSqlite->flags&SQLITE_ReadUncommitted) && 168 (eLock==READ_LOCK) && 169 iTable!=MASTER_ROOT 170 ){ 171 return SQLITE_OK; 172 } 173 174 /* First search the list for an existing lock on this table. */ 175 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 176 if( pIter->iTable==iTable && pIter->pBtree==p ){ 177 pLock = pIter; 178 break; 179 } 180 } 181 182 /* If the above search did not find a BtLock struct associating Btree p 183 ** with table iTable, allocate one and link it into the list. 184 */ 185 if( !pLock ){ 186 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 187 if( !pLock ){ 188 return SQLITE_NOMEM; 189 } 190 pLock->iTable = iTable; 191 pLock->pBtree = p; 192 pLock->pNext = pBt->pLock; 193 pBt->pLock = pLock; 194 } 195 196 /* Set the BtLock.eLock variable to the maximum of the current lock 197 ** and the requested lock. This means if a write-lock was already held 198 ** and a read-lock requested, we don't incorrectly downgrade the lock. 199 */ 200 assert( WRITE_LOCK>READ_LOCK ); 201 if( eLock>pLock->eLock ){ 202 pLock->eLock = eLock; 203 } 204 205 return SQLITE_OK; 206 } 207 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 208 209 #ifndef SQLITE_OMIT_SHARED_CACHE 210 /* 211 ** Release all the table locks (locks obtained via calls to the lockTable() 212 ** procedure) held by Btree handle p. 213 */ 214 static void unlockAllTables(Btree *p){ 215 BtLock **ppIter = &p->pBt->pLock; 216 217 assert( sqlite3BtreeHoldsMutex(p) ); 218 assert( p->sharable || 0==*ppIter ); 219 220 while( *ppIter ){ 221 BtLock *pLock = *ppIter; 222 if( pLock->pBtree==p ){ 223 *ppIter = pLock->pNext; 224 sqlite3_free(pLock); 225 }else{ 226 ppIter = &pLock->pNext; 227 } 228 } 229 } 230 #endif /* SQLITE_OMIT_SHARED_CACHE */ 231 232 static void releasePage(MemPage *pPage); /* Forward reference */ 233 234 /* 235 ** Verify that the cursor holds a mutex on the BtShared 236 */ 237 #ifndef NDEBUG 238 static int cursorHoldsMutex(BtCursor *p){ 239 return sqlite3_mutex_held(p->pBt->mutex); 240 } 241 #endif 242 243 244 #ifndef SQLITE_OMIT_INCRBLOB 245 /* 246 ** Invalidate the overflow page-list cache for cursor pCur, if any. 247 */ 248 static void invalidateOverflowCache(BtCursor *pCur){ 249 assert( cursorHoldsMutex(pCur) ); 250 sqlite3_free(pCur->aOverflow); 251 pCur->aOverflow = 0; 252 } 253 254 /* 255 ** Invalidate the overflow page-list cache for all cursors opened 256 ** on the shared btree structure pBt. 257 */ 258 static void invalidateAllOverflowCache(BtShared *pBt){ 259 BtCursor *p; 260 assert( sqlite3_mutex_held(pBt->mutex) ); 261 for(p=pBt->pCursor; p; p=p->pNext){ 262 invalidateOverflowCache(p); 263 } 264 } 265 #else 266 #define invalidateOverflowCache(x) 267 #define invalidateAllOverflowCache(x) 268 #endif 269 270 /* 271 ** Save the current cursor position in the variables BtCursor.nKey 272 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 273 */ 274 static int saveCursorPosition(BtCursor *pCur){ 275 int rc; 276 277 assert( CURSOR_VALID==pCur->eState ); 278 assert( 0==pCur->pKey ); 279 assert( cursorHoldsMutex(pCur) ); 280 281 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey); 282 283 /* If this is an intKey table, then the above call to BtreeKeySize() 284 ** stores the integer key in pCur->nKey. In this case this value is 285 ** all that is required. Otherwise, if pCur is not open on an intKey 286 ** table, then malloc space for and store the pCur->nKey bytes of key 287 ** data. 288 */ 289 if( rc==SQLITE_OK && 0==pCur->pPage->intKey){ 290 void *pKey = sqlite3_malloc(pCur->nKey); 291 if( pKey ){ 292 rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey); 293 if( rc==SQLITE_OK ){ 294 pCur->pKey = pKey; 295 }else{ 296 sqlite3_free(pKey); 297 } 298 }else{ 299 rc = SQLITE_NOMEM; 300 } 301 } 302 assert( !pCur->pPage->intKey || !pCur->pKey ); 303 304 if( rc==SQLITE_OK ){ 305 releasePage(pCur->pPage); 306 pCur->pPage = 0; 307 pCur->eState = CURSOR_REQUIRESEEK; 308 } 309 310 invalidateOverflowCache(pCur); 311 return rc; 312 } 313 314 /* 315 ** Save the positions of all cursors except pExcept open on the table 316 ** with root-page iRoot. Usually, this is called just before cursor 317 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()). 318 */ 319 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 320 BtCursor *p; 321 assert( sqlite3_mutex_held(pBt->mutex) ); 322 assert( pExcept==0 || pExcept->pBt==pBt ); 323 for(p=pBt->pCursor; p; p=p->pNext){ 324 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 325 p->eState==CURSOR_VALID ){ 326 int rc = saveCursorPosition(p); 327 if( SQLITE_OK!=rc ){ 328 return rc; 329 } 330 } 331 } 332 return SQLITE_OK; 333 } 334 335 /* 336 ** Clear the current cursor position. 337 */ 338 static void clearCursorPosition(BtCursor *pCur){ 339 assert( cursorHoldsMutex(pCur) ); 340 sqlite3_free(pCur->pKey); 341 pCur->pKey = 0; 342 pCur->eState = CURSOR_INVALID; 343 } 344 345 /* 346 ** Restore the cursor to the position it was in (or as close to as possible) 347 ** when saveCursorPosition() was called. Note that this call deletes the 348 ** saved position info stored by saveCursorPosition(), so there can be 349 ** at most one effective restoreOrClearCursorPosition() call after each 350 ** saveCursorPosition(). 351 ** 352 ** If the second argument argument - doSeek - is false, then instead of 353 ** returning the cursor to it's saved position, any saved position is deleted 354 ** and the cursor state set to CURSOR_INVALID. 355 */ 356 int sqlite3BtreeRestoreOrClearCursorPosition(BtCursor *pCur){ 357 int rc; 358 assert( cursorHoldsMutex(pCur) ); 359 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 360 if( pCur->eState==CURSOR_FAULT ){ 361 return pCur->skip; 362 } 363 #ifndef SQLITE_OMIT_INCRBLOB 364 if( pCur->isIncrblobHandle ){ 365 return SQLITE_ABORT; 366 } 367 #endif 368 pCur->eState = CURSOR_INVALID; 369 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip); 370 if( rc==SQLITE_OK ){ 371 sqlite3_free(pCur->pKey); 372 pCur->pKey = 0; 373 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 374 } 375 return rc; 376 } 377 378 #define restoreOrClearCursorPosition(p) \ 379 (p->eState>=CURSOR_REQUIRESEEK ? \ 380 sqlite3BtreeRestoreOrClearCursorPosition(p) : \ 381 SQLITE_OK) 382 383 #ifndef SQLITE_OMIT_AUTOVACUUM 384 /* 385 ** Given a page number of a regular database page, return the page 386 ** number for the pointer-map page that contains the entry for the 387 ** input page number. 388 */ 389 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 390 int nPagesPerMapPage, iPtrMap, ret; 391 assert( sqlite3_mutex_held(pBt->mutex) ); 392 nPagesPerMapPage = (pBt->usableSize/5)+1; 393 iPtrMap = (pgno-2)/nPagesPerMapPage; 394 ret = (iPtrMap*nPagesPerMapPage) + 2; 395 if( ret==PENDING_BYTE_PAGE(pBt) ){ 396 ret++; 397 } 398 return ret; 399 } 400 401 /* 402 ** Write an entry into the pointer map. 403 ** 404 ** This routine updates the pointer map entry for page number 'key' 405 ** so that it maps to type 'eType' and parent page number 'pgno'. 406 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 407 */ 408 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){ 409 DbPage *pDbPage; /* The pointer map page */ 410 u8 *pPtrmap; /* The pointer map data */ 411 Pgno iPtrmap; /* The pointer map page number */ 412 int offset; /* Offset in pointer map page */ 413 int rc; 414 415 assert( sqlite3_mutex_held(pBt->mutex) ); 416 /* The master-journal page number must never be used as a pointer map page */ 417 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 418 419 assert( pBt->autoVacuum ); 420 if( key==0 ){ 421 return SQLITE_CORRUPT_BKPT; 422 } 423 iPtrmap = PTRMAP_PAGENO(pBt, key); 424 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 425 if( rc!=SQLITE_OK ){ 426 return rc; 427 } 428 offset = PTRMAP_PTROFFSET(pBt, key); 429 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 430 431 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 432 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); 433 rc = sqlite3PagerWrite(pDbPage); 434 if( rc==SQLITE_OK ){ 435 pPtrmap[offset] = eType; 436 put4byte(&pPtrmap[offset+1], parent); 437 } 438 } 439 440 sqlite3PagerUnref(pDbPage); 441 return rc; 442 } 443 444 /* 445 ** Read an entry from the pointer map. 446 ** 447 ** This routine retrieves the pointer map entry for page 'key', writing 448 ** the type and parent page number to *pEType and *pPgno respectively. 449 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 450 */ 451 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 452 DbPage *pDbPage; /* The pointer map page */ 453 int iPtrmap; /* Pointer map page index */ 454 u8 *pPtrmap; /* Pointer map page data */ 455 int offset; /* Offset of entry in pointer map */ 456 int rc; 457 458 assert( sqlite3_mutex_held(pBt->mutex) ); 459 460 iPtrmap = PTRMAP_PAGENO(pBt, key); 461 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 462 if( rc!=0 ){ 463 return rc; 464 } 465 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 466 467 offset = PTRMAP_PTROFFSET(pBt, key); 468 assert( pEType!=0 ); 469 *pEType = pPtrmap[offset]; 470 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 471 472 sqlite3PagerUnref(pDbPage); 473 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; 474 return SQLITE_OK; 475 } 476 477 #endif /* SQLITE_OMIT_AUTOVACUUM */ 478 479 /* 480 ** Given a btree page and a cell index (0 means the first cell on 481 ** the page, 1 means the second cell, and so forth) return a pointer 482 ** to the cell content. 483 ** 484 ** This routine works only for pages that do not contain overflow cells. 485 */ 486 #define findCell(pPage, iCell) \ 487 ((pPage)->aData + get2byte(&(pPage)->aData[(pPage)->cellOffset+2*(iCell)])) 488 #ifdef SQLITE_TEST 489 u8 *sqlite3BtreeFindCell(MemPage *pPage, int iCell){ 490 assert( iCell>=0 ); 491 assert( iCell<get2byte(&pPage->aData[pPage->hdrOffset+3]) ); 492 return findCell(pPage, iCell); 493 } 494 #endif 495 496 /* 497 ** This a more complex version of sqlite3BtreeFindCell() that works for 498 ** pages that do contain overflow cells. See insert 499 */ 500 static u8 *findOverflowCell(MemPage *pPage, int iCell){ 501 int i; 502 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 503 for(i=pPage->nOverflow-1; i>=0; i--){ 504 int k; 505 struct _OvflCell *pOvfl; 506 pOvfl = &pPage->aOvfl[i]; 507 k = pOvfl->idx; 508 if( k<=iCell ){ 509 if( k==iCell ){ 510 return pOvfl->pCell; 511 } 512 iCell--; 513 } 514 } 515 return findCell(pPage, iCell); 516 } 517 518 /* 519 ** Parse a cell content block and fill in the CellInfo structure. There 520 ** are two versions of this function. sqlite3BtreeParseCell() takes a 521 ** cell index as the second argument and sqlite3BtreeParseCellPtr() 522 ** takes a pointer to the body of the cell as its second argument. 523 ** 524 ** Within this file, the parseCell() macro can be called instead of 525 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster. 526 */ 527 void sqlite3BtreeParseCellPtr( 528 MemPage *pPage, /* Page containing the cell */ 529 u8 *pCell, /* Pointer to the cell text. */ 530 CellInfo *pInfo /* Fill in this structure */ 531 ){ 532 int n; /* Number bytes in cell content header */ 533 u32 nPayload; /* Number of bytes of cell payload */ 534 535 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 536 537 pInfo->pCell = pCell; 538 assert( pPage->leaf==0 || pPage->leaf==1 ); 539 n = pPage->childPtrSize; 540 assert( n==4-4*pPage->leaf ); 541 if( pPage->hasData ){ 542 n += getVarint32(&pCell[n], &nPayload); 543 }else{ 544 nPayload = 0; 545 } 546 pInfo->nData = nPayload; 547 if( pPage->intKey ){ 548 n += getVarint(&pCell[n], (u64 *)&pInfo->nKey); 549 }else{ 550 u32 x; 551 n += getVarint32(&pCell[n], &x); 552 pInfo->nKey = x; 553 nPayload += x; 554 } 555 pInfo->nPayload = nPayload; 556 pInfo->nHeader = n; 557 if( nPayload<=pPage->maxLocal ){ 558 /* This is the (easy) common case where the entire payload fits 559 ** on the local page. No overflow is required. 560 */ 561 int nSize; /* Total size of cell content in bytes */ 562 pInfo->nLocal = nPayload; 563 pInfo->iOverflow = 0; 564 nSize = nPayload + n; 565 if( nSize<4 ){ 566 nSize = 4; /* Minimum cell size is 4 */ 567 } 568 pInfo->nSize = nSize; 569 }else{ 570 /* If the payload will not fit completely on the local page, we have 571 ** to decide how much to store locally and how much to spill onto 572 ** overflow pages. The strategy is to minimize the amount of unused 573 ** space on overflow pages while keeping the amount of local storage 574 ** in between minLocal and maxLocal. 575 ** 576 ** Warning: changing the way overflow payload is distributed in any 577 ** way will result in an incompatible file format. 578 */ 579 int minLocal; /* Minimum amount of payload held locally */ 580 int maxLocal; /* Maximum amount of payload held locally */ 581 int surplus; /* Overflow payload available for local storage */ 582 583 minLocal = pPage->minLocal; 584 maxLocal = pPage->maxLocal; 585 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4); 586 if( surplus <= maxLocal ){ 587 pInfo->nLocal = surplus; 588 }else{ 589 pInfo->nLocal = minLocal; 590 } 591 pInfo->iOverflow = pInfo->nLocal + n; 592 pInfo->nSize = pInfo->iOverflow + 4; 593 } 594 } 595 #define parseCell(pPage, iCell, pInfo) \ 596 sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) 597 void sqlite3BtreeParseCell( 598 MemPage *pPage, /* Page containing the cell */ 599 int iCell, /* The cell index. First cell is 0 */ 600 CellInfo *pInfo /* Fill in this structure */ 601 ){ 602 parseCell(pPage, iCell, pInfo); 603 } 604 605 /* 606 ** Compute the total number of bytes that a Cell needs in the cell 607 ** data area of the btree-page. The return number includes the cell 608 ** data header and the local payload, but not any overflow page or 609 ** the space used by the cell pointer. 610 */ 611 #ifndef NDEBUG 612 static int cellSize(MemPage *pPage, int iCell){ 613 CellInfo info; 614 sqlite3BtreeParseCell(pPage, iCell, &info); 615 return info.nSize; 616 } 617 #endif 618 static int cellSizePtr(MemPage *pPage, u8 *pCell){ 619 CellInfo info; 620 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 621 return info.nSize; 622 } 623 624 #ifndef SQLITE_OMIT_AUTOVACUUM 625 /* 626 ** If the cell pCell, part of page pPage contains a pointer 627 ** to an overflow page, insert an entry into the pointer-map 628 ** for the overflow page. 629 */ 630 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){ 631 if( pCell ){ 632 CellInfo info; 633 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 634 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); 635 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ 636 Pgno ovfl = get4byte(&pCell[info.iOverflow]); 637 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno); 638 } 639 } 640 return SQLITE_OK; 641 } 642 /* 643 ** If the cell with index iCell on page pPage contains a pointer 644 ** to an overflow page, insert an entry into the pointer-map 645 ** for the overflow page. 646 */ 647 static int ptrmapPutOvfl(MemPage *pPage, int iCell){ 648 u8 *pCell; 649 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 650 pCell = findOverflowCell(pPage, iCell); 651 return ptrmapPutOvflPtr(pPage, pCell); 652 } 653 #endif 654 655 656 /* 657 ** Defragment the page given. All Cells are moved to the 658 ** end of the page and all free space is collected into one 659 ** big FreeBlk that occurs in between the header and cell 660 ** pointer array and the cell content area. 661 */ 662 static int defragmentPage(MemPage *pPage){ 663 int i; /* Loop counter */ 664 int pc; /* Address of a i-th cell */ 665 int addr; /* Offset of first byte after cell pointer array */ 666 int hdr; /* Offset to the page header */ 667 int size; /* Size of a cell */ 668 int usableSize; /* Number of usable bytes on a page */ 669 int cellOffset; /* Offset to the cell pointer array */ 670 int brk; /* Offset to the cell content area */ 671 int nCell; /* Number of cells on the page */ 672 unsigned char *data; /* The page data */ 673 unsigned char *temp; /* Temp area for cell content */ 674 675 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 676 assert( pPage->pBt!=0 ); 677 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); 678 assert( pPage->nOverflow==0 ); 679 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 680 temp = sqlite3_malloc( pPage->pBt->pageSize ); 681 if( temp==0 ) return SQLITE_NOMEM; 682 data = pPage->aData; 683 hdr = pPage->hdrOffset; 684 cellOffset = pPage->cellOffset; 685 nCell = pPage->nCell; 686 assert( nCell==get2byte(&data[hdr+3]) ); 687 usableSize = pPage->pBt->usableSize; 688 brk = get2byte(&data[hdr+5]); 689 memcpy(&temp[brk], &data[brk], usableSize - brk); 690 brk = usableSize; 691 for(i=0; i<nCell; i++){ 692 u8 *pAddr; /* The i-th cell pointer */ 693 pAddr = &data[cellOffset + i*2]; 694 pc = get2byte(pAddr); 695 assert( pc<pPage->pBt->usableSize ); 696 size = cellSizePtr(pPage, &temp[pc]); 697 brk -= size; 698 memcpy(&data[brk], &temp[pc], size); 699 put2byte(pAddr, brk); 700 } 701 assert( brk>=cellOffset+2*nCell ); 702 put2byte(&data[hdr+5], brk); 703 data[hdr+1] = 0; 704 data[hdr+2] = 0; 705 data[hdr+7] = 0; 706 addr = cellOffset+2*nCell; 707 memset(&data[addr], 0, brk-addr); 708 sqlite3_free(temp); 709 return SQLITE_OK; 710 } 711 712 /* 713 ** Allocate nByte bytes of space on a page. 714 ** 715 ** Return the index into pPage->aData[] of the first byte of 716 ** the new allocation. Or return 0 if there is not enough free 717 ** space on the page to satisfy the allocation request. 718 ** 719 ** If the page contains nBytes of free space but does not contain 720 ** nBytes of contiguous free space, then this routine automatically 721 ** calls defragementPage() to consolidate all free space before 722 ** allocating the new chunk. 723 */ 724 static int allocateSpace(MemPage *pPage, int nByte){ 725 int addr, pc, hdr; 726 int size; 727 int nFrag; 728 int top; 729 int nCell; 730 int cellOffset; 731 unsigned char *data; 732 733 data = pPage->aData; 734 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 735 assert( pPage->pBt ); 736 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 737 if( nByte<4 ) nByte = 4; 738 if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0; 739 pPage->nFree -= nByte; 740 hdr = pPage->hdrOffset; 741 742 nFrag = data[hdr+7]; 743 if( nFrag<60 ){ 744 /* Search the freelist looking for a slot big enough to satisfy the 745 ** space request. */ 746 addr = hdr+1; 747 while( (pc = get2byte(&data[addr]))>0 ){ 748 size = get2byte(&data[pc+2]); 749 if( size>=nByte ){ 750 if( size<nByte+4 ){ 751 memcpy(&data[addr], &data[pc], 2); 752 data[hdr+7] = nFrag + size - nByte; 753 return pc; 754 }else{ 755 put2byte(&data[pc+2], size-nByte); 756 return pc + size - nByte; 757 } 758 } 759 addr = pc; 760 } 761 } 762 763 /* Allocate memory from the gap in between the cell pointer array 764 ** and the cell content area. 765 */ 766 top = get2byte(&data[hdr+5]); 767 nCell = get2byte(&data[hdr+3]); 768 cellOffset = pPage->cellOffset; 769 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){ 770 if( defragmentPage(pPage) ) return 0; 771 top = get2byte(&data[hdr+5]); 772 } 773 top -= nByte; 774 assert( cellOffset + 2*nCell <= top ); 775 put2byte(&data[hdr+5], top); 776 return top; 777 } 778 779 /* 780 ** Return a section of the pPage->aData to the freelist. 781 ** The first byte of the new free block is pPage->aDisk[start] 782 ** and the size of the block is "size" bytes. 783 ** 784 ** Most of the effort here is involved in coalesing adjacent 785 ** free blocks into a single big free block. 786 */ 787 static void freeSpace(MemPage *pPage, int start, int size){ 788 int addr, pbegin, hdr; 789 unsigned char *data = pPage->aData; 790 791 assert( pPage->pBt!=0 ); 792 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 793 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) ); 794 assert( (start + size)<=pPage->pBt->usableSize ); 795 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 796 if( size<4 ) size = 4; 797 798 #ifdef SQLITE_SECURE_DELETE 799 /* Overwrite deleted information with zeros when the SECURE_DELETE 800 ** option is enabled at compile-time */ 801 memset(&data[start], 0, size); 802 #endif 803 804 /* Add the space back into the linked list of freeblocks */ 805 hdr = pPage->hdrOffset; 806 addr = hdr + 1; 807 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){ 808 assert( pbegin<=pPage->pBt->usableSize-4 ); 809 assert( pbegin>addr ); 810 addr = pbegin; 811 } 812 assert( pbegin<=pPage->pBt->usableSize-4 ); 813 assert( pbegin>addr || pbegin==0 ); 814 put2byte(&data[addr], start); 815 put2byte(&data[start], pbegin); 816 put2byte(&data[start+2], size); 817 pPage->nFree += size; 818 819 /* Coalesce adjacent free blocks */ 820 addr = pPage->hdrOffset + 1; 821 while( (pbegin = get2byte(&data[addr]))>0 ){ 822 int pnext, psize; 823 assert( pbegin>addr ); 824 assert( pbegin<=pPage->pBt->usableSize-4 ); 825 pnext = get2byte(&data[pbegin]); 826 psize = get2byte(&data[pbegin+2]); 827 if( pbegin + psize + 3 >= pnext && pnext>0 ){ 828 int frag = pnext - (pbegin+psize); 829 assert( frag<=data[pPage->hdrOffset+7] ); 830 data[pPage->hdrOffset+7] -= frag; 831 put2byte(&data[pbegin], get2byte(&data[pnext])); 832 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin); 833 }else{ 834 addr = pbegin; 835 } 836 } 837 838 /* If the cell content area begins with a freeblock, remove it. */ 839 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){ 840 int top; 841 pbegin = get2byte(&data[hdr+1]); 842 memcpy(&data[hdr+1], &data[pbegin], 2); 843 top = get2byte(&data[hdr+5]); 844 put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2])); 845 } 846 } 847 848 /* 849 ** Decode the flags byte (the first byte of the header) for a page 850 ** and initialize fields of the MemPage structure accordingly. 851 */ 852 static void decodeFlags(MemPage *pPage, int flagByte){ 853 BtShared *pBt; /* A copy of pPage->pBt */ 854 855 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) ); 856 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 857 pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0; 858 pPage->zeroData = (flagByte & PTF_ZERODATA)!=0; 859 pPage->leaf = (flagByte & PTF_LEAF)!=0; 860 pPage->childPtrSize = 4*(pPage->leaf==0); 861 pBt = pPage->pBt; 862 if( flagByte & PTF_LEAFDATA ){ 863 pPage->leafData = 1; 864 pPage->maxLocal = pBt->maxLeaf; 865 pPage->minLocal = pBt->minLeaf; 866 }else{ 867 pPage->leafData = 0; 868 pPage->maxLocal = pBt->maxLocal; 869 pPage->minLocal = pBt->minLocal; 870 } 871 pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData)); 872 } 873 874 /* 875 ** Initialize the auxiliary information for a disk block. 876 ** 877 ** The pParent parameter must be a pointer to the MemPage which 878 ** is the parent of the page being initialized. The root of a 879 ** BTree has no parent and so for that page, pParent==NULL. 880 ** 881 ** Return SQLITE_OK on success. If we see that the page does 882 ** not contain a well-formed database page, then return 883 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 884 ** guarantee that the page is well-formed. It only shows that 885 ** we failed to detect any corruption. 886 */ 887 int sqlite3BtreeInitPage( 888 MemPage *pPage, /* The page to be initialized */ 889 MemPage *pParent /* The parent. Might be NULL */ 890 ){ 891 int pc; /* Address of a freeblock within pPage->aData[] */ 892 int hdr; /* Offset to beginning of page header */ 893 u8 *data; /* Equal to pPage->aData */ 894 BtShared *pBt; /* The main btree structure */ 895 int usableSize; /* Amount of usable space on each page */ 896 int cellOffset; /* Offset from start of page to first cell pointer */ 897 int nFree; /* Number of unused bytes on the page */ 898 int top; /* First byte of the cell content area */ 899 900 pBt = pPage->pBt; 901 assert( pBt!=0 ); 902 assert( pParent==0 || pParent->pBt==pBt ); 903 assert( sqlite3_mutex_held(pBt->mutex) ); 904 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 905 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 906 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 907 if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){ 908 /* The parent page should never change unless the file is corrupt */ 909 return SQLITE_CORRUPT_BKPT; 910 } 911 if( pPage->isInit ) return SQLITE_OK; 912 if( pPage->pParent==0 && pParent!=0 ){ 913 pPage->pParent = pParent; 914 sqlite3PagerRef(pParent->pDbPage); 915 } 916 hdr = pPage->hdrOffset; 917 data = pPage->aData; 918 decodeFlags(pPage, data[hdr]); 919 pPage->nOverflow = 0; 920 pPage->idxShift = 0; 921 usableSize = pBt->usableSize; 922 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; 923 top = get2byte(&data[hdr+5]); 924 pPage->nCell = get2byte(&data[hdr+3]); 925 if( pPage->nCell>MX_CELL(pBt) ){ 926 /* To many cells for a single page. The page must be corrupt */ 927 return SQLITE_CORRUPT_BKPT; 928 } 929 if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){ 930 /* All pages must have at least one cell, except for root pages */ 931 return SQLITE_CORRUPT_BKPT; 932 } 933 934 /* Compute the total free space on the page */ 935 pc = get2byte(&data[hdr+1]); 936 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell); 937 while( pc>0 ){ 938 int next, size; 939 if( pc>usableSize-4 ){ 940 /* Free block is off the page */ 941 return SQLITE_CORRUPT_BKPT; 942 } 943 next = get2byte(&data[pc]); 944 size = get2byte(&data[pc+2]); 945 if( next>0 && next<=pc+size+3 ){ 946 /* Free blocks must be in accending order */ 947 return SQLITE_CORRUPT_BKPT; 948 } 949 nFree += size; 950 pc = next; 951 } 952 pPage->nFree = nFree; 953 if( nFree>=usableSize ){ 954 /* Free space cannot exceed total page size */ 955 return SQLITE_CORRUPT_BKPT; 956 } 957 958 pPage->isInit = 1; 959 return SQLITE_OK; 960 } 961 962 /* 963 ** Set up a raw page so that it looks like a database page holding 964 ** no entries. 965 */ 966 static void zeroPage(MemPage *pPage, int flags){ 967 unsigned char *data = pPage->aData; 968 BtShared *pBt = pPage->pBt; 969 int hdr = pPage->hdrOffset; 970 int first; 971 972 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); 973 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 974 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 975 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 976 assert( sqlite3_mutex_held(pBt->mutex) ); 977 memset(&data[hdr], 0, pBt->usableSize - hdr); 978 data[hdr] = flags; 979 first = hdr + 8 + 4*((flags&PTF_LEAF)==0); 980 memset(&data[hdr+1], 0, 4); 981 data[hdr+7] = 0; 982 put2byte(&data[hdr+5], pBt->usableSize); 983 pPage->nFree = pBt->usableSize - first; 984 decodeFlags(pPage, flags); 985 pPage->hdrOffset = hdr; 986 pPage->cellOffset = first; 987 pPage->nOverflow = 0; 988 pPage->idxShift = 0; 989 pPage->nCell = 0; 990 pPage->isInit = 1; 991 } 992 993 /* 994 ** Get a page from the pager. Initialize the MemPage.pBt and 995 ** MemPage.aData elements if needed. 996 ** 997 ** If the noContent flag is set, it means that we do not care about 998 ** the content of the page at this time. So do not go to the disk 999 ** to fetch the content. Just fill in the content with zeros for now. 1000 ** If in the future we call sqlite3PagerWrite() on this page, that 1001 ** means we have started to be concerned about content and the disk 1002 ** read should occur at that point. 1003 */ 1004 int sqlite3BtreeGetPage( 1005 BtShared *pBt, /* The btree */ 1006 Pgno pgno, /* Number of the page to fetch */ 1007 MemPage **ppPage, /* Return the page in this parameter */ 1008 int noContent /* Do not load page content if true */ 1009 ){ 1010 int rc; 1011 MemPage *pPage; 1012 DbPage *pDbPage; 1013 1014 assert( sqlite3_mutex_held(pBt->mutex) ); 1015 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent); 1016 if( rc ) return rc; 1017 pPage = (MemPage *)sqlite3PagerGetExtra(pDbPage); 1018 pPage->aData = sqlite3PagerGetData(pDbPage); 1019 pPage->pDbPage = pDbPage; 1020 pPage->pBt = pBt; 1021 pPage->pgno = pgno; 1022 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; 1023 *ppPage = pPage; 1024 return SQLITE_OK; 1025 } 1026 1027 /* 1028 ** Get a page from the pager and initialize it. This routine 1029 ** is just a convenience wrapper around separate calls to 1030 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage(). 1031 */ 1032 static int getAndInitPage( 1033 BtShared *pBt, /* The database file */ 1034 Pgno pgno, /* Number of the page to get */ 1035 MemPage **ppPage, /* Write the page pointer here */ 1036 MemPage *pParent /* Parent of the page */ 1037 ){ 1038 int rc; 1039 assert( sqlite3_mutex_held(pBt->mutex) ); 1040 if( pgno==0 ){ 1041 return SQLITE_CORRUPT_BKPT; 1042 } 1043 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0); 1044 if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){ 1045 rc = sqlite3BtreeInitPage(*ppPage, pParent); 1046 } 1047 return rc; 1048 } 1049 1050 /* 1051 ** Release a MemPage. This should be called once for each prior 1052 ** call to sqlite3BtreeGetPage. 1053 */ 1054 static void releasePage(MemPage *pPage){ 1055 if( pPage ){ 1056 assert( pPage->aData ); 1057 assert( pPage->pBt ); 1058 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1059 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 1060 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1061 sqlite3PagerUnref(pPage->pDbPage); 1062 } 1063 } 1064 1065 /* 1066 ** This routine is called when the reference count for a page 1067 ** reaches zero. We need to unref the pParent pointer when that 1068 ** happens. 1069 */ 1070 static void pageDestructor(DbPage *pData, int pageSize){ 1071 MemPage *pPage; 1072 assert( (pageSize & 7)==0 ); 1073 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 1074 assert( pPage->isInit==0 || sqlite3_mutex_held(pPage->pBt->mutex) ); 1075 if( pPage->pParent ){ 1076 MemPage *pParent = pPage->pParent; 1077 assert( pParent->pBt==pPage->pBt ); 1078 pPage->pParent = 0; 1079 releasePage(pParent); 1080 } 1081 pPage->isInit = 0; 1082 } 1083 1084 /* 1085 ** During a rollback, when the pager reloads information into the cache 1086 ** so that the cache is restored to its original state at the start of 1087 ** the transaction, for each page restored this routine is called. 1088 ** 1089 ** This routine needs to reset the extra data section at the end of the 1090 ** page to agree with the restored data. 1091 */ 1092 static void pageReinit(DbPage *pData, int pageSize){ 1093 MemPage *pPage; 1094 assert( (pageSize & 7)==0 ); 1095 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 1096 if( pPage->isInit ){ 1097 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1098 pPage->isInit = 0; 1099 sqlite3BtreeInitPage(pPage, pPage->pParent); 1100 } 1101 } 1102 1103 /* 1104 ** Open a database file. 1105 ** 1106 ** zFilename is the name of the database file. If zFilename is NULL 1107 ** a new database with a random name is created. This randomly named 1108 ** database file will be deleted when sqlite3BtreeClose() is called. 1109 ** If zFilename is ":memory:" then an in-memory database is created 1110 ** that is automatically destroyed when it is closed. 1111 */ 1112 int sqlite3BtreeOpen( 1113 const char *zFilename, /* Name of the file containing the BTree database */ 1114 sqlite3 *pSqlite, /* Associated database handle */ 1115 Btree **ppBtree, /* Pointer to new Btree object written here */ 1116 int flags, /* Options */ 1117 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ 1118 ){ 1119 sqlite3_vfs *pVfs; /* The VFS to use for this btree */ 1120 BtShared *pBt = 0; /* Shared part of btree structure */ 1121 Btree *p; /* Handle to return */ 1122 int rc = SQLITE_OK; 1123 int nReserve; 1124 unsigned char zDbHeader[100]; 1125 1126 /* Set the variable isMemdb to true for an in-memory database, or 1127 ** false for a file-based database. This symbol is only required if 1128 ** either of the shared-data or autovacuum features are compiled 1129 ** into the library. 1130 */ 1131 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM) 1132 #ifdef SQLITE_OMIT_MEMORYDB 1133 const int isMemdb = 0; 1134 #else 1135 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:"); 1136 #endif 1137 #endif 1138 1139 assert( pSqlite!=0 ); 1140 assert( sqlite3_mutex_held(pSqlite->mutex) ); 1141 1142 pVfs = pSqlite->pVfs; 1143 p = sqlite3MallocZero(sizeof(Btree)); 1144 if( !p ){ 1145 return SQLITE_NOMEM; 1146 } 1147 p->inTrans = TRANS_NONE; 1148 p->pSqlite = pSqlite; 1149 1150 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1151 /* 1152 ** If this Btree is a candidate for shared cache, try to find an 1153 ** existing BtShared object that we can share with 1154 */ 1155 if( (flags & BTREE_PRIVATE)==0 1156 && isMemdb==0 1157 && (pSqlite->flags & SQLITE_Vtab)==0 1158 && zFilename && zFilename[0] 1159 ){ 1160 if( sqlite3SharedCacheEnabled ){ 1161 int nFullPathname = pVfs->mxPathname+1; 1162 char *zFullPathname = (char *)sqlite3_malloc(nFullPathname); 1163 sqlite3_mutex *mutexShared; 1164 p->sharable = 1; 1165 if( pSqlite ){ 1166 pSqlite->flags |= SQLITE_SharedCache; 1167 } 1168 if( !zFullPathname ){ 1169 sqlite3_free(p); 1170 return SQLITE_NOMEM; 1171 } 1172 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname); 1173 mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER); 1174 sqlite3_mutex_enter(mutexShared); 1175 for(pBt=sqlite3SharedCacheList; pBt; pBt=pBt->pNext){ 1176 assert( pBt->nRef>0 ); 1177 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager)) 1178 && sqlite3PagerVfs(pBt->pPager)==pVfs ){ 1179 p->pBt = pBt; 1180 pBt->nRef++; 1181 break; 1182 } 1183 } 1184 sqlite3_mutex_leave(mutexShared); 1185 sqlite3_free(zFullPathname); 1186 } 1187 #ifdef SQLITE_DEBUG 1188 else{ 1189 /* In debug mode, we mark all persistent databases as sharable 1190 ** even when they are not. This exercises the locking code and 1191 ** gives more opportunity for asserts(sqlite3_mutex_held()) 1192 ** statements to find locking problems. 1193 */ 1194 p->sharable = 1; 1195 } 1196 #endif 1197 } 1198 #endif 1199 if( pBt==0 ){ 1200 /* 1201 ** The following asserts make sure that structures used by the btree are 1202 ** the right size. This is to guard against size changes that result 1203 ** when compiling on a different architecture. 1204 */ 1205 assert( sizeof(i64)==8 || sizeof(i64)==4 ); 1206 assert( sizeof(u64)==8 || sizeof(u64)==4 ); 1207 assert( sizeof(u32)==4 ); 1208 assert( sizeof(u16)==2 ); 1209 assert( sizeof(Pgno)==4 ); 1210 1211 pBt = sqlite3MallocZero( sizeof(*pBt) ); 1212 if( pBt==0 ){ 1213 rc = SQLITE_NOMEM; 1214 goto btree_open_out; 1215 } 1216 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, 1217 EXTRA_SIZE, flags, vfsFlags); 1218 if( rc==SQLITE_OK ){ 1219 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); 1220 } 1221 if( rc!=SQLITE_OK ){ 1222 goto btree_open_out; 1223 } 1224 p->pBt = pBt; 1225 1226 sqlite3PagerSetDestructor(pBt->pPager, pageDestructor); 1227 sqlite3PagerSetReiniter(pBt->pPager, pageReinit); 1228 pBt->pCursor = 0; 1229 pBt->pPage1 = 0; 1230 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager); 1231 pBt->pageSize = get2byte(&zDbHeader[16]); 1232 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE 1233 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ 1234 pBt->pageSize = 0; 1235 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 1236 pBt->maxEmbedFrac = 64; /* 25% */ 1237 pBt->minEmbedFrac = 32; /* 12.5% */ 1238 pBt->minLeafFrac = 32; /* 12.5% */ 1239 #ifndef SQLITE_OMIT_AUTOVACUUM 1240 /* If the magic name ":memory:" will create an in-memory database, then 1241 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if 1242 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if 1243 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a 1244 ** regular file-name. In this case the auto-vacuum applies as per normal. 1245 */ 1246 if( zFilename && !isMemdb ){ 1247 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); 1248 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); 1249 } 1250 #endif 1251 nReserve = 0; 1252 }else{ 1253 nReserve = zDbHeader[20]; 1254 pBt->maxEmbedFrac = zDbHeader[21]; 1255 pBt->minEmbedFrac = zDbHeader[22]; 1256 pBt->minLeafFrac = zDbHeader[23]; 1257 pBt->pageSizeFixed = 1; 1258 #ifndef SQLITE_OMIT_AUTOVACUUM 1259 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); 1260 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); 1261 #endif 1262 } 1263 pBt->usableSize = pBt->pageSize - nReserve; 1264 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ 1265 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 1266 1267 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1268 /* Add the new BtShared object to the linked list sharable BtShareds. 1269 */ 1270 if( p->sharable ){ 1271 sqlite3_mutex *mutexShared; 1272 pBt->nRef = 1; 1273 mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER); 1274 if( SQLITE_THREADSAFE ){ 1275 pBt->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST); 1276 if( pBt->mutex==0 ){ 1277 rc = SQLITE_NOMEM; 1278 pSqlite->mallocFailed = 0; 1279 goto btree_open_out; 1280 } 1281 } 1282 sqlite3_mutex_enter(mutexShared); 1283 pBt->pNext = sqlite3SharedCacheList; 1284 sqlite3SharedCacheList = pBt; 1285 sqlite3_mutex_leave(mutexShared); 1286 } 1287 #endif 1288 } 1289 1290 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1291 /* If the new Btree uses a sharable pBtShared, then link the new 1292 ** Btree into the list of all sharable Btrees for the same connection. 1293 ** The list is kept in ascending order by pBt address. 1294 */ 1295 if( p->sharable ){ 1296 int i; 1297 Btree *pSib; 1298 for(i=0; i<pSqlite->nDb; i++){ 1299 if( (pSib = pSqlite->aDb[i].pBt)!=0 && pSib->sharable ){ 1300 while( pSib->pPrev ){ pSib = pSib->pPrev; } 1301 if( p->pBt<pSib->pBt ){ 1302 p->pNext = pSib; 1303 p->pPrev = 0; 1304 pSib->pPrev = p; 1305 }else{ 1306 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){ 1307 pSib = pSib->pNext; 1308 } 1309 p->pNext = pSib->pNext; 1310 p->pPrev = pSib; 1311 if( p->pNext ){ 1312 p->pNext->pPrev = p; 1313 } 1314 pSib->pNext = p; 1315 } 1316 break; 1317 } 1318 } 1319 } 1320 #endif 1321 *ppBtree = p; 1322 1323 btree_open_out: 1324 if( rc!=SQLITE_OK ){ 1325 if( pBt && pBt->pPager ){ 1326 sqlite3PagerClose(pBt->pPager); 1327 } 1328 sqlite3_free(pBt); 1329 sqlite3_free(p); 1330 *ppBtree = 0; 1331 } 1332 return rc; 1333 } 1334 1335 /* 1336 ** Decrement the BtShared.nRef counter. When it reaches zero, 1337 ** remove the BtShared structure from the sharing list. Return 1338 ** true if the BtShared.nRef counter reaches zero and return 1339 ** false if it is still positive. 1340 */ 1341 static int removeFromSharingList(BtShared *pBt){ 1342 #ifndef SQLITE_OMIT_SHARED_CACHE 1343 sqlite3_mutex *pMaster; 1344 BtShared *pList; 1345 int removed = 0; 1346 1347 assert( sqlite3_mutex_notheld(pBt->mutex) ); 1348 pMaster = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER); 1349 sqlite3_mutex_enter(pMaster); 1350 pBt->nRef--; 1351 if( pBt->nRef<=0 ){ 1352 if( sqlite3SharedCacheList==pBt ){ 1353 sqlite3SharedCacheList = pBt->pNext; 1354 }else{ 1355 pList = sqlite3SharedCacheList; 1356 while( pList && pList->pNext!=pBt ){ 1357 pList=pList->pNext; 1358 } 1359 if( pList ){ 1360 pList->pNext = pBt->pNext; 1361 } 1362 } 1363 if( SQLITE_THREADSAFE ){ 1364 sqlite3_mutex_free(pBt->mutex); 1365 } 1366 removed = 1; 1367 } 1368 sqlite3_mutex_leave(pMaster); 1369 return removed; 1370 #else 1371 return 1; 1372 #endif 1373 } 1374 1375 /* 1376 ** Close an open database and invalidate all cursors. 1377 */ 1378 int sqlite3BtreeClose(Btree *p){ 1379 BtShared *pBt = p->pBt; 1380 BtCursor *pCur; 1381 1382 /* Close all cursors opened via this handle. */ 1383 assert( sqlite3_mutex_held(p->pSqlite->mutex) ); 1384 sqlite3BtreeEnter(p); 1385 pCur = pBt->pCursor; 1386 while( pCur ){ 1387 BtCursor *pTmp = pCur; 1388 pCur = pCur->pNext; 1389 if( pTmp->pBtree==p ){ 1390 sqlite3BtreeCloseCursor(pTmp); 1391 } 1392 } 1393 1394 /* Rollback any active transaction and free the handle structure. 1395 ** The call to sqlite3BtreeRollback() drops any table-locks held by 1396 ** this handle. 1397 */ 1398 sqlite3BtreeRollback(p); 1399 sqlite3BtreeLeave(p); 1400 1401 /* If there are still other outstanding references to the shared-btree 1402 ** structure, return now. The remainder of this procedure cleans 1403 ** up the shared-btree. 1404 */ 1405 assert( p->wantToLock==0 && p->locked==0 ); 1406 if( !p->sharable || removeFromSharingList(pBt) ){ 1407 /* The pBt is no longer on the sharing list, so we can access 1408 ** it without having to hold the mutex. 1409 ** 1410 ** Clean out and delete the BtShared object. 1411 */ 1412 assert( !pBt->pCursor ); 1413 sqlite3PagerClose(pBt->pPager); 1414 if( pBt->xFreeSchema && pBt->pSchema ){ 1415 pBt->xFreeSchema(pBt->pSchema); 1416 } 1417 sqlite3_free(pBt->pSchema); 1418 sqlite3_free(pBt); 1419 } 1420 1421 #ifndef SQLITE_OMIT_SHARED_CACHE 1422 assert( p->wantToLock==0 ); 1423 assert( p->locked==0 ); 1424 if( p->pPrev ) p->pPrev->pNext = p->pNext; 1425 if( p->pNext ) p->pNext->pPrev = p->pPrev; 1426 #endif 1427 1428 sqlite3_free(p); 1429 return SQLITE_OK; 1430 } 1431 1432 /* 1433 ** Change the busy handler callback function. 1434 */ 1435 int sqlite3BtreeSetBusyHandler(Btree *p, BusyHandler *pHandler){ 1436 BtShared *pBt = p->pBt; 1437 assert( sqlite3_mutex_held(p->pSqlite->mutex) ); 1438 sqlite3BtreeEnter(p); 1439 pBt->pBusyHandler = pHandler; 1440 sqlite3PagerSetBusyhandler(pBt->pPager, pHandler); 1441 sqlite3BtreeLeave(p); 1442 return SQLITE_OK; 1443 } 1444 1445 /* 1446 ** Change the limit on the number of pages allowed in the cache. 1447 ** 1448 ** The maximum number of cache pages is set to the absolute 1449 ** value of mxPage. If mxPage is negative, the pager will 1450 ** operate asynchronously - it will not stop to do fsync()s 1451 ** to insure data is written to the disk surface before 1452 ** continuing. Transactions still work if synchronous is off, 1453 ** and the database cannot be corrupted if this program 1454 ** crashes. But if the operating system crashes or there is 1455 ** an abrupt power failure when synchronous is off, the database 1456 ** could be left in an inconsistent and unrecoverable state. 1457 ** Synchronous is on by default so database corruption is not 1458 ** normally a worry. 1459 */ 1460 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 1461 BtShared *pBt = p->pBt; 1462 assert( sqlite3_mutex_held(p->pSqlite->mutex) ); 1463 sqlite3BtreeEnter(p); 1464 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 1465 sqlite3BtreeLeave(p); 1466 return SQLITE_OK; 1467 } 1468 1469 /* 1470 ** Change the way data is synced to disk in order to increase or decrease 1471 ** how well the database resists damage due to OS crashes and power 1472 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 1473 ** there is a high probability of damage) Level 2 is the default. There 1474 ** is a very low but non-zero probability of damage. Level 3 reduces the 1475 ** probability of damage to near zero but with a write performance reduction. 1476 */ 1477 #ifndef SQLITE_OMIT_PAGER_PRAGMAS 1478 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){ 1479 BtShared *pBt = p->pBt; 1480 assert( sqlite3_mutex_held(p->pSqlite->mutex) ); 1481 sqlite3BtreeEnter(p); 1482 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync); 1483 sqlite3BtreeLeave(p); 1484 return SQLITE_OK; 1485 } 1486 #endif 1487 1488 /* 1489 ** Return TRUE if the given btree is set to safety level 1. In other 1490 ** words, return TRUE if no sync() occurs on the disk files. 1491 */ 1492 int sqlite3BtreeSyncDisabled(Btree *p){ 1493 BtShared *pBt = p->pBt; 1494 int rc; 1495 assert( sqlite3_mutex_held(p->pSqlite->mutex) ); 1496 sqlite3BtreeEnter(p); 1497 assert( pBt && pBt->pPager ); 1498 rc = sqlite3PagerNosync(pBt->pPager); 1499 sqlite3BtreeLeave(p); 1500 return rc; 1501 } 1502 1503 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) 1504 /* 1505 ** Change the default pages size and the number of reserved bytes per page. 1506 ** 1507 ** The page size must be a power of 2 between 512 and 65536. If the page 1508 ** size supplied does not meet this constraint then the page size is not 1509 ** changed. 1510 ** 1511 ** Page sizes are constrained to be a power of two so that the region 1512 ** of the database file used for locking (beginning at PENDING_BYTE, 1513 ** the first byte past the 1GB boundary, 0x40000000) needs to occur 1514 ** at the beginning of a page. 1515 ** 1516 ** If parameter nReserve is less than zero, then the number of reserved 1517 ** bytes per page is left unchanged. 1518 */ 1519 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){ 1520 int rc = SQLITE_OK; 1521 BtShared *pBt = p->pBt; 1522 sqlite3BtreeEnter(p); 1523 if( pBt->pageSizeFixed ){ 1524 sqlite3BtreeLeave(p); 1525 return SQLITE_READONLY; 1526 } 1527 if( nReserve<0 ){ 1528 nReserve = pBt->pageSize - pBt->usableSize; 1529 } 1530 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 1531 ((pageSize-1)&pageSize)==0 ){ 1532 assert( (pageSize & 7)==0 ); 1533 assert( !pBt->pPage1 && !pBt->pCursor ); 1534 pBt->pageSize = pageSize; 1535 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 1536 } 1537 pBt->usableSize = pBt->pageSize - nReserve; 1538 sqlite3BtreeLeave(p); 1539 return rc; 1540 } 1541 1542 /* 1543 ** Return the currently defined page size 1544 */ 1545 int sqlite3BtreeGetPageSize(Btree *p){ 1546 return p->pBt->pageSize; 1547 } 1548 int sqlite3BtreeGetReserve(Btree *p){ 1549 int n; 1550 sqlite3BtreeEnter(p); 1551 n = p->pBt->pageSize - p->pBt->usableSize; 1552 sqlite3BtreeLeave(p); 1553 return n; 1554 } 1555 1556 /* 1557 ** Set the maximum page count for a database if mxPage is positive. 1558 ** No changes are made if mxPage is 0 or negative. 1559 ** Regardless of the value of mxPage, return the maximum page count. 1560 */ 1561 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){ 1562 int n; 1563 sqlite3BtreeEnter(p); 1564 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 1565 sqlite3BtreeLeave(p); 1566 return n; 1567 } 1568 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */ 1569 1570 /* 1571 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 1572 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 1573 ** is disabled. The default value for the auto-vacuum property is 1574 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 1575 */ 1576 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 1577 #ifdef SQLITE_OMIT_AUTOVACUUM 1578 return SQLITE_READONLY; 1579 #else 1580 BtShared *pBt = p->pBt; 1581 int rc = SQLITE_OK; 1582 int av = (autoVacuum?1:0); 1583 1584 sqlite3BtreeEnter(p); 1585 if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){ 1586 rc = SQLITE_READONLY; 1587 }else{ 1588 pBt->autoVacuum = av; 1589 } 1590 sqlite3BtreeLeave(p); 1591 return rc; 1592 #endif 1593 } 1594 1595 /* 1596 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 1597 ** enabled 1 is returned. Otherwise 0. 1598 */ 1599 int sqlite3BtreeGetAutoVacuum(Btree *p){ 1600 #ifdef SQLITE_OMIT_AUTOVACUUM 1601 return BTREE_AUTOVACUUM_NONE; 1602 #else 1603 int rc; 1604 sqlite3BtreeEnter(p); 1605 rc = ( 1606 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 1607 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 1608 BTREE_AUTOVACUUM_INCR 1609 ); 1610 sqlite3BtreeLeave(p); 1611 return rc; 1612 #endif 1613 } 1614 1615 1616 /* 1617 ** Get a reference to pPage1 of the database file. This will 1618 ** also acquire a readlock on that file. 1619 ** 1620 ** SQLITE_OK is returned on success. If the file is not a 1621 ** well-formed database file, then SQLITE_CORRUPT is returned. 1622 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 1623 ** is returned if we run out of memory. 1624 */ 1625 static int lockBtree(BtShared *pBt){ 1626 int rc, pageSize; 1627 MemPage *pPage1; 1628 1629 assert( sqlite3_mutex_held(pBt->mutex) ); 1630 if( pBt->pPage1 ) return SQLITE_OK; 1631 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0); 1632 if( rc!=SQLITE_OK ) return rc; 1633 1634 1635 /* Do some checking to help insure the file we opened really is 1636 ** a valid database file. 1637 */ 1638 rc = SQLITE_NOTADB; 1639 if( sqlite3PagerPagecount(pBt->pPager)>0 ){ 1640 u8 *page1 = pPage1->aData; 1641 if( memcmp(page1, zMagicHeader, 16)!=0 ){ 1642 goto page1_init_failed; 1643 } 1644 if( page1[18]>1 ){ 1645 pBt->readOnly = 1; 1646 } 1647 if( page1[19]>1 ){ 1648 goto page1_init_failed; 1649 } 1650 pageSize = get2byte(&page1[16]); 1651 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 || 1652 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE) 1653 ){ 1654 goto page1_init_failed; 1655 } 1656 assert( (pageSize & 7)==0 ); 1657 pBt->pageSize = pageSize; 1658 pBt->usableSize = pageSize - page1[20]; 1659 if( pBt->usableSize<500 ){ 1660 goto page1_init_failed; 1661 } 1662 pBt->maxEmbedFrac = page1[21]; 1663 pBt->minEmbedFrac = page1[22]; 1664 pBt->minLeafFrac = page1[23]; 1665 #ifndef SQLITE_OMIT_AUTOVACUUM 1666 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); 1667 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); 1668 #endif 1669 } 1670 1671 /* maxLocal is the maximum amount of payload to store locally for 1672 ** a cell. Make sure it is small enough so that at least minFanout 1673 ** cells can will fit on one page. We assume a 10-byte page header. 1674 ** Besides the payload, the cell must store: 1675 ** 2-byte pointer to the cell 1676 ** 4-byte child pointer 1677 ** 9-byte nKey value 1678 ** 4-byte nData value 1679 ** 4-byte overflow page pointer 1680 ** So a cell consists of a 2-byte poiner, a header which is as much as 1681 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow 1682 ** page pointer. 1683 */ 1684 pBt->maxLocal = (pBt->usableSize-12)*pBt->maxEmbedFrac/255 - 23; 1685 pBt->minLocal = (pBt->usableSize-12)*pBt->minEmbedFrac/255 - 23; 1686 pBt->maxLeaf = pBt->usableSize - 35; 1687 pBt->minLeaf = (pBt->usableSize-12)*pBt->minLeafFrac/255 - 23; 1688 if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){ 1689 goto page1_init_failed; 1690 } 1691 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); 1692 pBt->pPage1 = pPage1; 1693 return SQLITE_OK; 1694 1695 page1_init_failed: 1696 releasePage(pPage1); 1697 pBt->pPage1 = 0; 1698 return rc; 1699 } 1700 1701 /* 1702 ** This routine works like lockBtree() except that it also invokes the 1703 ** busy callback if there is lock contention. 1704 */ 1705 static int lockBtreeWithRetry(Btree *pRef){ 1706 int rc = SQLITE_OK; 1707 1708 assert( sqlite3BtreeHoldsMutex(pRef) ); 1709 if( pRef->inTrans==TRANS_NONE ){ 1710 u8 inTransaction = pRef->pBt->inTransaction; 1711 btreeIntegrity(pRef); 1712 rc = sqlite3BtreeBeginTrans(pRef, 0); 1713 pRef->pBt->inTransaction = inTransaction; 1714 pRef->inTrans = TRANS_NONE; 1715 if( rc==SQLITE_OK ){ 1716 pRef->pBt->nTransaction--; 1717 } 1718 btreeIntegrity(pRef); 1719 } 1720 return rc; 1721 } 1722 1723 1724 /* 1725 ** If there are no outstanding cursors and we are not in the middle 1726 ** of a transaction but there is a read lock on the database, then 1727 ** this routine unrefs the first page of the database file which 1728 ** has the effect of releasing the read lock. 1729 ** 1730 ** If there are any outstanding cursors, this routine is a no-op. 1731 ** 1732 ** If there is a transaction in progress, this routine is a no-op. 1733 */ 1734 static void unlockBtreeIfUnused(BtShared *pBt){ 1735 assert( sqlite3_mutex_held(pBt->mutex) ); 1736 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){ 1737 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){ 1738 if( pBt->pPage1->aData==0 ){ 1739 MemPage *pPage = pBt->pPage1; 1740 pPage->aData = sqlite3PagerGetData(pPage->pDbPage); 1741 pPage->pBt = pBt; 1742 pPage->pgno = 1; 1743 } 1744 releasePage(pBt->pPage1); 1745 } 1746 pBt->pPage1 = 0; 1747 pBt->inStmt = 0; 1748 } 1749 } 1750 1751 /* 1752 ** Create a new database by initializing the first page of the 1753 ** file. 1754 */ 1755 static int newDatabase(BtShared *pBt){ 1756 MemPage *pP1; 1757 unsigned char *data; 1758 int rc; 1759 1760 assert( sqlite3_mutex_held(pBt->mutex) ); 1761 if( sqlite3PagerPagecount(pBt->pPager)>0 ) return SQLITE_OK; 1762 pP1 = pBt->pPage1; 1763 assert( pP1!=0 ); 1764 data = pP1->aData; 1765 rc = sqlite3PagerWrite(pP1->pDbPage); 1766 if( rc ) return rc; 1767 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 1768 assert( sizeof(zMagicHeader)==16 ); 1769 put2byte(&data[16], pBt->pageSize); 1770 data[18] = 1; 1771 data[19] = 1; 1772 data[20] = pBt->pageSize - pBt->usableSize; 1773 data[21] = pBt->maxEmbedFrac; 1774 data[22] = pBt->minEmbedFrac; 1775 data[23] = pBt->minLeafFrac; 1776 memset(&data[24], 0, 100-24); 1777 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 1778 pBt->pageSizeFixed = 1; 1779 #ifndef SQLITE_OMIT_AUTOVACUUM 1780 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 1781 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 1782 put4byte(&data[36 + 4*4], pBt->autoVacuum); 1783 put4byte(&data[36 + 7*4], pBt->incrVacuum); 1784 #endif 1785 return SQLITE_OK; 1786 } 1787 1788 /* 1789 ** Attempt to start a new transaction. A write-transaction 1790 ** is started if the second argument is nonzero, otherwise a read- 1791 ** transaction. If the second argument is 2 or more and exclusive 1792 ** transaction is started, meaning that no other process is allowed 1793 ** to access the database. A preexisting transaction may not be 1794 ** upgraded to exclusive by calling this routine a second time - the 1795 ** exclusivity flag only works for a new transaction. 1796 ** 1797 ** A write-transaction must be started before attempting any 1798 ** changes to the database. None of the following routines 1799 ** will work unless a transaction is started first: 1800 ** 1801 ** sqlite3BtreeCreateTable() 1802 ** sqlite3BtreeCreateIndex() 1803 ** sqlite3BtreeClearTable() 1804 ** sqlite3BtreeDropTable() 1805 ** sqlite3BtreeInsert() 1806 ** sqlite3BtreeDelete() 1807 ** sqlite3BtreeUpdateMeta() 1808 ** 1809 ** If an initial attempt to acquire the lock fails because of lock contention 1810 ** and the database was previously unlocked, then invoke the busy handler 1811 ** if there is one. But if there was previously a read-lock, do not 1812 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is 1813 ** returned when there is already a read-lock in order to avoid a deadlock. 1814 ** 1815 ** Suppose there are two processes A and B. A has a read lock and B has 1816 ** a reserved lock. B tries to promote to exclusive but is blocked because 1817 ** of A's read lock. A tries to promote to reserved but is blocked by B. 1818 ** One or the other of the two processes must give way or there can be 1819 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback 1820 ** when A already has a read lock, we encourage A to give up and let B 1821 ** proceed. 1822 */ 1823 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ 1824 BtShared *pBt = p->pBt; 1825 int rc = SQLITE_OK; 1826 1827 sqlite3BtreeEnter(p); 1828 btreeIntegrity(p); 1829 1830 /* If the btree is already in a write-transaction, or it 1831 ** is already in a read-transaction and a read-transaction 1832 ** is requested, this is a no-op. 1833 */ 1834 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 1835 goto trans_begun; 1836 } 1837 1838 /* Write transactions are not possible on a read-only database */ 1839 if( pBt->readOnly && wrflag ){ 1840 rc = SQLITE_READONLY; 1841 goto trans_begun; 1842 } 1843 1844 /* If another database handle has already opened a write transaction 1845 ** on this shared-btree structure and a second write transaction is 1846 ** requested, return SQLITE_BUSY. 1847 */ 1848 if( pBt->inTransaction==TRANS_WRITE && wrflag ){ 1849 rc = SQLITE_BUSY; 1850 goto trans_begun; 1851 } 1852 1853 do { 1854 if( pBt->pPage1==0 ){ 1855 rc = lockBtree(pBt); 1856 } 1857 1858 if( rc==SQLITE_OK && wrflag ){ 1859 if( pBt->readOnly ){ 1860 rc = SQLITE_READONLY; 1861 }else{ 1862 rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1); 1863 if( rc==SQLITE_OK ){ 1864 rc = newDatabase(pBt); 1865 } 1866 } 1867 } 1868 1869 if( rc==SQLITE_OK ){ 1870 if( wrflag ) pBt->inStmt = 0; 1871 }else{ 1872 unlockBtreeIfUnused(pBt); 1873 } 1874 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 1875 sqlite3InvokeBusyHandler(pBt->pBusyHandler) ); 1876 1877 if( rc==SQLITE_OK ){ 1878 if( p->inTrans==TRANS_NONE ){ 1879 pBt->nTransaction++; 1880 } 1881 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 1882 if( p->inTrans>pBt->inTransaction ){ 1883 pBt->inTransaction = p->inTrans; 1884 } 1885 } 1886 1887 1888 trans_begun: 1889 btreeIntegrity(p); 1890 sqlite3BtreeLeave(p); 1891 return rc; 1892 } 1893 1894 #ifndef SQLITE_OMIT_AUTOVACUUM 1895 1896 /* 1897 ** Set the pointer-map entries for all children of page pPage. Also, if 1898 ** pPage contains cells that point to overflow pages, set the pointer 1899 ** map entries for the overflow pages as well. 1900 */ 1901 static int setChildPtrmaps(MemPage *pPage){ 1902 int i; /* Counter variable */ 1903 int nCell; /* Number of cells in page pPage */ 1904 int rc; /* Return code */ 1905 BtShared *pBt = pPage->pBt; 1906 int isInitOrig = pPage->isInit; 1907 Pgno pgno = pPage->pgno; 1908 1909 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1910 rc = sqlite3BtreeInitPage(pPage, pPage->pParent); 1911 if( rc!=SQLITE_OK ){ 1912 goto set_child_ptrmaps_out; 1913 } 1914 nCell = pPage->nCell; 1915 1916 for(i=0; i<nCell; i++){ 1917 u8 *pCell = findCell(pPage, i); 1918 1919 rc = ptrmapPutOvflPtr(pPage, pCell); 1920 if( rc!=SQLITE_OK ){ 1921 goto set_child_ptrmaps_out; 1922 } 1923 1924 if( !pPage->leaf ){ 1925 Pgno childPgno = get4byte(pCell); 1926 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno); 1927 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out; 1928 } 1929 } 1930 1931 if( !pPage->leaf ){ 1932 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 1933 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno); 1934 } 1935 1936 set_child_ptrmaps_out: 1937 pPage->isInit = isInitOrig; 1938 return rc; 1939 } 1940 1941 /* 1942 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow 1943 ** page, is a pointer to page iFrom. Modify this pointer so that it points to 1944 ** iTo. Parameter eType describes the type of pointer to be modified, as 1945 ** follows: 1946 ** 1947 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 1948 ** page of pPage. 1949 ** 1950 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 1951 ** page pointed to by one of the cells on pPage. 1952 ** 1953 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 1954 ** overflow page in the list. 1955 */ 1956 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 1957 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1958 if( eType==PTRMAP_OVERFLOW2 ){ 1959 /* The pointer is always the first 4 bytes of the page in this case. */ 1960 if( get4byte(pPage->aData)!=iFrom ){ 1961 return SQLITE_CORRUPT_BKPT; 1962 } 1963 put4byte(pPage->aData, iTo); 1964 }else{ 1965 int isInitOrig = pPage->isInit; 1966 int i; 1967 int nCell; 1968 1969 sqlite3BtreeInitPage(pPage, 0); 1970 nCell = pPage->nCell; 1971 1972 for(i=0; i<nCell; i++){ 1973 u8 *pCell = findCell(pPage, i); 1974 if( eType==PTRMAP_OVERFLOW1 ){ 1975 CellInfo info; 1976 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 1977 if( info.iOverflow ){ 1978 if( iFrom==get4byte(&pCell[info.iOverflow]) ){ 1979 put4byte(&pCell[info.iOverflow], iTo); 1980 break; 1981 } 1982 } 1983 }else{ 1984 if( get4byte(pCell)==iFrom ){ 1985 put4byte(pCell, iTo); 1986 break; 1987 } 1988 } 1989 } 1990 1991 if( i==nCell ){ 1992 if( eType!=PTRMAP_BTREE || 1993 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 1994 return SQLITE_CORRUPT_BKPT; 1995 } 1996 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 1997 } 1998 1999 pPage->isInit = isInitOrig; 2000 } 2001 return SQLITE_OK; 2002 } 2003 2004 2005 /* 2006 ** Move the open database page pDbPage to location iFreePage in the 2007 ** database. The pDbPage reference remains valid. 2008 */ 2009 static int relocatePage( 2010 BtShared *pBt, /* Btree */ 2011 MemPage *pDbPage, /* Open page to move */ 2012 u8 eType, /* Pointer map 'type' entry for pDbPage */ 2013 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 2014 Pgno iFreePage /* The location to move pDbPage to */ 2015 ){ 2016 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 2017 Pgno iDbPage = pDbPage->pgno; 2018 Pager *pPager = pBt->pPager; 2019 int rc; 2020 2021 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 2022 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 2023 assert( sqlite3_mutex_held(pBt->mutex) ); 2024 assert( pDbPage->pBt==pBt ); 2025 2026 /* Move page iDbPage from it's current location to page number iFreePage */ 2027 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 2028 iDbPage, iFreePage, iPtrPage, eType)); 2029 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage); 2030 if( rc!=SQLITE_OK ){ 2031 return rc; 2032 } 2033 pDbPage->pgno = iFreePage; 2034 2035 /* If pDbPage was a btree-page, then it may have child pages and/or cells 2036 ** that point to overflow pages. The pointer map entries for all these 2037 ** pages need to be changed. 2038 ** 2039 ** If pDbPage is an overflow page, then the first 4 bytes may store a 2040 ** pointer to a subsequent overflow page. If this is the case, then 2041 ** the pointer map needs to be updated for the subsequent overflow page. 2042 */ 2043 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 2044 rc = setChildPtrmaps(pDbPage); 2045 if( rc!=SQLITE_OK ){ 2046 return rc; 2047 } 2048 }else{ 2049 Pgno nextOvfl = get4byte(pDbPage->aData); 2050 if( nextOvfl!=0 ){ 2051 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage); 2052 if( rc!=SQLITE_OK ){ 2053 return rc; 2054 } 2055 } 2056 } 2057 2058 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 2059 ** that it points at iFreePage. Also fix the pointer map entry for 2060 ** iPtrPage. 2061 */ 2062 if( eType!=PTRMAP_ROOTPAGE ){ 2063 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 2064 if( rc!=SQLITE_OK ){ 2065 return rc; 2066 } 2067 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 2068 if( rc!=SQLITE_OK ){ 2069 releasePage(pPtrPage); 2070 return rc; 2071 } 2072 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 2073 releasePage(pPtrPage); 2074 if( rc==SQLITE_OK ){ 2075 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage); 2076 } 2077 } 2078 return rc; 2079 } 2080 2081 /* Forward declaration required by incrVacuumStep(). */ 2082 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 2083 2084 /* 2085 ** Perform a single step of an incremental-vacuum. If successful, 2086 ** return SQLITE_OK. If there is no work to do (and therefore no 2087 ** point in calling this function again), return SQLITE_DONE. 2088 ** 2089 ** More specificly, this function attempts to re-organize the 2090 ** database so that the last page of the file currently in use 2091 ** is no longer in use. 2092 ** 2093 ** If the nFin parameter is non-zero, the implementation assumes 2094 ** that the caller will keep calling incrVacuumStep() until 2095 ** it returns SQLITE_DONE or an error, and that nFin is the 2096 ** number of pages the database file will contain after this 2097 ** process is complete. 2098 */ 2099 static int incrVacuumStep(BtShared *pBt, Pgno nFin){ 2100 Pgno iLastPg; /* Last page in the database */ 2101 Pgno nFreeList; /* Number of pages still on the free-list */ 2102 2103 assert( sqlite3_mutex_held(pBt->mutex) ); 2104 iLastPg = pBt->nTrunc; 2105 if( iLastPg==0 ){ 2106 iLastPg = sqlite3PagerPagecount(pBt->pPager); 2107 } 2108 2109 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ 2110 int rc; 2111 u8 eType; 2112 Pgno iPtrPage; 2113 2114 nFreeList = get4byte(&pBt->pPage1->aData[36]); 2115 if( nFreeList==0 || nFin==iLastPg ){ 2116 return SQLITE_DONE; 2117 } 2118 2119 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); 2120 if( rc!=SQLITE_OK ){ 2121 return rc; 2122 } 2123 if( eType==PTRMAP_ROOTPAGE ){ 2124 return SQLITE_CORRUPT_BKPT; 2125 } 2126 2127 if( eType==PTRMAP_FREEPAGE ){ 2128 if( nFin==0 ){ 2129 /* Remove the page from the files free-list. This is not required 2130 ** if nFin is non-zero. In that case, the free-list will be 2131 ** truncated to zero after this function returns, so it doesn't 2132 ** matter if it still contains some garbage entries. 2133 */ 2134 Pgno iFreePg; 2135 MemPage *pFreePg; 2136 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1); 2137 if( rc!=SQLITE_OK ){ 2138 return rc; 2139 } 2140 assert( iFreePg==iLastPg ); 2141 releasePage(pFreePg); 2142 } 2143 } else { 2144 Pgno iFreePg; /* Index of free page to move pLastPg to */ 2145 MemPage *pLastPg; 2146 2147 rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0); 2148 if( rc!=SQLITE_OK ){ 2149 return rc; 2150 } 2151 2152 /* If nFin is zero, this loop runs exactly once and page pLastPg 2153 ** is swapped with the first free page pulled off the free list. 2154 ** 2155 ** On the other hand, if nFin is greater than zero, then keep 2156 ** looping until a free-page located within the first nFin pages 2157 ** of the file is found. 2158 */ 2159 do { 2160 MemPage *pFreePg; 2161 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0); 2162 if( rc!=SQLITE_OK ){ 2163 releasePage(pLastPg); 2164 return rc; 2165 } 2166 releasePage(pFreePg); 2167 }while( nFin!=0 && iFreePg>nFin ); 2168 assert( iFreePg<iLastPg ); 2169 2170 rc = sqlite3PagerWrite(pLastPg->pDbPage); 2171 if( rc==SQLITE_OK ){ 2172 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg); 2173 } 2174 releasePage(pLastPg); 2175 if( rc!=SQLITE_OK ){ 2176 return rc; 2177 } 2178 } 2179 } 2180 2181 pBt->nTrunc = iLastPg - 1; 2182 while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){ 2183 pBt->nTrunc--; 2184 } 2185 return SQLITE_OK; 2186 } 2187 2188 /* 2189 ** A write-transaction must be opened before calling this function. 2190 ** It performs a single unit of work towards an incremental vacuum. 2191 ** 2192 ** If the incremental vacuum is finished after this function has run, 2193 ** SQLITE_DONE is returned. If it is not finished, but no error occured, 2194 ** SQLITE_OK is returned. Otherwise an SQLite error code. 2195 */ 2196 int sqlite3BtreeIncrVacuum(Btree *p){ 2197 int rc; 2198 BtShared *pBt = p->pBt; 2199 2200 sqlite3BtreeEnter(p); 2201 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); 2202 if( !pBt->autoVacuum ){ 2203 rc = SQLITE_DONE; 2204 }else{ 2205 invalidateAllOverflowCache(pBt); 2206 rc = incrVacuumStep(pBt, 0); 2207 } 2208 sqlite3BtreeLeave(p); 2209 return rc; 2210 } 2211 2212 /* 2213 ** This routine is called prior to sqlite3PagerCommit when a transaction 2214 ** is commited for an auto-vacuum database. 2215 ** 2216 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages 2217 ** the database file should be truncated to during the commit process. 2218 ** i.e. the database has been reorganized so that only the first *pnTrunc 2219 ** pages are in use. 2220 */ 2221 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){ 2222 int rc = SQLITE_OK; 2223 Pager *pPager = pBt->pPager; 2224 #ifndef NDEBUG 2225 int nRef = sqlite3PagerRefcount(pPager); 2226 #endif 2227 2228 assert( sqlite3_mutex_held(pBt->mutex) ); 2229 invalidateAllOverflowCache(pBt); 2230 assert(pBt->autoVacuum); 2231 if( !pBt->incrVacuum ){ 2232 Pgno nFin = 0; 2233 2234 if( pBt->nTrunc==0 ){ 2235 Pgno nFree; 2236 Pgno nPtrmap; 2237 const int pgsz = pBt->pageSize; 2238 Pgno nOrig = sqlite3PagerPagecount(pBt->pPager); 2239 2240 if( PTRMAP_ISPAGE(pBt, nOrig) ){ 2241 return SQLITE_CORRUPT_BKPT; 2242 } 2243 if( nOrig==PENDING_BYTE_PAGE(pBt) ){ 2244 nOrig--; 2245 } 2246 nFree = get4byte(&pBt->pPage1->aData[36]); 2247 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5); 2248 nFin = nOrig - nFree - nPtrmap; 2249 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){ 2250 nFin--; 2251 } 2252 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 2253 nFin--; 2254 } 2255 } 2256 2257 while( rc==SQLITE_OK ){ 2258 rc = incrVacuumStep(pBt, nFin); 2259 } 2260 if( rc==SQLITE_DONE ){ 2261 assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc); 2262 rc = SQLITE_OK; 2263 if( pBt->nTrunc ){ 2264 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 2265 put4byte(&pBt->pPage1->aData[32], 0); 2266 put4byte(&pBt->pPage1->aData[36], 0); 2267 pBt->nTrunc = nFin; 2268 } 2269 } 2270 if( rc!=SQLITE_OK ){ 2271 sqlite3PagerRollback(pPager); 2272 } 2273 } 2274 2275 if( rc==SQLITE_OK ){ 2276 *pnTrunc = pBt->nTrunc; 2277 pBt->nTrunc = 0; 2278 } 2279 assert( nRef==sqlite3PagerRefcount(pPager) ); 2280 return rc; 2281 } 2282 2283 #endif 2284 2285 /* 2286 ** This routine does the first phase of a two-phase commit. This routine 2287 ** causes a rollback journal to be created (if it does not already exist) 2288 ** and populated with enough information so that if a power loss occurs 2289 ** the database can be restored to its original state by playing back 2290 ** the journal. Then the contents of the journal are flushed out to 2291 ** the disk. After the journal is safely on oxide, the changes to the 2292 ** database are written into the database file and flushed to oxide. 2293 ** At the end of this call, the rollback journal still exists on the 2294 ** disk and we are still holding all locks, so the transaction has not 2295 ** committed. See sqlite3BtreeCommit() for the second phase of the 2296 ** commit process. 2297 ** 2298 ** This call is a no-op if no write-transaction is currently active on pBt. 2299 ** 2300 ** Otherwise, sync the database file for the btree pBt. zMaster points to 2301 ** the name of a master journal file that should be written into the 2302 ** individual journal file, or is NULL, indicating no master journal file 2303 ** (single database transaction). 2304 ** 2305 ** When this is called, the master journal should already have been 2306 ** created, populated with this journal pointer and synced to disk. 2307 ** 2308 ** Once this is routine has returned, the only thing required to commit 2309 ** the write-transaction for this database file is to delete the journal. 2310 */ 2311 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){ 2312 int rc = SQLITE_OK; 2313 if( p->inTrans==TRANS_WRITE ){ 2314 BtShared *pBt = p->pBt; 2315 Pgno nTrunc = 0; 2316 sqlite3BtreeEnter(p); 2317 #ifndef SQLITE_OMIT_AUTOVACUUM 2318 if( pBt->autoVacuum ){ 2319 rc = autoVacuumCommit(pBt, &nTrunc); 2320 if( rc!=SQLITE_OK ){ 2321 sqlite3BtreeLeave(p); 2322 return rc; 2323 } 2324 } 2325 #endif 2326 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc); 2327 sqlite3BtreeLeave(p); 2328 } 2329 return rc; 2330 } 2331 2332 /* 2333 ** Commit the transaction currently in progress. 2334 ** 2335 ** This routine implements the second phase of a 2-phase commit. The 2336 ** sqlite3BtreeSync() routine does the first phase and should be invoked 2337 ** prior to calling this routine. The sqlite3BtreeSync() routine did 2338 ** all the work of writing information out to disk and flushing the 2339 ** contents so that they are written onto the disk platter. All this 2340 ** routine has to do is delete or truncate the rollback journal 2341 ** (which causes the transaction to commit) and drop locks. 2342 ** 2343 ** This will release the write lock on the database file. If there 2344 ** are no active cursors, it also releases the read lock. 2345 */ 2346 int sqlite3BtreeCommitPhaseTwo(Btree *p){ 2347 BtShared *pBt = p->pBt; 2348 2349 sqlite3BtreeEnter(p); 2350 btreeIntegrity(p); 2351 2352 /* If the handle has a write-transaction open, commit the shared-btrees 2353 ** transaction and set the shared state to TRANS_READ. 2354 */ 2355 if( p->inTrans==TRANS_WRITE ){ 2356 int rc; 2357 assert( pBt->inTransaction==TRANS_WRITE ); 2358 assert( pBt->nTransaction>0 ); 2359 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 2360 if( rc!=SQLITE_OK ){ 2361 sqlite3BtreeLeave(p); 2362 return rc; 2363 } 2364 pBt->inTransaction = TRANS_READ; 2365 pBt->inStmt = 0; 2366 } 2367 unlockAllTables(p); 2368 2369 /* If the handle has any kind of transaction open, decrement the transaction 2370 ** count of the shared btree. If the transaction count reaches 0, set 2371 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below 2372 ** will unlock the pager. 2373 */ 2374 if( p->inTrans!=TRANS_NONE ){ 2375 pBt->nTransaction--; 2376 if( 0==pBt->nTransaction ){ 2377 pBt->inTransaction = TRANS_NONE; 2378 } 2379 } 2380 2381 /* Set the handles current transaction state to TRANS_NONE and unlock 2382 ** the pager if this call closed the only read or write transaction. 2383 */ 2384 p->inTrans = TRANS_NONE; 2385 unlockBtreeIfUnused(pBt); 2386 2387 btreeIntegrity(p); 2388 sqlite3BtreeLeave(p); 2389 return SQLITE_OK; 2390 } 2391 2392 /* 2393 ** Do both phases of a commit. 2394 */ 2395 int sqlite3BtreeCommit(Btree *p){ 2396 int rc; 2397 sqlite3BtreeEnter(p); 2398 rc = sqlite3BtreeCommitPhaseOne(p, 0); 2399 if( rc==SQLITE_OK ){ 2400 rc = sqlite3BtreeCommitPhaseTwo(p); 2401 } 2402 sqlite3BtreeLeave(p); 2403 return rc; 2404 } 2405 2406 #ifndef NDEBUG 2407 /* 2408 ** Return the number of write-cursors open on this handle. This is for use 2409 ** in assert() expressions, so it is only compiled if NDEBUG is not 2410 ** defined. 2411 ** 2412 ** For the purposes of this routine, a write-cursor is any cursor that 2413 ** is capable of writing to the databse. That means the cursor was 2414 ** originally opened for writing and the cursor has not be disabled 2415 ** by having its state changed to CURSOR_FAULT. 2416 */ 2417 static int countWriteCursors(BtShared *pBt){ 2418 BtCursor *pCur; 2419 int r = 0; 2420 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 2421 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; 2422 } 2423 return r; 2424 } 2425 #endif 2426 2427 /* 2428 ** This routine sets the state to CURSOR_FAULT and the error 2429 ** code to errCode for every cursor on BtShared that pBtree 2430 ** references. 2431 ** 2432 ** Every cursor is tripped, including cursors that belong 2433 ** to other database connections that happen to be sharing 2434 ** the cache with pBtree. 2435 ** 2436 ** This routine gets called when a rollback occurs. 2437 ** All cursors using the same cache must be tripped 2438 ** to prevent them from trying to use the btree after 2439 ** the rollback. The rollback may have deleted tables 2440 ** or moved root pages, so it is not sufficient to 2441 ** save the state of the cursor. The cursor must be 2442 ** invalidated. 2443 */ 2444 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){ 2445 BtCursor *p; 2446 sqlite3BtreeEnter(pBtree); 2447 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 2448 clearCursorPosition(p); 2449 p->eState = CURSOR_FAULT; 2450 p->skip = errCode; 2451 } 2452 sqlite3BtreeLeave(pBtree); 2453 } 2454 2455 /* 2456 ** Rollback the transaction in progress. All cursors will be 2457 ** invalided by this operation. Any attempt to use a cursor 2458 ** that was open at the beginning of this operation will result 2459 ** in an error. 2460 ** 2461 ** This will release the write lock on the database file. If there 2462 ** are no active cursors, it also releases the read lock. 2463 */ 2464 int sqlite3BtreeRollback(Btree *p){ 2465 int rc; 2466 BtShared *pBt = p->pBt; 2467 MemPage *pPage1; 2468 2469 sqlite3BtreeEnter(p); 2470 rc = saveAllCursors(pBt, 0, 0); 2471 #ifndef SQLITE_OMIT_SHARED_CACHE 2472 if( rc!=SQLITE_OK ){ 2473 /* This is a horrible situation. An IO or malloc() error occured whilst 2474 ** trying to save cursor positions. If this is an automatic rollback (as 2475 ** the result of a constraint, malloc() failure or IO error) then 2476 ** the cache may be internally inconsistent (not contain valid trees) so 2477 ** we cannot simply return the error to the caller. Instead, abort 2478 ** all queries that may be using any of the cursors that failed to save. 2479 */ 2480 sqlite3BtreeTripAllCursors(p, rc); 2481 } 2482 #endif 2483 btreeIntegrity(p); 2484 unlockAllTables(p); 2485 2486 if( p->inTrans==TRANS_WRITE ){ 2487 int rc2; 2488 2489 #ifndef SQLITE_OMIT_AUTOVACUUM 2490 pBt->nTrunc = 0; 2491 #endif 2492 2493 assert( TRANS_WRITE==pBt->inTransaction ); 2494 rc2 = sqlite3PagerRollback(pBt->pPager); 2495 if( rc2!=SQLITE_OK ){ 2496 rc = rc2; 2497 } 2498 2499 /* The rollback may have destroyed the pPage1->aData value. So 2500 ** call sqlite3BtreeGetPage() on page 1 again to make 2501 ** sure pPage1->aData is set correctly. */ 2502 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 2503 releasePage(pPage1); 2504 } 2505 assert( countWriteCursors(pBt)==0 ); 2506 pBt->inTransaction = TRANS_READ; 2507 } 2508 2509 if( p->inTrans!=TRANS_NONE ){ 2510 assert( pBt->nTransaction>0 ); 2511 pBt->nTransaction--; 2512 if( 0==pBt->nTransaction ){ 2513 pBt->inTransaction = TRANS_NONE; 2514 } 2515 } 2516 2517 p->inTrans = TRANS_NONE; 2518 pBt->inStmt = 0; 2519 unlockBtreeIfUnused(pBt); 2520 2521 btreeIntegrity(p); 2522 sqlite3BtreeLeave(p); 2523 return rc; 2524 } 2525 2526 /* 2527 ** Start a statement subtransaction. The subtransaction can 2528 ** can be rolled back independently of the main transaction. 2529 ** You must start a transaction before starting a subtransaction. 2530 ** The subtransaction is ended automatically if the main transaction 2531 ** commits or rolls back. 2532 ** 2533 ** Only one subtransaction may be active at a time. It is an error to try 2534 ** to start a new subtransaction if another subtransaction is already active. 2535 ** 2536 ** Statement subtransactions are used around individual SQL statements 2537 ** that are contained within a BEGIN...COMMIT block. If a constraint 2538 ** error occurs within the statement, the effect of that one statement 2539 ** can be rolled back without having to rollback the entire transaction. 2540 */ 2541 int sqlite3BtreeBeginStmt(Btree *p){ 2542 int rc; 2543 BtShared *pBt = p->pBt; 2544 sqlite3BtreeEnter(p); 2545 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){ 2546 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2547 }else{ 2548 assert( pBt->inTransaction==TRANS_WRITE ); 2549 rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager); 2550 pBt->inStmt = 1; 2551 } 2552 sqlite3BtreeLeave(p); 2553 return rc; 2554 } 2555 2556 2557 /* 2558 ** Commit the statment subtransaction currently in progress. If no 2559 ** subtransaction is active, this is a no-op. 2560 */ 2561 int sqlite3BtreeCommitStmt(Btree *p){ 2562 int rc; 2563 BtShared *pBt = p->pBt; 2564 sqlite3BtreeEnter(p); 2565 if( pBt->inStmt && !pBt->readOnly ){ 2566 rc = sqlite3PagerStmtCommit(pBt->pPager); 2567 }else{ 2568 rc = SQLITE_OK; 2569 } 2570 pBt->inStmt = 0; 2571 sqlite3BtreeLeave(p); 2572 return rc; 2573 } 2574 2575 /* 2576 ** Rollback the active statement subtransaction. If no subtransaction 2577 ** is active this routine is a no-op. 2578 ** 2579 ** All cursors will be invalidated by this operation. Any attempt 2580 ** to use a cursor that was open at the beginning of this operation 2581 ** will result in an error. 2582 */ 2583 int sqlite3BtreeRollbackStmt(Btree *p){ 2584 int rc = SQLITE_OK; 2585 BtShared *pBt = p->pBt; 2586 sqlite3BtreeEnter(p); 2587 if( pBt->inStmt && !pBt->readOnly ){ 2588 rc = sqlite3PagerStmtRollback(pBt->pPager); 2589 assert( countWriteCursors(pBt)==0 ); 2590 pBt->inStmt = 0; 2591 } 2592 sqlite3BtreeLeave(p); 2593 return rc; 2594 } 2595 2596 /* 2597 ** Default key comparison function to be used if no comparison function 2598 ** is specified on the sqlite3BtreeCursor() call. 2599 */ 2600 static int dfltCompare( 2601 void *NotUsed, /* User data is not used */ 2602 int n1, const void *p1, /* First key to compare */ 2603 int n2, const void *p2 /* Second key to compare */ 2604 ){ 2605 int c; 2606 c = memcmp(p1, p2, n1<n2 ? n1 : n2); 2607 if( c==0 ){ 2608 c = n1 - n2; 2609 } 2610 return c; 2611 } 2612 2613 /* 2614 ** Create a new cursor for the BTree whose root is on the page 2615 ** iTable. The act of acquiring a cursor gets a read lock on 2616 ** the database file. 2617 ** 2618 ** If wrFlag==0, then the cursor can only be used for reading. 2619 ** If wrFlag==1, then the cursor can be used for reading or for 2620 ** writing if other conditions for writing are also met. These 2621 ** are the conditions that must be met in order for writing to 2622 ** be allowed: 2623 ** 2624 ** 1: The cursor must have been opened with wrFlag==1 2625 ** 2626 ** 2: Other database connections that share the same pager cache 2627 ** but which are not in the READ_UNCOMMITTED state may not have 2628 ** cursors open with wrFlag==0 on the same table. Otherwise 2629 ** the changes made by this write cursor would be visible to 2630 ** the read cursors in the other database connection. 2631 ** 2632 ** 3: The database must be writable (not on read-only media) 2633 ** 2634 ** 4: There must be an active transaction. 2635 ** 2636 ** No checking is done to make sure that page iTable really is the 2637 ** root page of a b-tree. If it is not, then the cursor acquired 2638 ** will not work correctly. 2639 ** 2640 ** The comparison function must be logically the same for every cursor 2641 ** on a particular table. Changing the comparison function will result 2642 ** in incorrect operations. If the comparison function is NULL, a 2643 ** default comparison function is used. The comparison function is 2644 ** always ignored for INTKEY tables. 2645 */ 2646 static int btreeCursor( 2647 Btree *p, /* The btree */ 2648 int iTable, /* Root page of table to open */ 2649 int wrFlag, /* 1 to write. 0 read-only */ 2650 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */ 2651 void *pArg, /* First arg to xCompare() */ 2652 BtCursor **ppCur /* Write new cursor here */ 2653 ){ 2654 int rc; 2655 BtCursor *pCur; 2656 BtShared *pBt = p->pBt; 2657 2658 assert( sqlite3BtreeHoldsMutex(p) ); 2659 *ppCur = 0; 2660 if( wrFlag ){ 2661 if( pBt->readOnly ){ 2662 return SQLITE_READONLY; 2663 } 2664 if( checkReadLocks(p, iTable, 0) ){ 2665 return SQLITE_LOCKED; 2666 } 2667 } 2668 2669 if( pBt->pPage1==0 ){ 2670 rc = lockBtreeWithRetry(p); 2671 if( rc!=SQLITE_OK ){ 2672 return rc; 2673 } 2674 if( pBt->readOnly && wrFlag ){ 2675 return SQLITE_READONLY; 2676 } 2677 } 2678 pCur = sqlite3MallocZero( sizeof(*pCur) ); 2679 if( pCur==0 ){ 2680 rc = SQLITE_NOMEM; 2681 goto create_cursor_exception; 2682 } 2683 pCur->pgnoRoot = (Pgno)iTable; 2684 if( iTable==1 && sqlite3PagerPagecount(pBt->pPager)==0 ){ 2685 rc = SQLITE_EMPTY; 2686 goto create_cursor_exception; 2687 } 2688 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0); 2689 if( rc!=SQLITE_OK ){ 2690 goto create_cursor_exception; 2691 } 2692 2693 /* Now that no other errors can occur, finish filling in the BtCursor 2694 ** variables, link the cursor into the BtShared list and set *ppCur (the 2695 ** output argument to this function). 2696 */ 2697 pCur->xCompare = xCmp ? xCmp : dfltCompare; 2698 pCur->pArg = pArg; 2699 pCur->pBtree = p; 2700 pCur->pBt = pBt; 2701 pCur->wrFlag = wrFlag; 2702 pCur->pNext = pBt->pCursor; 2703 if( pCur->pNext ){ 2704 pCur->pNext->pPrev = pCur; 2705 } 2706 pBt->pCursor = pCur; 2707 pCur->eState = CURSOR_INVALID; 2708 *ppCur = pCur; 2709 2710 return SQLITE_OK; 2711 2712 create_cursor_exception: 2713 if( pCur ){ 2714 releasePage(pCur->pPage); 2715 sqlite3_free(pCur); 2716 } 2717 unlockBtreeIfUnused(pBt); 2718 return rc; 2719 } 2720 int sqlite3BtreeCursor( 2721 Btree *p, /* The btree */ 2722 int iTable, /* Root page of table to open */ 2723 int wrFlag, /* 1 to write. 0 read-only */ 2724 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */ 2725 void *pArg, /* First arg to xCompare() */ 2726 BtCursor **ppCur /* Write new cursor here */ 2727 ){ 2728 int rc; 2729 sqlite3BtreeEnter(p); 2730 rc = btreeCursor(p, iTable, wrFlag, xCmp, pArg, ppCur); 2731 sqlite3BtreeLeave(p); 2732 return rc; 2733 } 2734 2735 2736 /* 2737 ** Close a cursor. The read lock on the database file is released 2738 ** when the last cursor is closed. 2739 */ 2740 int sqlite3BtreeCloseCursor(BtCursor *pCur){ 2741 BtShared *pBt = pCur->pBt; 2742 Btree *pBtree = pCur->pBtree; 2743 2744 sqlite3BtreeEnter(pBtree); 2745 clearCursorPosition(pCur); 2746 if( pCur->pPrev ){ 2747 pCur->pPrev->pNext = pCur->pNext; 2748 }else{ 2749 pBt->pCursor = pCur->pNext; 2750 } 2751 if( pCur->pNext ){ 2752 pCur->pNext->pPrev = pCur->pPrev; 2753 } 2754 releasePage(pCur->pPage); 2755 unlockBtreeIfUnused(pBt); 2756 invalidateOverflowCache(pCur); 2757 sqlite3_free(pCur); 2758 sqlite3BtreeLeave(pBtree); 2759 return SQLITE_OK; 2760 } 2761 2762 /* 2763 ** Make a temporary cursor by filling in the fields of pTempCur. 2764 ** The temporary cursor is not on the cursor list for the Btree. 2765 */ 2766 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){ 2767 assert( cursorHoldsMutex(pCur) ); 2768 memcpy(pTempCur, pCur, sizeof(*pCur)); 2769 pTempCur->pNext = 0; 2770 pTempCur->pPrev = 0; 2771 if( pTempCur->pPage ){ 2772 sqlite3PagerRef(pTempCur->pPage->pDbPage); 2773 } 2774 } 2775 2776 /* 2777 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor() 2778 ** function above. 2779 */ 2780 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){ 2781 assert( cursorHoldsMutex(pCur) ); 2782 if( pCur->pPage ){ 2783 sqlite3PagerUnref(pCur->pPage->pDbPage); 2784 } 2785 } 2786 2787 /* 2788 ** Make sure the BtCursor* given in the argument has a valid 2789 ** BtCursor.info structure. If it is not already valid, call 2790 ** sqlite3BtreeParseCell() to fill it in. 2791 ** 2792 ** BtCursor.info is a cache of the information in the current cell. 2793 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell(). 2794 ** 2795 ** 2007-06-25: There is a bug in some versions of MSVC that cause the 2796 ** compiler to crash when getCellInfo() is implemented as a macro. 2797 ** But there is a measureable speed advantage to using the macro on gcc 2798 ** (when less compiler optimizations like -Os or -O0 are used and the 2799 ** compiler is not doing agressive inlining.) So we use a real function 2800 ** for MSVC and a macro for everything else. Ticket #2457. 2801 */ 2802 #ifndef NDEBUG 2803 static void assertCellInfo(BtCursor *pCur){ 2804 CellInfo info; 2805 memset(&info, 0, sizeof(info)); 2806 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &info); 2807 assert( memcmp(&info, &pCur->info, sizeof(info))==0 ); 2808 } 2809 #else 2810 #define assertCellInfo(x) 2811 #endif 2812 #ifdef _MSC_VER 2813 /* Use a real function in MSVC to work around bugs in that compiler. */ 2814 static void getCellInfo(BtCursor *pCur){ 2815 if( pCur->info.nSize==0 ){ 2816 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); 2817 }else{ 2818 assertCellInfo(pCur); 2819 } 2820 } 2821 #else /* if not _MSC_VER */ 2822 /* Use a macro in all other compilers so that the function is inlined */ 2823 #define getCellInfo(pCur) \ 2824 if( pCur->info.nSize==0 ){ \ 2825 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); \ 2826 }else{ \ 2827 assertCellInfo(pCur); \ 2828 } 2829 #endif /* _MSC_VER */ 2830 2831 /* 2832 ** Set *pSize to the size of the buffer needed to hold the value of 2833 ** the key for the current entry. If the cursor is not pointing 2834 ** to a valid entry, *pSize is set to 0. 2835 ** 2836 ** For a table with the INTKEY flag set, this routine returns the key 2837 ** itself, not the number of bytes in the key. 2838 */ 2839 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ 2840 int rc; 2841 2842 assert( cursorHoldsMutex(pCur) ); 2843 rc = restoreOrClearCursorPosition(pCur); 2844 if( rc==SQLITE_OK ){ 2845 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); 2846 if( pCur->eState==CURSOR_INVALID ){ 2847 *pSize = 0; 2848 }else{ 2849 getCellInfo(pCur); 2850 *pSize = pCur->info.nKey; 2851 } 2852 } 2853 return rc; 2854 } 2855 2856 /* 2857 ** Set *pSize to the number of bytes of data in the entry the 2858 ** cursor currently points to. Always return SQLITE_OK. 2859 ** Failure is not possible. If the cursor is not currently 2860 ** pointing to an entry (which can happen, for example, if 2861 ** the database is empty) then *pSize is set to 0. 2862 */ 2863 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ 2864 int rc; 2865 2866 assert( cursorHoldsMutex(pCur) ); 2867 rc = restoreOrClearCursorPosition(pCur); 2868 if( rc==SQLITE_OK ){ 2869 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); 2870 if( pCur->eState==CURSOR_INVALID ){ 2871 /* Not pointing at a valid entry - set *pSize to 0. */ 2872 *pSize = 0; 2873 }else{ 2874 getCellInfo(pCur); 2875 *pSize = pCur->info.nData; 2876 } 2877 } 2878 return rc; 2879 } 2880 2881 /* 2882 ** Given the page number of an overflow page in the database (parameter 2883 ** ovfl), this function finds the page number of the next page in the 2884 ** linked list of overflow pages. If possible, it uses the auto-vacuum 2885 ** pointer-map data instead of reading the content of page ovfl to do so. 2886 ** 2887 ** If an error occurs an SQLite error code is returned. Otherwise: 2888 ** 2889 ** Unless pPgnoNext is NULL, the page number of the next overflow 2890 ** page in the linked list is written to *pPgnoNext. If page ovfl 2891 ** is the last page in it's linked list, *pPgnoNext is set to zero. 2892 ** 2893 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle 2894 ** for page ovfl. The underlying pager page may have been requested 2895 ** with the noContent flag set, so the page data accessable via 2896 ** this handle may not be trusted. 2897 */ 2898 static int getOverflowPage( 2899 BtShared *pBt, 2900 Pgno ovfl, /* Overflow page */ 2901 MemPage **ppPage, /* OUT: MemPage handle */ 2902 Pgno *pPgnoNext /* OUT: Next overflow page number */ 2903 ){ 2904 Pgno next = 0; 2905 int rc; 2906 2907 assert( sqlite3_mutex_held(pBt->mutex) ); 2908 /* One of these must not be NULL. Otherwise, why call this function? */ 2909 assert(ppPage || pPgnoNext); 2910 2911 /* If pPgnoNext is NULL, then this function is being called to obtain 2912 ** a MemPage* reference only. No page-data is required in this case. 2913 */ 2914 if( !pPgnoNext ){ 2915 return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1); 2916 } 2917 2918 #ifndef SQLITE_OMIT_AUTOVACUUM 2919 /* Try to find the next page in the overflow list using the 2920 ** autovacuum pointer-map pages. Guess that the next page in 2921 ** the overflow list is page number (ovfl+1). If that guess turns 2922 ** out to be wrong, fall back to loading the data of page 2923 ** number ovfl to determine the next page number. 2924 */ 2925 if( pBt->autoVacuum ){ 2926 Pgno pgno; 2927 Pgno iGuess = ovfl+1; 2928 u8 eType; 2929 2930 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 2931 iGuess++; 2932 } 2933 2934 if( iGuess<=sqlite3PagerPagecount(pBt->pPager) ){ 2935 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 2936 if( rc!=SQLITE_OK ){ 2937 return rc; 2938 } 2939 if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 2940 next = iGuess; 2941 } 2942 } 2943 } 2944 #endif 2945 2946 if( next==0 || ppPage ){ 2947 MemPage *pPage = 0; 2948 2949 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0); 2950 assert(rc==SQLITE_OK || pPage==0); 2951 if( next==0 && rc==SQLITE_OK ){ 2952 next = get4byte(pPage->aData); 2953 } 2954 2955 if( ppPage ){ 2956 *ppPage = pPage; 2957 }else{ 2958 releasePage(pPage); 2959 } 2960 } 2961 *pPgnoNext = next; 2962 2963 return rc; 2964 } 2965 2966 /* 2967 ** Copy data from a buffer to a page, or from a page to a buffer. 2968 ** 2969 ** pPayload is a pointer to data stored on database page pDbPage. 2970 ** If argument eOp is false, then nByte bytes of data are copied 2971 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, 2972 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 2973 ** of data are copied from the buffer pBuf to pPayload. 2974 ** 2975 ** SQLITE_OK is returned on success, otherwise an error code. 2976 */ 2977 static int copyPayload( 2978 void *pPayload, /* Pointer to page data */ 2979 void *pBuf, /* Pointer to buffer */ 2980 int nByte, /* Number of bytes to copy */ 2981 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 2982 DbPage *pDbPage /* Page containing pPayload */ 2983 ){ 2984 if( eOp ){ 2985 /* Copy data from buffer to page (a write operation) */ 2986 int rc = sqlite3PagerWrite(pDbPage); 2987 if( rc!=SQLITE_OK ){ 2988 return rc; 2989 } 2990 memcpy(pPayload, pBuf, nByte); 2991 }else{ 2992 /* Copy data from page to buffer (a read operation) */ 2993 memcpy(pBuf, pPayload, nByte); 2994 } 2995 return SQLITE_OK; 2996 } 2997 2998 /* 2999 ** This function is used to read or overwrite payload information 3000 ** for the entry that the pCur cursor is pointing to. If the eOp 3001 ** parameter is 0, this is a read operation (data copied into 3002 ** buffer pBuf). If it is non-zero, a write (data copied from 3003 ** buffer pBuf). 3004 ** 3005 ** A total of "amt" bytes are read or written beginning at "offset". 3006 ** Data is read to or from the buffer pBuf. 3007 ** 3008 ** This routine does not make a distinction between key and data. 3009 ** It just reads or writes bytes from the payload area. Data might 3010 ** appear on the main page or be scattered out on multiple overflow 3011 ** pages. 3012 ** 3013 ** If the BtCursor.isIncrblobHandle flag is set, and the current 3014 ** cursor entry uses one or more overflow pages, this function 3015 ** allocates space for and lazily popluates the overflow page-list 3016 ** cache array (BtCursor.aOverflow). Subsequent calls use this 3017 ** cache to make seeking to the supplied offset more efficient. 3018 ** 3019 ** Once an overflow page-list cache has been allocated, it may be 3020 ** invalidated if some other cursor writes to the same table, or if 3021 ** the cursor is moved to a different row. Additionally, in auto-vacuum 3022 ** mode, the following events may invalidate an overflow page-list cache. 3023 ** 3024 ** * An incremental vacuum, 3025 ** * A commit in auto_vacuum="full" mode, 3026 ** * Creating a table (may require moving an overflow page). 3027 */ 3028 static int accessPayload( 3029 BtCursor *pCur, /* Cursor pointing to entry to read from */ 3030 int offset, /* Begin reading this far into payload */ 3031 int amt, /* Read this many bytes */ 3032 unsigned char *pBuf, /* Write the bytes into this buffer */ 3033 int skipKey, /* offset begins at data if this is true */ 3034 int eOp /* zero to read. non-zero to write. */ 3035 ){ 3036 unsigned char *aPayload; 3037 int rc = SQLITE_OK; 3038 u32 nKey; 3039 int iIdx = 0; 3040 MemPage *pPage = pCur->pPage; /* Btree page of current cursor entry */ 3041 BtShared *pBt; /* Btree this cursor belongs to */ 3042 3043 assert( pPage ); 3044 assert( pCur->eState==CURSOR_VALID ); 3045 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 3046 assert( offset>=0 ); 3047 assert( cursorHoldsMutex(pCur) ); 3048 3049 getCellInfo(pCur); 3050 aPayload = pCur->info.pCell + pCur->info.nHeader; 3051 nKey = (pPage->intKey ? 0 : pCur->info.nKey); 3052 3053 if( skipKey ){ 3054 offset += nKey; 3055 } 3056 if( offset+amt > nKey+pCur->info.nData ){ 3057 /* Trying to read or write past the end of the data is an error */ 3058 return SQLITE_ERROR; 3059 } 3060 3061 /* Check if data must be read/written to/from the btree page itself. */ 3062 if( offset<pCur->info.nLocal ){ 3063 int a = amt; 3064 if( a+offset>pCur->info.nLocal ){ 3065 a = pCur->info.nLocal - offset; 3066 } 3067 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); 3068 offset = 0; 3069 pBuf += a; 3070 amt -= a; 3071 }else{ 3072 offset -= pCur->info.nLocal; 3073 } 3074 3075 pBt = pCur->pBt; 3076 if( rc==SQLITE_OK && amt>0 ){ 3077 const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ 3078 Pgno nextPage; 3079 3080 nextPage = get4byte(&aPayload[pCur->info.nLocal]); 3081 3082 #ifndef SQLITE_OMIT_INCRBLOB 3083 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[] 3084 ** has not been allocated, allocate it now. The array is sized at 3085 ** one entry for each overflow page in the overflow chain. The 3086 ** page number of the first overflow page is stored in aOverflow[0], 3087 ** etc. A value of 0 in the aOverflow[] array means "not yet known" 3088 ** (the cache is lazily populated). 3089 */ 3090 if( pCur->isIncrblobHandle && !pCur->aOverflow ){ 3091 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; 3092 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl); 3093 if( nOvfl && !pCur->aOverflow ){ 3094 rc = SQLITE_NOMEM; 3095 } 3096 } 3097 3098 /* If the overflow page-list cache has been allocated and the 3099 ** entry for the first required overflow page is valid, skip 3100 ** directly to it. 3101 */ 3102 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){ 3103 iIdx = (offset/ovflSize); 3104 nextPage = pCur->aOverflow[iIdx]; 3105 offset = (offset%ovflSize); 3106 } 3107 #endif 3108 3109 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){ 3110 3111 #ifndef SQLITE_OMIT_INCRBLOB 3112 /* If required, populate the overflow page-list cache. */ 3113 if( pCur->aOverflow ){ 3114 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage); 3115 pCur->aOverflow[iIdx] = nextPage; 3116 } 3117 #endif 3118 3119 if( offset>=ovflSize ){ 3120 /* The only reason to read this page is to obtain the page 3121 ** number for the next page in the overflow chain. The page 3122 ** data is not required. So first try to lookup the overflow 3123 ** page-list cache, if any, then fall back to the getOverflowPage() 3124 ** function. 3125 */ 3126 #ifndef SQLITE_OMIT_INCRBLOB 3127 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){ 3128 nextPage = pCur->aOverflow[iIdx+1]; 3129 } else 3130 #endif 3131 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); 3132 offset -= ovflSize; 3133 }else{ 3134 /* Need to read this page properly. It contains some of the 3135 ** range of data that is being read (eOp==0) or written (eOp!=0). 3136 */ 3137 DbPage *pDbPage; 3138 int a = amt; 3139 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage); 3140 if( rc==SQLITE_OK ){ 3141 aPayload = sqlite3PagerGetData(pDbPage); 3142 nextPage = get4byte(aPayload); 3143 if( a + offset > ovflSize ){ 3144 a = ovflSize - offset; 3145 } 3146 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); 3147 sqlite3PagerUnref(pDbPage); 3148 offset = 0; 3149 amt -= a; 3150 pBuf += a; 3151 } 3152 } 3153 } 3154 } 3155 3156 if( rc==SQLITE_OK && amt>0 ){ 3157 return SQLITE_CORRUPT_BKPT; 3158 } 3159 return rc; 3160 } 3161 3162 /* 3163 ** Read part of the key associated with cursor pCur. Exactly 3164 ** "amt" bytes will be transfered into pBuf[]. The transfer 3165 ** begins at "offset". 3166 ** 3167 ** Return SQLITE_OK on success or an error code if anything goes 3168 ** wrong. An error is returned if "offset+amt" is larger than 3169 ** the available payload. 3170 */ 3171 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3172 int rc; 3173 3174 assert( cursorHoldsMutex(pCur) ); 3175 rc = restoreOrClearCursorPosition(pCur); 3176 if( rc==SQLITE_OK ){ 3177 assert( pCur->eState==CURSOR_VALID ); 3178 assert( pCur->pPage!=0 ); 3179 if( pCur->pPage->intKey ){ 3180 return SQLITE_CORRUPT_BKPT; 3181 } 3182 assert( pCur->pPage->intKey==0 ); 3183 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 3184 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0); 3185 } 3186 return rc; 3187 } 3188 3189 /* 3190 ** Read part of the data associated with cursor pCur. Exactly 3191 ** "amt" bytes will be transfered into pBuf[]. The transfer 3192 ** begins at "offset". 3193 ** 3194 ** Return SQLITE_OK on success or an error code if anything goes 3195 ** wrong. An error is returned if "offset+amt" is larger than 3196 ** the available payload. 3197 */ 3198 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3199 int rc; 3200 3201 assert( cursorHoldsMutex(pCur) ); 3202 rc = restoreOrClearCursorPosition(pCur); 3203 if( rc==SQLITE_OK ){ 3204 assert( pCur->eState==CURSOR_VALID ); 3205 assert( pCur->pPage!=0 ); 3206 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 3207 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0); 3208 } 3209 return rc; 3210 } 3211 3212 /* 3213 ** Return a pointer to payload information from the entry that the 3214 ** pCur cursor is pointing to. The pointer is to the beginning of 3215 ** the key if skipKey==0 and it points to the beginning of data if 3216 ** skipKey==1. The number of bytes of available key/data is written 3217 ** into *pAmt. If *pAmt==0, then the value returned will not be 3218 ** a valid pointer. 3219 ** 3220 ** This routine is an optimization. It is common for the entire key 3221 ** and data to fit on the local page and for there to be no overflow 3222 ** pages. When that is so, this routine can be used to access the 3223 ** key and data without making a copy. If the key and/or data spills 3224 ** onto overflow pages, then accessPayload() must be used to reassembly 3225 ** the key/data and copy it into a preallocated buffer. 3226 ** 3227 ** The pointer returned by this routine looks directly into the cached 3228 ** page of the database. The data might change or move the next time 3229 ** any btree routine is called. 3230 */ 3231 static const unsigned char *fetchPayload( 3232 BtCursor *pCur, /* Cursor pointing to entry to read from */ 3233 int *pAmt, /* Write the number of available bytes here */ 3234 int skipKey /* read beginning at data if this is true */ 3235 ){ 3236 unsigned char *aPayload; 3237 MemPage *pPage; 3238 u32 nKey; 3239 int nLocal; 3240 3241 assert( pCur!=0 && pCur->pPage!=0 ); 3242 assert( pCur->eState==CURSOR_VALID ); 3243 assert( cursorHoldsMutex(pCur) ); 3244 pPage = pCur->pPage; 3245 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 3246 getCellInfo(pCur); 3247 aPayload = pCur->info.pCell; 3248 aPayload += pCur->info.nHeader; 3249 if( pPage->intKey ){ 3250 nKey = 0; 3251 }else{ 3252 nKey = pCur->info.nKey; 3253 } 3254 if( skipKey ){ 3255 aPayload += nKey; 3256 nLocal = pCur->info.nLocal - nKey; 3257 }else{ 3258 nLocal = pCur->info.nLocal; 3259 if( nLocal>nKey ){ 3260 nLocal = nKey; 3261 } 3262 } 3263 *pAmt = nLocal; 3264 return aPayload; 3265 } 3266 3267 3268 /* 3269 ** For the entry that cursor pCur is point to, return as 3270 ** many bytes of the key or data as are available on the local 3271 ** b-tree page. Write the number of available bytes into *pAmt. 3272 ** 3273 ** The pointer returned is ephemeral. The key/data may move 3274 ** or be destroyed on the next call to any Btree routine, 3275 ** including calls from other threads against the same cache. 3276 ** Hence, a mutex on the BtShared should be held prior to calling 3277 ** this routine. 3278 ** 3279 ** These routines is used to get quick access to key and data 3280 ** in the common case where no overflow pages are used. 3281 */ 3282 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){ 3283 assert( cursorHoldsMutex(pCur) ); 3284 if( pCur->eState==CURSOR_VALID ){ 3285 return (const void*)fetchPayload(pCur, pAmt, 0); 3286 } 3287 return 0; 3288 } 3289 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){ 3290 assert( cursorHoldsMutex(pCur) ); 3291 if( pCur->eState==CURSOR_VALID ){ 3292 return (const void*)fetchPayload(pCur, pAmt, 1); 3293 } 3294 return 0; 3295 } 3296 3297 3298 /* 3299 ** Move the cursor down to a new child page. The newPgno argument is the 3300 ** page number of the child page to move to. 3301 */ 3302 static int moveToChild(BtCursor *pCur, u32 newPgno){ 3303 int rc; 3304 MemPage *pNewPage; 3305 MemPage *pOldPage; 3306 BtShared *pBt = pCur->pBt; 3307 3308 assert( cursorHoldsMutex(pCur) ); 3309 assert( pCur->eState==CURSOR_VALID ); 3310 rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage); 3311 if( rc ) return rc; 3312 pNewPage->idxParent = pCur->idx; 3313 pOldPage = pCur->pPage; 3314 pOldPage->idxShift = 0; 3315 releasePage(pOldPage); 3316 pCur->pPage = pNewPage; 3317 pCur->idx = 0; 3318 pCur->info.nSize = 0; 3319 if( pNewPage->nCell<1 ){ 3320 return SQLITE_CORRUPT_BKPT; 3321 } 3322 return SQLITE_OK; 3323 } 3324 3325 /* 3326 ** Return true if the page is the virtual root of its table. 3327 ** 3328 ** The virtual root page is the root page for most tables. But 3329 ** for the table rooted on page 1, sometime the real root page 3330 ** is empty except for the right-pointer. In such cases the 3331 ** virtual root page is the page that the right-pointer of page 3332 ** 1 is pointing to. 3333 */ 3334 int sqlite3BtreeIsRootPage(MemPage *pPage){ 3335 MemPage *pParent; 3336 3337 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 3338 pParent = pPage->pParent; 3339 if( pParent==0 ) return 1; 3340 if( pParent->pgno>1 ) return 0; 3341 if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1; 3342 return 0; 3343 } 3344 3345 /* 3346 ** Move the cursor up to the parent page. 3347 ** 3348 ** pCur->idx is set to the cell index that contains the pointer 3349 ** to the page we are coming from. If we are coming from the 3350 ** right-most child page then pCur->idx is set to one more than 3351 ** the largest cell index. 3352 */ 3353 void sqlite3BtreeMoveToParent(BtCursor *pCur){ 3354 MemPage *pParent; 3355 MemPage *pPage; 3356 int idxParent; 3357 3358 assert( cursorHoldsMutex(pCur) ); 3359 assert( pCur->eState==CURSOR_VALID ); 3360 pPage = pCur->pPage; 3361 assert( pPage!=0 ); 3362 assert( !sqlite3BtreeIsRootPage(pPage) ); 3363 pParent = pPage->pParent; 3364 assert( pParent!=0 ); 3365 idxParent = pPage->idxParent; 3366 sqlite3PagerRef(pParent->pDbPage); 3367 releasePage(pPage); 3368 pCur->pPage = pParent; 3369 pCur->info.nSize = 0; 3370 assert( pParent->idxShift==0 ); 3371 pCur->idx = idxParent; 3372 } 3373 3374 /* 3375 ** Move the cursor to the root page 3376 */ 3377 static int moveToRoot(BtCursor *pCur){ 3378 MemPage *pRoot; 3379 int rc = SQLITE_OK; 3380 Btree *p = pCur->pBtree; 3381 BtShared *pBt = p->pBt; 3382 3383 assert( cursorHoldsMutex(pCur) ); 3384 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 3385 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 3386 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 3387 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 3388 if( pCur->eState==CURSOR_FAULT ){ 3389 return pCur->skip; 3390 } 3391 clearCursorPosition(pCur); 3392 } 3393 pRoot = pCur->pPage; 3394 if( pRoot && pRoot->pgno==pCur->pgnoRoot ){ 3395 assert( pRoot->isInit ); 3396 }else{ 3397 if( 3398 SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0)) 3399 ){ 3400 pCur->eState = CURSOR_INVALID; 3401 return rc; 3402 } 3403 releasePage(pCur->pPage); 3404 pCur->pPage = pRoot; 3405 } 3406 pCur->idx = 0; 3407 pCur->info.nSize = 0; 3408 if( pRoot->nCell==0 && !pRoot->leaf ){ 3409 Pgno subpage; 3410 assert( pRoot->pgno==1 ); 3411 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 3412 assert( subpage>0 ); 3413 pCur->eState = CURSOR_VALID; 3414 rc = moveToChild(pCur, subpage); 3415 } 3416 pCur->eState = ((pCur->pPage->nCell>0)?CURSOR_VALID:CURSOR_INVALID); 3417 return rc; 3418 } 3419 3420 /* 3421 ** Move the cursor down to the left-most leaf entry beneath the 3422 ** entry to which it is currently pointing. 3423 ** 3424 ** The left-most leaf is the one with the smallest key - the first 3425 ** in ascending order. 3426 */ 3427 static int moveToLeftmost(BtCursor *pCur){ 3428 Pgno pgno; 3429 int rc = SQLITE_OK; 3430 MemPage *pPage; 3431 3432 assert( cursorHoldsMutex(pCur) ); 3433 assert( pCur->eState==CURSOR_VALID ); 3434 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ 3435 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 3436 pgno = get4byte(findCell(pPage, pCur->idx)); 3437 rc = moveToChild(pCur, pgno); 3438 } 3439 return rc; 3440 } 3441 3442 /* 3443 ** Move the cursor down to the right-most leaf entry beneath the 3444 ** page to which it is currently pointing. Notice the difference 3445 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 3446 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 3447 ** finds the right-most entry beneath the *page*. 3448 ** 3449 ** The right-most entry is the one with the largest key - the last 3450 ** key in ascending order. 3451 */ 3452 static int moveToRightmost(BtCursor *pCur){ 3453 Pgno pgno; 3454 int rc = SQLITE_OK; 3455 MemPage *pPage; 3456 3457 assert( cursorHoldsMutex(pCur) ); 3458 assert( pCur->eState==CURSOR_VALID ); 3459 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ 3460 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 3461 pCur->idx = pPage->nCell; 3462 rc = moveToChild(pCur, pgno); 3463 } 3464 if( rc==SQLITE_OK ){ 3465 pCur->idx = pPage->nCell - 1; 3466 pCur->info.nSize = 0; 3467 } 3468 return SQLITE_OK; 3469 } 3470 3471 /* Move the cursor to the first entry in the table. Return SQLITE_OK 3472 ** on success. Set *pRes to 0 if the cursor actually points to something 3473 ** or set *pRes to 1 if the table is empty. 3474 */ 3475 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 3476 int rc; 3477 3478 assert( cursorHoldsMutex(pCur) ); 3479 assert( sqlite3_mutex_held(pCur->pBtree->pSqlite->mutex) ); 3480 rc = moveToRoot(pCur); 3481 if( rc==SQLITE_OK ){ 3482 if( pCur->eState==CURSOR_INVALID ){ 3483 assert( pCur->pPage->nCell==0 ); 3484 *pRes = 1; 3485 rc = SQLITE_OK; 3486 }else{ 3487 assert( pCur->pPage->nCell>0 ); 3488 *pRes = 0; 3489 rc = moveToLeftmost(pCur); 3490 } 3491 } 3492 return rc; 3493 } 3494 3495 /* Move the cursor to the last entry in the table. Return SQLITE_OK 3496 ** on success. Set *pRes to 0 if the cursor actually points to something 3497 ** or set *pRes to 1 if the table is empty. 3498 */ 3499 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 3500 int rc; 3501 3502 assert( cursorHoldsMutex(pCur) ); 3503 assert( sqlite3_mutex_held(pCur->pBtree->pSqlite->mutex) ); 3504 rc = moveToRoot(pCur); 3505 if( rc==SQLITE_OK ){ 3506 if( CURSOR_INVALID==pCur->eState ){ 3507 assert( pCur->pPage->nCell==0 ); 3508 *pRes = 1; 3509 }else{ 3510 assert( pCur->eState==CURSOR_VALID ); 3511 *pRes = 0; 3512 rc = moveToRightmost(pCur); 3513 } 3514 } 3515 return rc; 3516 } 3517 3518 /* Move the cursor so that it points to an entry near pKey/nKey. 3519 ** Return a success code. 3520 ** 3521 ** For INTKEY tables, only the nKey parameter is used. pKey is 3522 ** ignored. For other tables, nKey is the number of bytes of data 3523 ** in pKey. The comparison function specified when the cursor was 3524 ** created is used to compare keys. 3525 ** 3526 ** If an exact match is not found, then the cursor is always 3527 ** left pointing at a leaf page which would hold the entry if it 3528 ** were present. The cursor might point to an entry that comes 3529 ** before or after the key. 3530 ** 3531 ** The result of comparing the key with the entry to which the 3532 ** cursor is written to *pRes if pRes!=NULL. The meaning of 3533 ** this value is as follows: 3534 ** 3535 ** *pRes<0 The cursor is left pointing at an entry that 3536 ** is smaller than pKey or if the table is empty 3537 ** and the cursor is therefore left point to nothing. 3538 ** 3539 ** *pRes==0 The cursor is left pointing at an entry that 3540 ** exactly matches pKey. 3541 ** 3542 ** *pRes>0 The cursor is left pointing at an entry that 3543 ** is larger than pKey. 3544 ** 3545 */ 3546 int sqlite3BtreeMoveto( 3547 BtCursor *pCur, /* The cursor to be moved */ 3548 const void *pKey, /* The key content for indices. Not used by tables */ 3549 i64 nKey, /* Size of pKey. Or the key for tables */ 3550 int biasRight, /* If true, bias the search to the high end */ 3551 int *pRes /* Search result flag */ 3552 ){ 3553 int rc; 3554 3555 assert( cursorHoldsMutex(pCur) ); 3556 assert( sqlite3_mutex_held(pCur->pBtree->pSqlite->mutex) ); 3557 rc = moveToRoot(pCur); 3558 if( rc ){ 3559 return rc; 3560 } 3561 assert( pCur->pPage ); 3562 assert( pCur->pPage->isInit ); 3563 if( pCur->eState==CURSOR_INVALID ){ 3564 *pRes = -1; 3565 assert( pCur->pPage->nCell==0 ); 3566 return SQLITE_OK; 3567 } 3568 for(;;){ 3569 int lwr, upr; 3570 Pgno chldPg; 3571 MemPage *pPage = pCur->pPage; 3572 int c = -1; /* pRes return if table is empty must be -1 */ 3573 lwr = 0; 3574 upr = pPage->nCell-1; 3575 if( !pPage->intKey && pKey==0 ){ 3576 return SQLITE_CORRUPT_BKPT; 3577 } 3578 if( biasRight ){ 3579 pCur->idx = upr; 3580 }else{ 3581 pCur->idx = (upr+lwr)/2; 3582 } 3583 if( lwr<=upr ) for(;;){ 3584 void *pCellKey; 3585 i64 nCellKey; 3586 pCur->info.nSize = 0; 3587 if( pPage->intKey ){ 3588 u8 *pCell; 3589 pCell = findCell(pPage, pCur->idx) + pPage->childPtrSize; 3590 if( pPage->hasData ){ 3591 u32 dummy; 3592 pCell += getVarint32(pCell, &dummy); 3593 } 3594 getVarint(pCell, (u64 *)&nCellKey); 3595 if( nCellKey<nKey ){ 3596 c = -1; 3597 }else if( nCellKey>nKey ){ 3598 c = +1; 3599 }else{ 3600 c = 0; 3601 } 3602 }else{ 3603 int available; 3604 pCellKey = (void *)fetchPayload(pCur, &available, 0); 3605 nCellKey = pCur->info.nKey; 3606 if( available>=nCellKey ){ 3607 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey); 3608 }else{ 3609 pCellKey = sqlite3_malloc( nCellKey ); 3610 if( pCellKey==0 ) return SQLITE_NOMEM; 3611 rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey); 3612 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey); 3613 sqlite3_free(pCellKey); 3614 if( rc ){ 3615 return rc; 3616 } 3617 } 3618 } 3619 if( c==0 ){ 3620 if( pPage->leafData && !pPage->leaf ){ 3621 lwr = pCur->idx; 3622 upr = lwr - 1; 3623 break; 3624 }else{ 3625 if( pRes ) *pRes = 0; 3626 return SQLITE_OK; 3627 } 3628 } 3629 if( c<0 ){ 3630 lwr = pCur->idx+1; 3631 }else{ 3632 upr = pCur->idx-1; 3633 } 3634 if( lwr>upr ){ 3635 break; 3636 } 3637 pCur->idx = (lwr+upr)/2; 3638 } 3639 assert( lwr==upr+1 ); 3640 assert( pPage->isInit ); 3641 if( pPage->leaf ){ 3642 chldPg = 0; 3643 }else if( lwr>=pPage->nCell ){ 3644 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 3645 }else{ 3646 chldPg = get4byte(findCell(pPage, lwr)); 3647 } 3648 if( chldPg==0 ){ 3649 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 3650 if( pRes ) *pRes = c; 3651 return SQLITE_OK; 3652 } 3653 pCur->idx = lwr; 3654 pCur->info.nSize = 0; 3655 rc = moveToChild(pCur, chldPg); 3656 if( rc ){ 3657 return rc; 3658 } 3659 } 3660 /* NOT REACHED */ 3661 } 3662 3663 3664 /* 3665 ** Return TRUE if the cursor is not pointing at an entry of the table. 3666 ** 3667 ** TRUE will be returned after a call to sqlite3BtreeNext() moves 3668 ** past the last entry in the table or sqlite3BtreePrev() moves past 3669 ** the first entry. TRUE is also returned if the table is empty. 3670 */ 3671 int sqlite3BtreeEof(BtCursor *pCur){ 3672 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 3673 ** have been deleted? This API will need to change to return an error code 3674 ** as well as the boolean result value. 3675 */ 3676 return (CURSOR_VALID!=pCur->eState); 3677 } 3678 3679 /* 3680 ** Return the database connection handle for a cursor. 3681 */ 3682 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){ 3683 assert( sqlite3_mutex_held(pCur->pBtree->pSqlite->mutex) ); 3684 return pCur->pBtree->pSqlite; 3685 } 3686 3687 /* 3688 ** Advance the cursor to the next entry in the database. If 3689 ** successful then set *pRes=0. If the cursor 3690 ** was already pointing to the last entry in the database before 3691 ** this routine was called, then set *pRes=1. 3692 */ 3693 static int btreeNext(BtCursor *pCur, int *pRes){ 3694 int rc; 3695 MemPage *pPage; 3696 3697 assert( cursorHoldsMutex(pCur) ); 3698 rc = restoreOrClearCursorPosition(pCur); 3699 if( rc!=SQLITE_OK ){ 3700 return rc; 3701 } 3702 assert( pRes!=0 ); 3703 pPage = pCur->pPage; 3704 if( CURSOR_INVALID==pCur->eState ){ 3705 *pRes = 1; 3706 return SQLITE_OK; 3707 } 3708 if( pCur->skip>0 ){ 3709 pCur->skip = 0; 3710 *pRes = 0; 3711 return SQLITE_OK; 3712 } 3713 pCur->skip = 0; 3714 3715 assert( pPage->isInit ); 3716 assert( pCur->idx<pPage->nCell ); 3717 3718 pCur->idx++; 3719 pCur->info.nSize = 0; 3720 if( pCur->idx>=pPage->nCell ){ 3721 if( !pPage->leaf ){ 3722 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 3723 if( rc ) return rc; 3724 rc = moveToLeftmost(pCur); 3725 *pRes = 0; 3726 return rc; 3727 } 3728 do{ 3729 if( sqlite3BtreeIsRootPage(pPage) ){ 3730 *pRes = 1; 3731 pCur->eState = CURSOR_INVALID; 3732 return SQLITE_OK; 3733 } 3734 sqlite3BtreeMoveToParent(pCur); 3735 pPage = pCur->pPage; 3736 }while( pCur->idx>=pPage->nCell ); 3737 *pRes = 0; 3738 if( pPage->leafData ){ 3739 rc = sqlite3BtreeNext(pCur, pRes); 3740 }else{ 3741 rc = SQLITE_OK; 3742 } 3743 return rc; 3744 } 3745 *pRes = 0; 3746 if( pPage->leaf ){ 3747 return SQLITE_OK; 3748 } 3749 rc = moveToLeftmost(pCur); 3750 return rc; 3751 } 3752 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ 3753 int rc; 3754 assert( cursorHoldsMutex(pCur) ); 3755 rc = btreeNext(pCur, pRes); 3756 return rc; 3757 } 3758 3759 3760 /* 3761 ** Step the cursor to the back to the previous entry in the database. If 3762 ** successful then set *pRes=0. If the cursor 3763 ** was already pointing to the first entry in the database before 3764 ** this routine was called, then set *pRes=1. 3765 */ 3766 static int btreePrevious(BtCursor *pCur, int *pRes){ 3767 int rc; 3768 Pgno pgno; 3769 MemPage *pPage; 3770 3771 assert( cursorHoldsMutex(pCur) ); 3772 rc = restoreOrClearCursorPosition(pCur); 3773 if( rc!=SQLITE_OK ){ 3774 return rc; 3775 } 3776 if( CURSOR_INVALID==pCur->eState ){ 3777 *pRes = 1; 3778 return SQLITE_OK; 3779 } 3780 if( pCur->skip<0 ){ 3781 pCur->skip = 0; 3782 *pRes = 0; 3783 return SQLITE_OK; 3784 } 3785 pCur->skip = 0; 3786 3787 pPage = pCur->pPage; 3788 assert( pPage->isInit ); 3789 assert( pCur->idx>=0 ); 3790 if( !pPage->leaf ){ 3791 pgno = get4byte( findCell(pPage, pCur->idx) ); 3792 rc = moveToChild(pCur, pgno); 3793 if( rc ){ 3794 return rc; 3795 } 3796 rc = moveToRightmost(pCur); 3797 }else{ 3798 while( pCur->idx==0 ){ 3799 if( sqlite3BtreeIsRootPage(pPage) ){ 3800 pCur->eState = CURSOR_INVALID; 3801 *pRes = 1; 3802 return SQLITE_OK; 3803 } 3804 sqlite3BtreeMoveToParent(pCur); 3805 pPage = pCur->pPage; 3806 } 3807 pCur->idx--; 3808 pCur->info.nSize = 0; 3809 if( pPage->leafData && !pPage->leaf ){ 3810 rc = sqlite3BtreePrevious(pCur, pRes); 3811 }else{ 3812 rc = SQLITE_OK; 3813 } 3814 } 3815 *pRes = 0; 3816 return rc; 3817 } 3818 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ 3819 int rc; 3820 assert( cursorHoldsMutex(pCur) ); 3821 rc = btreePrevious(pCur, pRes); 3822 return rc; 3823 } 3824 3825 /* 3826 ** Allocate a new page from the database file. 3827 ** 3828 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 3829 ** has already been called on the new page.) The new page has also 3830 ** been referenced and the calling routine is responsible for calling 3831 ** sqlite3PagerUnref() on the new page when it is done. 3832 ** 3833 ** SQLITE_OK is returned on success. Any other return value indicates 3834 ** an error. *ppPage and *pPgno are undefined in the event of an error. 3835 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned. 3836 ** 3837 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 3838 ** locate a page close to the page number "nearby". This can be used in an 3839 ** attempt to keep related pages close to each other in the database file, 3840 ** which in turn can make database access faster. 3841 ** 3842 ** If the "exact" parameter is not 0, and the page-number nearby exists 3843 ** anywhere on the free-list, then it is guarenteed to be returned. This 3844 ** is only used by auto-vacuum databases when allocating a new table. 3845 */ 3846 static int allocateBtreePage( 3847 BtShared *pBt, 3848 MemPage **ppPage, 3849 Pgno *pPgno, 3850 Pgno nearby, 3851 u8 exact 3852 ){ 3853 MemPage *pPage1; 3854 int rc; 3855 int n; /* Number of pages on the freelist */ 3856 int k; /* Number of leaves on the trunk of the freelist */ 3857 MemPage *pTrunk = 0; 3858 MemPage *pPrevTrunk = 0; 3859 3860 assert( sqlite3_mutex_held(pBt->mutex) ); 3861 pPage1 = pBt->pPage1; 3862 n = get4byte(&pPage1->aData[36]); 3863 if( n>0 ){ 3864 /* There are pages on the freelist. Reuse one of those pages. */ 3865 Pgno iTrunk; 3866 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 3867 3868 /* If the 'exact' parameter was true and a query of the pointer-map 3869 ** shows that the page 'nearby' is somewhere on the free-list, then 3870 ** the entire-list will be searched for that page. 3871 */ 3872 #ifndef SQLITE_OMIT_AUTOVACUUM 3873 if( exact && nearby<=sqlite3PagerPagecount(pBt->pPager) ){ 3874 u8 eType; 3875 assert( nearby>0 ); 3876 assert( pBt->autoVacuum ); 3877 rc = ptrmapGet(pBt, nearby, &eType, 0); 3878 if( rc ) return rc; 3879 if( eType==PTRMAP_FREEPAGE ){ 3880 searchList = 1; 3881 } 3882 *pPgno = nearby; 3883 } 3884 #endif 3885 3886 /* Decrement the free-list count by 1. Set iTrunk to the index of the 3887 ** first free-list trunk page. iPrevTrunk is initially 1. 3888 */ 3889 rc = sqlite3PagerWrite(pPage1->pDbPage); 3890 if( rc ) return rc; 3891 put4byte(&pPage1->aData[36], n-1); 3892 3893 /* The code within this loop is run only once if the 'searchList' variable 3894 ** is not true. Otherwise, it runs once for each trunk-page on the 3895 ** free-list until the page 'nearby' is located. 3896 */ 3897 do { 3898 pPrevTrunk = pTrunk; 3899 if( pPrevTrunk ){ 3900 iTrunk = get4byte(&pPrevTrunk->aData[0]); 3901 }else{ 3902 iTrunk = get4byte(&pPage1->aData[32]); 3903 } 3904 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0); 3905 if( rc ){ 3906 pTrunk = 0; 3907 goto end_allocate_page; 3908 } 3909 3910 k = get4byte(&pTrunk->aData[4]); 3911 if( k==0 && !searchList ){ 3912 /* The trunk has no leaves and the list is not being searched. 3913 ** So extract the trunk page itself and use it as the newly 3914 ** allocated page */ 3915 assert( pPrevTrunk==0 ); 3916 rc = sqlite3PagerWrite(pTrunk->pDbPage); 3917 if( rc ){ 3918 goto end_allocate_page; 3919 } 3920 *pPgno = iTrunk; 3921 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 3922 *ppPage = pTrunk; 3923 pTrunk = 0; 3924 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 3925 }else if( k>pBt->usableSize/4 - 8 ){ 3926 /* Value of k is out of range. Database corruption */ 3927 rc = SQLITE_CORRUPT_BKPT; 3928 goto end_allocate_page; 3929 #ifndef SQLITE_OMIT_AUTOVACUUM 3930 }else if( searchList && nearby==iTrunk ){ 3931 /* The list is being searched and this trunk page is the page 3932 ** to allocate, regardless of whether it has leaves. 3933 */ 3934 assert( *pPgno==iTrunk ); 3935 *ppPage = pTrunk; 3936 searchList = 0; 3937 rc = sqlite3PagerWrite(pTrunk->pDbPage); 3938 if( rc ){ 3939 goto end_allocate_page; 3940 } 3941 if( k==0 ){ 3942 if( !pPrevTrunk ){ 3943 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 3944 }else{ 3945 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 3946 } 3947 }else{ 3948 /* The trunk page is required by the caller but it contains 3949 ** pointers to free-list leaves. The first leaf becomes a trunk 3950 ** page in this case. 3951 */ 3952 MemPage *pNewTrunk; 3953 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 3954 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); 3955 if( rc!=SQLITE_OK ){ 3956 goto end_allocate_page; 3957 } 3958 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 3959 if( rc!=SQLITE_OK ){ 3960 releasePage(pNewTrunk); 3961 goto end_allocate_page; 3962 } 3963 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 3964 put4byte(&pNewTrunk->aData[4], k-1); 3965 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 3966 releasePage(pNewTrunk); 3967 if( !pPrevTrunk ){ 3968 put4byte(&pPage1->aData[32], iNewTrunk); 3969 }else{ 3970 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 3971 if( rc ){ 3972 goto end_allocate_page; 3973 } 3974 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 3975 } 3976 } 3977 pTrunk = 0; 3978 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 3979 #endif 3980 }else{ 3981 /* Extract a leaf from the trunk */ 3982 int closest; 3983 Pgno iPage; 3984 unsigned char *aData = pTrunk->aData; 3985 rc = sqlite3PagerWrite(pTrunk->pDbPage); 3986 if( rc ){ 3987 goto end_allocate_page; 3988 } 3989 if( nearby>0 ){ 3990 int i, dist; 3991 closest = 0; 3992 dist = get4byte(&aData[8]) - nearby; 3993 if( dist<0 ) dist = -dist; 3994 for(i=1; i<k; i++){ 3995 int d2 = get4byte(&aData[8+i*4]) - nearby; 3996 if( d2<0 ) d2 = -d2; 3997 if( d2<dist ){ 3998 closest = i; 3999 dist = d2; 4000 } 4001 } 4002 }else{ 4003 closest = 0; 4004 } 4005 4006 iPage = get4byte(&aData[8+closest*4]); 4007 if( !searchList || iPage==nearby ){ 4008 *pPgno = iPage; 4009 if( *pPgno>sqlite3PagerPagecount(pBt->pPager) ){ 4010 /* Free page off the end of the file */ 4011 return SQLITE_CORRUPT_BKPT; 4012 } 4013 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" 4014 ": %d more free pages\n", 4015 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 4016 if( closest<k-1 ){ 4017 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 4018 } 4019 put4byte(&aData[4], k-1); 4020 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1); 4021 if( rc==SQLITE_OK ){ 4022 sqlite3PagerDontRollback((*ppPage)->pDbPage); 4023 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4024 if( rc!=SQLITE_OK ){ 4025 releasePage(*ppPage); 4026 } 4027 } 4028 searchList = 0; 4029 } 4030 } 4031 releasePage(pPrevTrunk); 4032 pPrevTrunk = 0; 4033 }while( searchList ); 4034 }else{ 4035 /* There are no pages on the freelist, so create a new page at the 4036 ** end of the file */ 4037 *pPgno = sqlite3PagerPagecount(pBt->pPager) + 1; 4038 4039 #ifndef SQLITE_OMIT_AUTOVACUUM 4040 if( pBt->nTrunc ){ 4041 /* An incr-vacuum has already run within this transaction. So the 4042 ** page to allocate is not from the physical end of the file, but 4043 ** at pBt->nTrunc. 4044 */ 4045 *pPgno = pBt->nTrunc+1; 4046 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ 4047 (*pPgno)++; 4048 } 4049 } 4050 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){ 4051 /* If *pPgno refers to a pointer-map page, allocate two new pages 4052 ** at the end of the file instead of one. The first allocated page 4053 ** becomes a new pointer-map page, the second is used by the caller. 4054 */ 4055 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno)); 4056 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4057 (*pPgno)++; 4058 } 4059 if( pBt->nTrunc ){ 4060 pBt->nTrunc = *pPgno; 4061 } 4062 #endif 4063 4064 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4065 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0); 4066 if( rc ) return rc; 4067 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4068 if( rc!=SQLITE_OK ){ 4069 releasePage(*ppPage); 4070 } 4071 TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); 4072 } 4073 4074 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4075 4076 end_allocate_page: 4077 releasePage(pTrunk); 4078 releasePage(pPrevTrunk); 4079 return rc; 4080 } 4081 4082 /* 4083 ** Add a page of the database file to the freelist. 4084 ** 4085 ** sqlite3PagerUnref() is NOT called for pPage. 4086 */ 4087 static int freePage(MemPage *pPage){ 4088 BtShared *pBt = pPage->pBt; 4089 MemPage *pPage1 = pBt->pPage1; 4090 int rc, n, k; 4091 4092 /* Prepare the page for freeing */ 4093 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4094 assert( pPage->pgno>1 ); 4095 pPage->isInit = 0; 4096 releasePage(pPage->pParent); 4097 pPage->pParent = 0; 4098 4099 /* Increment the free page count on pPage1 */ 4100 rc = sqlite3PagerWrite(pPage1->pDbPage); 4101 if( rc ) return rc; 4102 n = get4byte(&pPage1->aData[36]); 4103 put4byte(&pPage1->aData[36], n+1); 4104 4105 #ifdef SQLITE_SECURE_DELETE 4106 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then 4107 ** always fully overwrite deleted information with zeros. 4108 */ 4109 rc = sqlite3PagerWrite(pPage->pDbPage); 4110 if( rc ) return rc; 4111 memset(pPage->aData, 0, pPage->pBt->pageSize); 4112 #endif 4113 4114 #ifndef SQLITE_OMIT_AUTOVACUUM 4115 /* If the database supports auto-vacuum, write an entry in the pointer-map 4116 ** to indicate that the page is free. 4117 */ 4118 if( pBt->autoVacuum ){ 4119 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0); 4120 if( rc ) return rc; 4121 } 4122 #endif 4123 4124 if( n==0 ){ 4125 /* This is the first free page */ 4126 rc = sqlite3PagerWrite(pPage->pDbPage); 4127 if( rc ) return rc; 4128 memset(pPage->aData, 0, 8); 4129 put4byte(&pPage1->aData[32], pPage->pgno); 4130 TRACE(("FREE-PAGE: %d first\n", pPage->pgno)); 4131 }else{ 4132 /* Other free pages already exist. Retrive the first trunk page 4133 ** of the freelist and find out how many leaves it has. */ 4134 MemPage *pTrunk; 4135 rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0); 4136 if( rc ) return rc; 4137 k = get4byte(&pTrunk->aData[4]); 4138 if( k>=pBt->usableSize/4 - 8 ){ 4139 /* The trunk is full. Turn the page being freed into a new 4140 ** trunk page with no leaves. */ 4141 rc = sqlite3PagerWrite(pPage->pDbPage); 4142 if( rc==SQLITE_OK ){ 4143 put4byte(pPage->aData, pTrunk->pgno); 4144 put4byte(&pPage->aData[4], 0); 4145 put4byte(&pPage1->aData[32], pPage->pgno); 4146 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", 4147 pPage->pgno, pTrunk->pgno)); 4148 } 4149 }else if( k<0 ){ 4150 rc = SQLITE_CORRUPT; 4151 }else{ 4152 /* Add the newly freed page as a leaf on the current trunk */ 4153 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4154 if( rc==SQLITE_OK ){ 4155 put4byte(&pTrunk->aData[4], k+1); 4156 put4byte(&pTrunk->aData[8+k*4], pPage->pgno); 4157 #ifndef SQLITE_SECURE_DELETE 4158 sqlite3PagerDontWrite(pPage->pDbPage); 4159 #endif 4160 } 4161 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); 4162 } 4163 releasePage(pTrunk); 4164 } 4165 return rc; 4166 } 4167 4168 /* 4169 ** Free any overflow pages associated with the given Cell. 4170 */ 4171 static int clearCell(MemPage *pPage, unsigned char *pCell){ 4172 BtShared *pBt = pPage->pBt; 4173 CellInfo info; 4174 Pgno ovflPgno; 4175 int rc; 4176 int nOvfl; 4177 int ovflPageSize; 4178 4179 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4180 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4181 if( info.iOverflow==0 ){ 4182 return SQLITE_OK; /* No overflow pages. Return without doing anything */ 4183 } 4184 ovflPgno = get4byte(&pCell[info.iOverflow]); 4185 ovflPageSize = pBt->usableSize - 4; 4186 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; 4187 assert( ovflPgno==0 || nOvfl>0 ); 4188 while( nOvfl-- ){ 4189 MemPage *pOvfl; 4190 if( ovflPgno==0 || ovflPgno>sqlite3PagerPagecount(pBt->pPager) ){ 4191 return SQLITE_CORRUPT_BKPT; 4192 } 4193 4194 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno); 4195 if( rc ) return rc; 4196 rc = freePage(pOvfl); 4197 sqlite3PagerUnref(pOvfl->pDbPage); 4198 if( rc ) return rc; 4199 } 4200 return SQLITE_OK; 4201 } 4202 4203 /* 4204 ** Create the byte sequence used to represent a cell on page pPage 4205 ** and write that byte sequence into pCell[]. Overflow pages are 4206 ** allocated and filled in as necessary. The calling procedure 4207 ** is responsible for making sure sufficient space has been allocated 4208 ** for pCell[]. 4209 ** 4210 ** Note that pCell does not necessary need to point to the pPage->aData 4211 ** area. pCell might point to some temporary storage. The cell will 4212 ** be constructed in this temporary area then copied into pPage->aData 4213 ** later. 4214 */ 4215 static int fillInCell( 4216 MemPage *pPage, /* The page that contains the cell */ 4217 unsigned char *pCell, /* Complete text of the cell */ 4218 const void *pKey, i64 nKey, /* The key */ 4219 const void *pData,int nData, /* The data */ 4220 int nZero, /* Extra zero bytes to append to pData */ 4221 int *pnSize /* Write cell size here */ 4222 ){ 4223 int nPayload; 4224 const u8 *pSrc; 4225 int nSrc, n, rc; 4226 int spaceLeft; 4227 MemPage *pOvfl = 0; 4228 MemPage *pToRelease = 0; 4229 unsigned char *pPrior; 4230 unsigned char *pPayload; 4231 BtShared *pBt = pPage->pBt; 4232 Pgno pgnoOvfl = 0; 4233 int nHeader; 4234 CellInfo info; 4235 4236 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4237 4238 /* Fill in the header. */ 4239 nHeader = 0; 4240 if( !pPage->leaf ){ 4241 nHeader += 4; 4242 } 4243 if( pPage->hasData ){ 4244 nHeader += putVarint(&pCell[nHeader], nData+nZero); 4245 }else{ 4246 nData = nZero = 0; 4247 } 4248 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey); 4249 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4250 assert( info.nHeader==nHeader ); 4251 assert( info.nKey==nKey ); 4252 assert( info.nData==nData+nZero ); 4253 4254 /* Fill in the payload */ 4255 nPayload = nData + nZero; 4256 if( pPage->intKey ){ 4257 pSrc = pData; 4258 nSrc = nData; 4259 nData = 0; 4260 }else{ 4261 nPayload += nKey; 4262 pSrc = pKey; 4263 nSrc = nKey; 4264 } 4265 *pnSize = info.nSize; 4266 spaceLeft = info.nLocal; 4267 pPayload = &pCell[nHeader]; 4268 pPrior = &pCell[info.iOverflow]; 4269 4270 while( nPayload>0 ){ 4271 if( spaceLeft==0 ){ 4272 int isExact = 0; 4273 #ifndef SQLITE_OMIT_AUTOVACUUM 4274 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ 4275 if( pBt->autoVacuum ){ 4276 do{ 4277 pgnoOvfl++; 4278 } while( 4279 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 4280 ); 4281 if( pgnoOvfl>1 ){ 4282 /* isExact = 1; */ 4283 } 4284 } 4285 #endif 4286 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact); 4287 #ifndef SQLITE_OMIT_AUTOVACUUM 4288 /* If the database supports auto-vacuum, and the second or subsequent 4289 ** overflow page is being allocated, add an entry to the pointer-map 4290 ** for that page now. 4291 ** 4292 ** If this is the first overflow page, then write a partial entry 4293 ** to the pointer-map. If we write nothing to this pointer-map slot, 4294 ** then the optimistic overflow chain processing in clearCell() 4295 ** may misinterpret the uninitialised values and delete the 4296 ** wrong pages from the database. 4297 */ 4298 if( pBt->autoVacuum && rc==SQLITE_OK ){ 4299 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); 4300 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap); 4301 if( rc ){ 4302 releasePage(pOvfl); 4303 } 4304 } 4305 #endif 4306 if( rc ){ 4307 releasePage(pToRelease); 4308 return rc; 4309 } 4310 put4byte(pPrior, pgnoOvfl); 4311 releasePage(pToRelease); 4312 pToRelease = pOvfl; 4313 pPrior = pOvfl->aData; 4314 put4byte(pPrior, 0); 4315 pPayload = &pOvfl->aData[4]; 4316 spaceLeft = pBt->usableSize - 4; 4317 } 4318 n = nPayload; 4319 if( n>spaceLeft ) n = spaceLeft; 4320 if( nSrc>0 ){ 4321 if( n>nSrc ) n = nSrc; 4322 assert( pSrc ); 4323 memcpy(pPayload, pSrc, n); 4324 }else{ 4325 memset(pPayload, 0, n); 4326 } 4327 nPayload -= n; 4328 pPayload += n; 4329 pSrc += n; 4330 nSrc -= n; 4331 spaceLeft -= n; 4332 if( nSrc==0 ){ 4333 nSrc = nData; 4334 pSrc = pData; 4335 } 4336 } 4337 releasePage(pToRelease); 4338 return SQLITE_OK; 4339 } 4340 4341 /* 4342 ** Change the MemPage.pParent pointer on the page whose number is 4343 ** given in the second argument so that MemPage.pParent holds the 4344 ** pointer in the third argument. 4345 */ 4346 static int reparentPage(BtShared *pBt, Pgno pgno, MemPage *pNewParent, int idx){ 4347 MemPage *pThis; 4348 DbPage *pDbPage; 4349 4350 assert( sqlite3_mutex_held(pBt->mutex) ); 4351 assert( pNewParent!=0 ); 4352 if( pgno==0 ) return SQLITE_OK; 4353 assert( pBt->pPager!=0 ); 4354 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 4355 if( pDbPage ){ 4356 pThis = (MemPage *)sqlite3PagerGetExtra(pDbPage); 4357 if( pThis->isInit ){ 4358 assert( pThis->aData==sqlite3PagerGetData(pDbPage) ); 4359 if( pThis->pParent!=pNewParent ){ 4360 if( pThis->pParent ) sqlite3PagerUnref(pThis->pParent->pDbPage); 4361 pThis->pParent = pNewParent; 4362 sqlite3PagerRef(pNewParent->pDbPage); 4363 } 4364 pThis->idxParent = idx; 4365 } 4366 sqlite3PagerUnref(pDbPage); 4367 } 4368 4369 #ifndef SQLITE_OMIT_AUTOVACUUM 4370 if( pBt->autoVacuum ){ 4371 return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno); 4372 } 4373 #endif 4374 return SQLITE_OK; 4375 } 4376 4377 4378 4379 /* 4380 ** Change the pParent pointer of all children of pPage to point back 4381 ** to pPage. 4382 ** 4383 ** In other words, for every child of pPage, invoke reparentPage() 4384 ** to make sure that each child knows that pPage is its parent. 4385 ** 4386 ** This routine gets called after you memcpy() one page into 4387 ** another. 4388 */ 4389 static int reparentChildPages(MemPage *pPage){ 4390 int i; 4391 BtShared *pBt = pPage->pBt; 4392 int rc = SQLITE_OK; 4393 4394 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4395 if( pPage->leaf ) return SQLITE_OK; 4396 4397 for(i=0; i<pPage->nCell; i++){ 4398 u8 *pCell = findCell(pPage, i); 4399 if( !pPage->leaf ){ 4400 rc = reparentPage(pBt, get4byte(pCell), pPage, i); 4401 if( rc!=SQLITE_OK ) return rc; 4402 } 4403 } 4404 if( !pPage->leaf ){ 4405 rc = reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]), 4406 pPage, i); 4407 pPage->idxShift = 0; 4408 } 4409 return rc; 4410 } 4411 4412 /* 4413 ** Remove the i-th cell from pPage. This routine effects pPage only. 4414 ** The cell content is not freed or deallocated. It is assumed that 4415 ** the cell content has been copied someplace else. This routine just 4416 ** removes the reference to the cell from pPage. 4417 ** 4418 ** "sz" must be the number of bytes in the cell. 4419 */ 4420 static void dropCell(MemPage *pPage, int idx, int sz){ 4421 int i; /* Loop counter */ 4422 int pc; /* Offset to cell content of cell being deleted */ 4423 u8 *data; /* pPage->aData */ 4424 u8 *ptr; /* Used to move bytes around within data[] */ 4425 4426 assert( idx>=0 && idx<pPage->nCell ); 4427 assert( sz==cellSize(pPage, idx) ); 4428 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 4429 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4430 data = pPage->aData; 4431 ptr = &data[pPage->cellOffset + 2*idx]; 4432 pc = get2byte(ptr); 4433 assert( pc>10 && pc+sz<=pPage->pBt->usableSize ); 4434 freeSpace(pPage, pc, sz); 4435 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){ 4436 ptr[0] = ptr[2]; 4437 ptr[1] = ptr[3]; 4438 } 4439 pPage->nCell--; 4440 put2byte(&data[pPage->hdrOffset+3], pPage->nCell); 4441 pPage->nFree += 2; 4442 pPage->idxShift = 1; 4443 } 4444 4445 /* 4446 ** Insert a new cell on pPage at cell index "i". pCell points to the 4447 ** content of the cell. 4448 ** 4449 ** If the cell content will fit on the page, then put it there. If it 4450 ** will not fit, then make a copy of the cell content into pTemp if 4451 ** pTemp is not null. Regardless of pTemp, allocate a new entry 4452 ** in pPage->aOvfl[] and make it point to the cell content (either 4453 ** in pTemp or the original pCell) and also record its index. 4454 ** Allocating a new entry in pPage->aCell[] implies that 4455 ** pPage->nOverflow is incremented. 4456 ** 4457 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the 4458 ** cell. The caller will overwrite them after this function returns. If 4459 ** nSkip is non-zero, then pCell may not point to an invalid memory location 4460 ** (but pCell+nSkip is always valid). 4461 */ 4462 static int insertCell( 4463 MemPage *pPage, /* Page into which we are copying */ 4464 int i, /* New cell becomes the i-th cell of the page */ 4465 u8 *pCell, /* Content of the new cell */ 4466 int sz, /* Bytes of content in pCell */ 4467 u8 *pTemp, /* Temp storage space for pCell, if needed */ 4468 u8 nSkip /* Do not write the first nSkip bytes of the cell */ 4469 ){ 4470 int idx; /* Where to write new cell content in data[] */ 4471 int j; /* Loop counter */ 4472 int top; /* First byte of content for any cell in data[] */ 4473 int end; /* First byte past the last cell pointer in data[] */ 4474 int ins; /* Index in data[] where new cell pointer is inserted */ 4475 int hdr; /* Offset into data[] of the page header */ 4476 int cellOffset; /* Address of first cell pointer in data[] */ 4477 u8 *data; /* The content of the whole page */ 4478 u8 *ptr; /* Used for moving information around in data[] */ 4479 4480 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 4481 assert( sz==cellSizePtr(pPage, pCell) ); 4482 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4483 if( pPage->nOverflow || sz+2>pPage->nFree ){ 4484 if( pTemp ){ 4485 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); 4486 pCell = pTemp; 4487 } 4488 j = pPage->nOverflow++; 4489 assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) ); 4490 pPage->aOvfl[j].pCell = pCell; 4491 pPage->aOvfl[j].idx = i; 4492 pPage->nFree = 0; 4493 }else{ 4494 int rc = sqlite3PagerWrite(pPage->pDbPage); 4495 if( rc!=SQLITE_OK ){ 4496 return rc; 4497 } 4498 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 4499 data = pPage->aData; 4500 hdr = pPage->hdrOffset; 4501 top = get2byte(&data[hdr+5]); 4502 cellOffset = pPage->cellOffset; 4503 end = cellOffset + 2*pPage->nCell + 2; 4504 ins = cellOffset + 2*i; 4505 if( end > top - sz ){ 4506 rc = defragmentPage(pPage); 4507 if( rc!=SQLITE_OK ) return rc; 4508 top = get2byte(&data[hdr+5]); 4509 assert( end + sz <= top ); 4510 } 4511 idx = allocateSpace(pPage, sz); 4512 assert( idx>0 ); 4513 assert( end <= get2byte(&data[hdr+5]) ); 4514 pPage->nCell++; 4515 pPage->nFree -= 2; 4516 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); 4517 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){ 4518 ptr[0] = ptr[-2]; 4519 ptr[1] = ptr[-1]; 4520 } 4521 put2byte(&data[ins], idx); 4522 put2byte(&data[hdr+3], pPage->nCell); 4523 pPage->idxShift = 1; 4524 #ifndef SQLITE_OMIT_AUTOVACUUM 4525 if( pPage->pBt->autoVacuum ){ 4526 /* The cell may contain a pointer to an overflow page. If so, write 4527 ** the entry for the overflow page into the pointer map. 4528 */ 4529 CellInfo info; 4530 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4531 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); 4532 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ 4533 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); 4534 rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno); 4535 if( rc!=SQLITE_OK ) return rc; 4536 } 4537 } 4538 #endif 4539 } 4540 4541 return SQLITE_OK; 4542 } 4543 4544 /* 4545 ** Add a list of cells to a page. The page should be initially empty. 4546 ** The cells are guaranteed to fit on the page. 4547 */ 4548 static void assemblePage( 4549 MemPage *pPage, /* The page to be assemblied */ 4550 int nCell, /* The number of cells to add to this page */ 4551 u8 **apCell, /* Pointers to cell bodies */ 4552 int *aSize /* Sizes of the cells */ 4553 ){ 4554 int i; /* Loop counter */ 4555 int totalSize; /* Total size of all cells */ 4556 int hdr; /* Index of page header */ 4557 int cellptr; /* Address of next cell pointer */ 4558 int cellbody; /* Address of next cell body */ 4559 u8 *data; /* Data for the page */ 4560 4561 assert( pPage->nOverflow==0 ); 4562 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4563 totalSize = 0; 4564 for(i=0; i<nCell; i++){ 4565 totalSize += aSize[i]; 4566 } 4567 assert( totalSize+2*nCell<=pPage->nFree ); 4568 assert( pPage->nCell==0 ); 4569 cellptr = pPage->cellOffset; 4570 data = pPage->aData; 4571 hdr = pPage->hdrOffset; 4572 put2byte(&data[hdr+3], nCell); 4573 if( nCell ){ 4574 cellbody = allocateSpace(pPage, totalSize); 4575 assert( cellbody>0 ); 4576 assert( pPage->nFree >= 2*nCell ); 4577 pPage->nFree -= 2*nCell; 4578 for(i=0; i<nCell; i++){ 4579 put2byte(&data[cellptr], cellbody); 4580 memcpy(&data[cellbody], apCell[i], aSize[i]); 4581 cellptr += 2; 4582 cellbody += aSize[i]; 4583 } 4584 assert( cellbody==pPage->pBt->usableSize ); 4585 } 4586 pPage->nCell = nCell; 4587 } 4588 4589 /* 4590 ** The following parameters determine how many adjacent pages get involved 4591 ** in a balancing operation. NN is the number of neighbors on either side 4592 ** of the page that participate in the balancing operation. NB is the 4593 ** total number of pages that participate, including the target page and 4594 ** NN neighbors on either side. 4595 ** 4596 ** The minimum value of NN is 1 (of course). Increasing NN above 1 4597 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 4598 ** in exchange for a larger degradation in INSERT and UPDATE performance. 4599 ** The value of NN appears to give the best results overall. 4600 */ 4601 #define NN 1 /* Number of neighbors on either side of pPage */ 4602 #define NB (NN*2+1) /* Total pages involved in the balance */ 4603 4604 /* Forward reference */ 4605 static int balance(MemPage*, int); 4606 4607 #ifndef SQLITE_OMIT_QUICKBALANCE 4608 /* 4609 ** This version of balance() handles the common special case where 4610 ** a new entry is being inserted on the extreme right-end of the 4611 ** tree, in other words, when the new entry will become the largest 4612 ** entry in the tree. 4613 ** 4614 ** Instead of trying balance the 3 right-most leaf pages, just add 4615 ** a new page to the right-hand side and put the one new entry in 4616 ** that page. This leaves the right side of the tree somewhat 4617 ** unbalanced. But odds are that we will be inserting new entries 4618 ** at the end soon afterwards so the nearly empty page will quickly 4619 ** fill up. On average. 4620 ** 4621 ** pPage is the leaf page which is the right-most page in the tree. 4622 ** pParent is its parent. pPage must have a single overflow entry 4623 ** which is also the right-most entry on the page. 4624 */ 4625 static int balance_quick(MemPage *pPage, MemPage *pParent){ 4626 int rc; 4627 MemPage *pNew; 4628 Pgno pgnoNew; 4629 u8 *pCell; 4630 int szCell; 4631 CellInfo info; 4632 BtShared *pBt = pPage->pBt; 4633 int parentIdx = pParent->nCell; /* pParent new divider cell index */ 4634 int parentSize; /* Size of new divider cell */ 4635 u8 parentCell[64]; /* Space for the new divider cell */ 4636 4637 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4638 4639 /* Allocate a new page. Insert the overflow cell from pPage 4640 ** into it. Then remove the overflow cell from pPage. 4641 */ 4642 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 4643 if( rc!=SQLITE_OK ){ 4644 return rc; 4645 } 4646 pCell = pPage->aOvfl[0].pCell; 4647 szCell = cellSizePtr(pPage, pCell); 4648 zeroPage(pNew, pPage->aData[0]); 4649 assemblePage(pNew, 1, &pCell, &szCell); 4650 pPage->nOverflow = 0; 4651 4652 /* Set the parent of the newly allocated page to pParent. */ 4653 pNew->pParent = pParent; 4654 sqlite3PagerRef(pParent->pDbPage); 4655 4656 /* pPage is currently the right-child of pParent. Change this 4657 ** so that the right-child is the new page allocated above and 4658 ** pPage is the next-to-right child. 4659 */ 4660 assert( pPage->nCell>0 ); 4661 pCell = findCell(pPage, pPage->nCell-1); 4662 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4663 rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize); 4664 if( rc!=SQLITE_OK ){ 4665 return rc; 4666 } 4667 assert( parentSize<64 ); 4668 rc = insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4); 4669 if( rc!=SQLITE_OK ){ 4670 return rc; 4671 } 4672 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno); 4673 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 4674 4675 #ifndef SQLITE_OMIT_AUTOVACUUM 4676 /* If this is an auto-vacuum database, update the pointer map 4677 ** with entries for the new page, and any pointer from the 4678 ** cell on the page to an overflow page. 4679 */ 4680 if( pBt->autoVacuum ){ 4681 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno); 4682 if( rc==SQLITE_OK ){ 4683 rc = ptrmapPutOvfl(pNew, 0); 4684 } 4685 if( rc!=SQLITE_OK ){ 4686 releasePage(pNew); 4687 return rc; 4688 } 4689 } 4690 #endif 4691 4692 /* Release the reference to the new page and balance the parent page, 4693 ** in case the divider cell inserted caused it to become overfull. 4694 */ 4695 releasePage(pNew); 4696 return balance(pParent, 0); 4697 } 4698 #endif /* SQLITE_OMIT_QUICKBALANCE */ 4699 4700 /* 4701 ** This routine redistributes Cells on pPage and up to NN*2 siblings 4702 ** of pPage so that all pages have about the same amount of free space. 4703 ** Usually NN siblings on either side of pPage is used in the balancing, 4704 ** though more siblings might come from one side if pPage is the first 4705 ** or last child of its parent. If pPage has fewer than 2*NN siblings 4706 ** (something which can only happen if pPage is the root page or a 4707 ** child of root) then all available siblings participate in the balancing. 4708 ** 4709 ** The number of siblings of pPage might be increased or decreased by one or 4710 ** two in an effort to keep pages nearly full but not over full. The root page 4711 ** is special and is allowed to be nearly empty. If pPage is 4712 ** the root page, then the depth of the tree might be increased 4713 ** or decreased by one, as necessary, to keep the root page from being 4714 ** overfull or completely empty. 4715 ** 4716 ** Note that when this routine is called, some of the Cells on pPage 4717 ** might not actually be stored in pPage->aData[]. This can happen 4718 ** if the page is overfull. Part of the job of this routine is to 4719 ** make sure all Cells for pPage once again fit in pPage->aData[]. 4720 ** 4721 ** In the course of balancing the siblings of pPage, the parent of pPage 4722 ** might become overfull or underfull. If that happens, then this routine 4723 ** is called recursively on the parent. 4724 ** 4725 ** If this routine fails for any reason, it might leave the database 4726 ** in a corrupted state. So if this routine fails, the database should 4727 ** be rolled back. 4728 */ 4729 static int balance_nonroot(MemPage *pPage){ 4730 MemPage *pParent; /* The parent of pPage */ 4731 BtShared *pBt; /* The whole database */ 4732 int nCell = 0; /* Number of cells in apCell[] */ 4733 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 4734 int nOld; /* Number of pages in apOld[] */ 4735 int nNew; /* Number of pages in apNew[] */ 4736 int nDiv; /* Number of cells in apDiv[] */ 4737 int i, j, k; /* Loop counters */ 4738 int idx; /* Index of pPage in pParent->aCell[] */ 4739 int nxDiv; /* Next divider slot in pParent->aCell[] */ 4740 int rc; /* The return code */ 4741 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */ 4742 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 4743 int usableSpace; /* Bytes in pPage beyond the header */ 4744 int pageFlags; /* Value of pPage->aData[0] */ 4745 int subtotal; /* Subtotal of bytes in cells on one page */ 4746 int iSpace = 0; /* First unused byte of aSpace[] */ 4747 MemPage *apOld[NB]; /* pPage and up to two siblings */ 4748 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */ 4749 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ 4750 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 4751 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */ 4752 u8 *apDiv[NB]; /* Divider cells in pParent */ 4753 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ 4754 int szNew[NB+2]; /* Combined size of cells place on i-th page */ 4755 u8 **apCell = 0; /* All cells begin balanced */ 4756 int *szCell; /* Local size of all cells in apCell[] */ 4757 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */ 4758 u8 *aSpace; /* Space to hold copies of dividers cells */ 4759 #ifndef SQLITE_OMIT_AUTOVACUUM 4760 u8 *aFrom = 0; 4761 #endif 4762 4763 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4764 4765 /* 4766 ** Find the parent page. 4767 */ 4768 assert( pPage->isInit ); 4769 assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 ); 4770 pBt = pPage->pBt; 4771 pParent = pPage->pParent; 4772 assert( pParent ); 4773 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){ 4774 return rc; 4775 } 4776 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); 4777 4778 #ifndef SQLITE_OMIT_QUICKBALANCE 4779 /* 4780 ** A special case: If a new entry has just been inserted into a 4781 ** table (that is, a btree with integer keys and all data at the leaves) 4782 ** and the new entry is the right-most entry in the tree (it has the 4783 ** largest key) then use the special balance_quick() routine for 4784 ** balancing. balance_quick() is much faster and results in a tighter 4785 ** packing of data in the common case. 4786 */ 4787 if( pPage->leaf && 4788 pPage->intKey && 4789 pPage->leafData && 4790 pPage->nOverflow==1 && 4791 pPage->aOvfl[0].idx==pPage->nCell && 4792 pPage->pParent->pgno!=1 && 4793 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno 4794 ){ 4795 /* 4796 ** TODO: Check the siblings to the left of pPage. It may be that 4797 ** they are not full and no new page is required. 4798 */ 4799 return balance_quick(pPage, pParent); 4800 } 4801 #endif 4802 4803 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){ 4804 return rc; 4805 } 4806 4807 /* 4808 ** Find the cell in the parent page whose left child points back 4809 ** to pPage. The "idx" variable is the index of that cell. If pPage 4810 ** is the rightmost child of pParent then set idx to pParent->nCell 4811 */ 4812 if( pParent->idxShift ){ 4813 Pgno pgno; 4814 pgno = pPage->pgno; 4815 assert( pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 4816 for(idx=0; idx<pParent->nCell; idx++){ 4817 if( get4byte(findCell(pParent, idx))==pgno ){ 4818 break; 4819 } 4820 } 4821 assert( idx<pParent->nCell 4822 || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno ); 4823 }else{ 4824 idx = pPage->idxParent; 4825 } 4826 4827 /* 4828 ** Initialize variables so that it will be safe to jump 4829 ** directly to balance_cleanup at any moment. 4830 */ 4831 nOld = nNew = 0; 4832 sqlite3PagerRef(pParent->pDbPage); 4833 4834 /* 4835 ** Find sibling pages to pPage and the cells in pParent that divide 4836 ** the siblings. An attempt is made to find NN siblings on either 4837 ** side of pPage. More siblings are taken from one side, however, if 4838 ** pPage there are fewer than NN siblings on the other side. If pParent 4839 ** has NB or fewer children then all children of pParent are taken. 4840 */ 4841 nxDiv = idx - NN; 4842 if( nxDiv + NB > pParent->nCell ){ 4843 nxDiv = pParent->nCell - NB + 1; 4844 } 4845 if( nxDiv<0 ){ 4846 nxDiv = 0; 4847 } 4848 nDiv = 0; 4849 for(i=0, k=nxDiv; i<NB; i++, k++){ 4850 if( k<pParent->nCell ){ 4851 apDiv[i] = findCell(pParent, k); 4852 nDiv++; 4853 assert( !pParent->leaf ); 4854 pgnoOld[i] = get4byte(apDiv[i]); 4855 }else if( k==pParent->nCell ){ 4856 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]); 4857 }else{ 4858 break; 4859 } 4860 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent); 4861 if( rc ) goto balance_cleanup; 4862 apOld[i]->idxParent = k; 4863 apCopy[i] = 0; 4864 assert( i==nOld ); 4865 nOld++; 4866 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; 4867 } 4868 4869 /* Make nMaxCells a multiple of 2 in order to preserve 8-byte 4870 ** alignment */ 4871 nMaxCells = (nMaxCells + 1)&~1; 4872 4873 /* 4874 ** Allocate space for memory structures 4875 */ 4876 apCell = sqlite3_malloc( 4877 nMaxCells*sizeof(u8*) /* apCell */ 4878 + nMaxCells*sizeof(int) /* szCell */ 4879 + ROUND8(sizeof(MemPage))*NB /* aCopy */ 4880 + pBt->pageSize*(5+NB) /* aSpace */ 4881 + (ISAUTOVACUUM ? nMaxCells : 0) /* aFrom */ 4882 ); 4883 if( apCell==0 ){ 4884 rc = SQLITE_NOMEM; 4885 goto balance_cleanup; 4886 } 4887 szCell = (int*)&apCell[nMaxCells]; 4888 aCopy[0] = (u8*)&szCell[nMaxCells]; 4889 assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */ 4890 for(i=1; i<NB; i++){ 4891 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))]; 4892 assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */ 4893 } 4894 aSpace = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))]; 4895 assert( ((aSpace - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */ 4896 #ifndef SQLITE_OMIT_AUTOVACUUM 4897 if( pBt->autoVacuum ){ 4898 aFrom = &aSpace[5*pBt->pageSize]; 4899 } 4900 #endif 4901 4902 /* 4903 ** Make copies of the content of pPage and its siblings into aOld[]. 4904 ** The rest of this function will use data from the copies rather 4905 ** that the original pages since the original pages will be in the 4906 ** process of being overwritten. 4907 */ 4908 for(i=0; i<nOld; i++){ 4909 MemPage *p = apCopy[i] = (MemPage*)aCopy[i]; 4910 memcpy(p, apOld[i], sizeof(MemPage)); 4911 p->aData = (void*)&p[1]; 4912 memcpy(p->aData, apOld[i]->aData, pBt->pageSize); 4913 } 4914 4915 /* 4916 ** Load pointers to all cells on sibling pages and the divider cells 4917 ** into the local apCell[] array. Make copies of the divider cells 4918 ** into space obtained form aSpace[] and remove the the divider Cells 4919 ** from pParent. 4920 ** 4921 ** If the siblings are on leaf pages, then the child pointers of the 4922 ** divider cells are stripped from the cells before they are copied 4923 ** into aSpace[]. In this way, all cells in apCell[] are without 4924 ** child pointers. If siblings are not leaves, then all cell in 4925 ** apCell[] include child pointers. Either way, all cells in apCell[] 4926 ** are alike. 4927 ** 4928 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 4929 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 4930 */ 4931 nCell = 0; 4932 leafCorrection = pPage->leaf*4; 4933 leafData = pPage->leafData && pPage->leaf; 4934 for(i=0; i<nOld; i++){ 4935 MemPage *pOld = apCopy[i]; 4936 int limit = pOld->nCell+pOld->nOverflow; 4937 for(j=0; j<limit; j++){ 4938 assert( nCell<nMaxCells ); 4939 apCell[nCell] = findOverflowCell(pOld, j); 4940 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); 4941 #ifndef SQLITE_OMIT_AUTOVACUUM 4942 if( pBt->autoVacuum ){ 4943 int a; 4944 aFrom[nCell] = i; 4945 for(a=0; a<pOld->nOverflow; a++){ 4946 if( pOld->aOvfl[a].pCell==apCell[nCell] ){ 4947 aFrom[nCell] = 0xFF; 4948 break; 4949 } 4950 } 4951 } 4952 #endif 4953 nCell++; 4954 } 4955 if( i<nOld-1 ){ 4956 int sz = cellSizePtr(pParent, apDiv[i]); 4957 if( leafData ){ 4958 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that 4959 ** are duplicates of keys on the child pages. We need to remove 4960 ** the divider cells from pParent, but the dividers cells are not 4961 ** added to apCell[] because they are duplicates of child cells. 4962 */ 4963 dropCell(pParent, nxDiv, sz); 4964 }else{ 4965 u8 *pTemp; 4966 assert( nCell<nMaxCells ); 4967 szCell[nCell] = sz; 4968 pTemp = &aSpace[iSpace]; 4969 iSpace += sz; 4970 assert( iSpace<=pBt->pageSize*5 ); 4971 memcpy(pTemp, apDiv[i], sz); 4972 apCell[nCell] = pTemp+leafCorrection; 4973 #ifndef SQLITE_OMIT_AUTOVACUUM 4974 if( pBt->autoVacuum ){ 4975 aFrom[nCell] = 0xFF; 4976 } 4977 #endif 4978 dropCell(pParent, nxDiv, sz); 4979 szCell[nCell] -= leafCorrection; 4980 assert( get4byte(pTemp)==pgnoOld[i] ); 4981 if( !pOld->leaf ){ 4982 assert( leafCorrection==0 ); 4983 /* The right pointer of the child page pOld becomes the left 4984 ** pointer of the divider cell */ 4985 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4); 4986 }else{ 4987 assert( leafCorrection==4 ); 4988 if( szCell[nCell]<4 ){ 4989 /* Do not allow any cells smaller than 4 bytes. */ 4990 szCell[nCell] = 4; 4991 } 4992 } 4993 nCell++; 4994 } 4995 } 4996 } 4997 4998 /* 4999 ** Figure out the number of pages needed to hold all nCell cells. 5000 ** Store this number in "k". Also compute szNew[] which is the total 5001 ** size of all cells on the i-th page and cntNew[] which is the index 5002 ** in apCell[] of the cell that divides page i from page i+1. 5003 ** cntNew[k] should equal nCell. 5004 ** 5005 ** Values computed by this block: 5006 ** 5007 ** k: The total number of sibling pages 5008 ** szNew[i]: Spaced used on the i-th sibling page. 5009 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to 5010 ** the right of the i-th sibling page. 5011 ** usableSpace: Number of bytes of space available on each sibling. 5012 ** 5013 */ 5014 usableSpace = pBt->usableSize - 12 + leafCorrection; 5015 for(subtotal=k=i=0; i<nCell; i++){ 5016 assert( i<nMaxCells ); 5017 subtotal += szCell[i] + 2; 5018 if( subtotal > usableSpace ){ 5019 szNew[k] = subtotal - szCell[i]; 5020 cntNew[k] = i; 5021 if( leafData ){ i--; } 5022 subtotal = 0; 5023 k++; 5024 } 5025 } 5026 szNew[k] = subtotal; 5027 cntNew[k] = nCell; 5028 k++; 5029 5030 /* 5031 ** The packing computed by the previous block is biased toward the siblings 5032 ** on the left side. The left siblings are always nearly full, while the 5033 ** right-most sibling might be nearly empty. This block of code attempts 5034 ** to adjust the packing of siblings to get a better balance. 5035 ** 5036 ** This adjustment is more than an optimization. The packing above might 5037 ** be so out of balance as to be illegal. For example, the right-most 5038 ** sibling might be completely empty. This adjustment is not optional. 5039 */ 5040 for(i=k-1; i>0; i--){ 5041 int szRight = szNew[i]; /* Size of sibling on the right */ 5042 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 5043 int r; /* Index of right-most cell in left sibling */ 5044 int d; /* Index of first cell to the left of right sibling */ 5045 5046 r = cntNew[i-1] - 1; 5047 d = r + 1 - leafData; 5048 assert( d<nMaxCells ); 5049 assert( r<nMaxCells ); 5050 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){ 5051 szRight += szCell[d] + 2; 5052 szLeft -= szCell[r] + 2; 5053 cntNew[i-1]--; 5054 r = cntNew[i-1] - 1; 5055 d = r + 1 - leafData; 5056 } 5057 szNew[i] = szRight; 5058 szNew[i-1] = szLeft; 5059 } 5060 5061 /* Either we found one or more cells (cntnew[0])>0) or we are the 5062 ** a virtual root page. A virtual root page is when the real root 5063 ** page is page 1 and we are the only child of that page. 5064 */ 5065 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); 5066 5067 /* 5068 ** Allocate k new pages. Reuse old pages where possible. 5069 */ 5070 assert( pPage->pgno>1 ); 5071 pageFlags = pPage->aData[0]; 5072 for(i=0; i<k; i++){ 5073 MemPage *pNew; 5074 if( i<nOld ){ 5075 pNew = apNew[i] = apOld[i]; 5076 pgnoNew[i] = pgnoOld[i]; 5077 apOld[i] = 0; 5078 rc = sqlite3PagerWrite(pNew->pDbPage); 5079 nNew++; 5080 if( rc ) goto balance_cleanup; 5081 }else{ 5082 assert( i>0 ); 5083 rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0); 5084 if( rc ) goto balance_cleanup; 5085 apNew[i] = pNew; 5086 nNew++; 5087 } 5088 zeroPage(pNew, pageFlags); 5089 } 5090 5091 /* Free any old pages that were not reused as new pages. 5092 */ 5093 while( i<nOld ){ 5094 rc = freePage(apOld[i]); 5095 if( rc ) goto balance_cleanup; 5096 releasePage(apOld[i]); 5097 apOld[i] = 0; 5098 i++; 5099 } 5100 5101 /* 5102 ** Put the new pages in accending order. This helps to 5103 ** keep entries in the disk file in order so that a scan 5104 ** of the table is a linear scan through the file. That 5105 ** in turn helps the operating system to deliver pages 5106 ** from the disk more rapidly. 5107 ** 5108 ** An O(n^2) insertion sort algorithm is used, but since 5109 ** n is never more than NB (a small constant), that should 5110 ** not be a problem. 5111 ** 5112 ** When NB==3, this one optimization makes the database 5113 ** about 25% faster for large insertions and deletions. 5114 */ 5115 for(i=0; i<k-1; i++){ 5116 int minV = pgnoNew[i]; 5117 int minI = i; 5118 for(j=i+1; j<k; j++){ 5119 if( pgnoNew[j]<(unsigned)minV ){ 5120 minI = j; 5121 minV = pgnoNew[j]; 5122 } 5123 } 5124 if( minI>i ){ 5125 int t; 5126 MemPage *pT; 5127 t = pgnoNew[i]; 5128 pT = apNew[i]; 5129 pgnoNew[i] = pgnoNew[minI]; 5130 apNew[i] = apNew[minI]; 5131 pgnoNew[minI] = t; 5132 apNew[minI] = pT; 5133 } 5134 } 5135 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", 5136 pgnoOld[0], 5137 nOld>=2 ? pgnoOld[1] : 0, 5138 nOld>=3 ? pgnoOld[2] : 0, 5139 pgnoNew[0], szNew[0], 5140 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0, 5141 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0, 5142 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0, 5143 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0)); 5144 5145 /* 5146 ** Evenly distribute the data in apCell[] across the new pages. 5147 ** Insert divider cells into pParent as necessary. 5148 */ 5149 j = 0; 5150 for(i=0; i<nNew; i++){ 5151 /* Assemble the new sibling page. */ 5152 MemPage *pNew = apNew[i]; 5153 assert( j<nMaxCells ); 5154 assert( pNew->pgno==pgnoNew[i] ); 5155 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); 5156 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); 5157 assert( pNew->nOverflow==0 ); 5158 5159 #ifndef SQLITE_OMIT_AUTOVACUUM 5160 /* If this is an auto-vacuum database, update the pointer map entries 5161 ** that point to the siblings that were rearranged. These can be: left 5162 ** children of cells, the right-child of the page, or overflow pages 5163 ** pointed to by cells. 5164 */ 5165 if( pBt->autoVacuum ){ 5166 for(k=j; k<cntNew[i]; k++){ 5167 assert( k<nMaxCells ); 5168 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){ 5169 rc = ptrmapPutOvfl(pNew, k-j); 5170 if( rc!=SQLITE_OK ){ 5171 goto balance_cleanup; 5172 } 5173 } 5174 } 5175 } 5176 #endif 5177 5178 j = cntNew[i]; 5179 5180 /* If the sibling page assembled above was not the right-most sibling, 5181 ** insert a divider cell into the parent page. 5182 */ 5183 if( i<nNew-1 && j<nCell ){ 5184 u8 *pCell; 5185 u8 *pTemp; 5186 int sz; 5187 5188 assert( j<nMaxCells ); 5189 pCell = apCell[j]; 5190 sz = szCell[j] + leafCorrection; 5191 if( !pNew->leaf ){ 5192 memcpy(&pNew->aData[8], pCell, 4); 5193 pTemp = 0; 5194 }else if( leafData ){ 5195 /* If the tree is a leaf-data tree, and the siblings are leaves, 5196 ** then there is no divider cell in apCell[]. Instead, the divider 5197 ** cell consists of the integer key for the right-most cell of 5198 ** the sibling-page assembled above only. 5199 */ 5200 CellInfo info; 5201 j--; 5202 sqlite3BtreeParseCellPtr(pNew, apCell[j], &info); 5203 pCell = &aSpace[iSpace]; 5204 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz); 5205 iSpace += sz; 5206 assert( iSpace<=pBt->pageSize*5 ); 5207 pTemp = 0; 5208 }else{ 5209 pCell -= 4; 5210 pTemp = &aSpace[iSpace]; 5211 iSpace += sz; 5212 assert( iSpace<=pBt->pageSize*5 ); 5213 /* Obscure case for non-leaf-data trees: If the cell at pCell was 5214 ** previously stored on a leaf node, and it's reported size was 4 5215 ** bytes, then it may actually be smaller than this 5216 ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of 5217 ** any cell). But it's important to pass the correct size to 5218 ** insertCell(), so reparse the cell now. 5219 ** 5220 ** Note that this can never happen in an SQLite data file, as all 5221 ** cells are at least 4 bytes. It only happens in b-trees used 5222 ** to evaluate "IN (SELECT ...)" and similar clauses. 5223 */ 5224 if( szCell[j]==4 ){ 5225 assert(leafCorrection==4); 5226 sz = cellSizePtr(pParent, pCell); 5227 } 5228 } 5229 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4); 5230 if( rc!=SQLITE_OK ) goto balance_cleanup; 5231 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno); 5232 #ifndef SQLITE_OMIT_AUTOVACUUM 5233 /* If this is an auto-vacuum database, and not a leaf-data tree, 5234 ** then update the pointer map with an entry for the overflow page 5235 ** that the cell just inserted points to (if any). 5236 */ 5237 if( pBt->autoVacuum && !leafData ){ 5238 rc = ptrmapPutOvfl(pParent, nxDiv); 5239 if( rc!=SQLITE_OK ){ 5240 goto balance_cleanup; 5241 } 5242 } 5243 #endif 5244 j++; 5245 nxDiv++; 5246 } 5247 } 5248 assert( j==nCell ); 5249 assert( nOld>0 ); 5250 assert( nNew>0 ); 5251 if( (pageFlags & PTF_LEAF)==0 ){ 5252 memcpy(&apNew[nNew-1]->aData[8], &apCopy[nOld-1]->aData[8], 4); 5253 } 5254 if( nxDiv==pParent->nCell+pParent->nOverflow ){ 5255 /* Right-most sibling is the right-most child of pParent */ 5256 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]); 5257 }else{ 5258 /* Right-most sibling is the left child of the first entry in pParent 5259 ** past the right-most divider entry */ 5260 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]); 5261 } 5262 5263 /* 5264 ** Reparent children of all cells. 5265 */ 5266 for(i=0; i<nNew; i++){ 5267 rc = reparentChildPages(apNew[i]); 5268 if( rc!=SQLITE_OK ) goto balance_cleanup; 5269 } 5270 rc = reparentChildPages(pParent); 5271 if( rc!=SQLITE_OK ) goto balance_cleanup; 5272 5273 /* 5274 ** Balance the parent page. Note that the current page (pPage) might 5275 ** have been added to the freelist so it might no longer be initialized. 5276 ** But the parent page will always be initialized. 5277 */ 5278 assert( pParent->isInit ); 5279 rc = balance(pParent, 0); 5280 5281 /* 5282 ** Cleanup before returning. 5283 */ 5284 balance_cleanup: 5285 sqlite3_free(apCell); 5286 for(i=0; i<nOld; i++){ 5287 releasePage(apOld[i]); 5288 } 5289 for(i=0; i<nNew; i++){ 5290 releasePage(apNew[i]); 5291 } 5292 releasePage(pParent); 5293 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n", 5294 pPage->pgno, nOld, nNew, nCell)); 5295 return rc; 5296 } 5297 5298 /* 5299 ** This routine is called for the root page of a btree when the root 5300 ** page contains no cells. This is an opportunity to make the tree 5301 ** shallower by one level. 5302 */ 5303 static int balance_shallower(MemPage *pPage){ 5304 MemPage *pChild; /* The only child page of pPage */ 5305 Pgno pgnoChild; /* Page number for pChild */ 5306 int rc = SQLITE_OK; /* Return code from subprocedures */ 5307 BtShared *pBt; /* The main BTree structure */ 5308 int mxCellPerPage; /* Maximum number of cells per page */ 5309 u8 **apCell; /* All cells from pages being balanced */ 5310 int *szCell; /* Local size of all cells */ 5311 5312 assert( pPage->pParent==0 ); 5313 assert( pPage->nCell==0 ); 5314 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5315 pBt = pPage->pBt; 5316 mxCellPerPage = MX_CELL(pBt); 5317 apCell = sqlite3_malloc( mxCellPerPage*(sizeof(u8*)+sizeof(int)) ); 5318 if( apCell==0 ) return SQLITE_NOMEM; 5319 szCell = (int*)&apCell[mxCellPerPage]; 5320 if( pPage->leaf ){ 5321 /* The table is completely empty */ 5322 TRACE(("BALANCE: empty table %d\n", pPage->pgno)); 5323 }else{ 5324 /* The root page is empty but has one child. Transfer the 5325 ** information from that one child into the root page if it 5326 ** will fit. This reduces the depth of the tree by one. 5327 ** 5328 ** If the root page is page 1, it has less space available than 5329 ** its child (due to the 100 byte header that occurs at the beginning 5330 ** of the database fle), so it might not be able to hold all of the 5331 ** information currently contained in the child. If this is the 5332 ** case, then do not do the transfer. Leave page 1 empty except 5333 ** for the right-pointer to the child page. The child page becomes 5334 ** the virtual root of the tree. 5335 */ 5336 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]); 5337 assert( pgnoChild>0 ); 5338 assert( pgnoChild<=sqlite3PagerPagecount(pPage->pBt->pPager) ); 5339 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0); 5340 if( rc ) goto end_shallow_balance; 5341 if( pPage->pgno==1 ){ 5342 rc = sqlite3BtreeInitPage(pChild, pPage); 5343 if( rc ) goto end_shallow_balance; 5344 assert( pChild->nOverflow==0 ); 5345 if( pChild->nFree>=100 ){ 5346 /* The child information will fit on the root page, so do the 5347 ** copy */ 5348 int i; 5349 zeroPage(pPage, pChild->aData[0]); 5350 for(i=0; i<pChild->nCell; i++){ 5351 apCell[i] = findCell(pChild,i); 5352 szCell[i] = cellSizePtr(pChild, apCell[i]); 5353 } 5354 assemblePage(pPage, pChild->nCell, apCell, szCell); 5355 /* Copy the right-pointer of the child to the parent. */ 5356 put4byte(&pPage->aData[pPage->hdrOffset+8], 5357 get4byte(&pChild->aData[pChild->hdrOffset+8])); 5358 freePage(pChild); 5359 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno)); 5360 }else{ 5361 /* The child has more information that will fit on the root. 5362 ** The tree is already balanced. Do nothing. */ 5363 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno)); 5364 } 5365 }else{ 5366 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize); 5367 pPage->isInit = 0; 5368 pPage->pParent = 0; 5369 rc = sqlite3BtreeInitPage(pPage, 0); 5370 assert( rc==SQLITE_OK ); 5371 freePage(pChild); 5372 TRACE(("BALANCE: transfer child %d into root %d\n", 5373 pChild->pgno, pPage->pgno)); 5374 } 5375 rc = reparentChildPages(pPage); 5376 assert( pPage->nOverflow==0 ); 5377 #ifndef SQLITE_OMIT_AUTOVACUUM 5378 if( pBt->autoVacuum ){ 5379 int i; 5380 for(i=0; i<pPage->nCell; i++){ 5381 rc = ptrmapPutOvfl(pPage, i); 5382 if( rc!=SQLITE_OK ){ 5383 goto end_shallow_balance; 5384 } 5385 } 5386 } 5387 #endif 5388 releasePage(pChild); 5389 } 5390 end_shallow_balance: 5391 sqlite3_free(apCell); 5392 return rc; 5393 } 5394 5395 5396 /* 5397 ** The root page is overfull 5398 ** 5399 ** When this happens, Create a new child page and copy the 5400 ** contents of the root into the child. Then make the root 5401 ** page an empty page with rightChild pointing to the new 5402 ** child. Finally, call balance_internal() on the new child 5403 ** to cause it to split. 5404 */ 5405 static int balance_deeper(MemPage *pPage){ 5406 int rc; /* Return value from subprocedures */ 5407 MemPage *pChild; /* Pointer to a new child page */ 5408 Pgno pgnoChild; /* Page number of the new child page */ 5409 BtShared *pBt; /* The BTree */ 5410 int usableSize; /* Total usable size of a page */ 5411 u8 *data; /* Content of the parent page */ 5412 u8 *cdata; /* Content of the child page */ 5413 int hdr; /* Offset to page header in parent */ 5414 int brk; /* Offset to content of first cell in parent */ 5415 5416 assert( pPage->pParent==0 ); 5417 assert( pPage->nOverflow>0 ); 5418 pBt = pPage->pBt; 5419 assert( sqlite3_mutex_held(pBt->mutex) ); 5420 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0); 5421 if( rc ) return rc; 5422 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 5423 usableSize = pBt->usableSize; 5424 data = pPage->aData; 5425 hdr = pPage->hdrOffset; 5426 brk = get2byte(&data[hdr+5]); 5427 cdata = pChild->aData; 5428 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr); 5429 memcpy(&cdata[brk], &data[brk], usableSize-brk); 5430 assert( pChild->isInit==0 ); 5431 rc = sqlite3BtreeInitPage(pChild, pPage); 5432 if( rc ) goto balancedeeper_out; 5433 memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0])); 5434 pChild->nOverflow = pPage->nOverflow; 5435 if( pChild->nOverflow ){ 5436 pChild->nFree = 0; 5437 } 5438 assert( pChild->nCell==pPage->nCell ); 5439 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF); 5440 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild); 5441 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno)); 5442 #ifndef SQLITE_OMIT_AUTOVACUUM 5443 if( pBt->autoVacuum ){ 5444 int i; 5445 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno); 5446 if( rc ) goto balancedeeper_out; 5447 for(i=0; i<pChild->nCell; i++){ 5448 rc = ptrmapPutOvfl(pChild, i); 5449 if( rc!=SQLITE_OK ){ 5450 return rc; 5451 } 5452 } 5453 } 5454 #endif 5455 rc = balance_nonroot(pChild); 5456 5457 balancedeeper_out: 5458 releasePage(pChild); 5459 return rc; 5460 } 5461 5462 /* 5463 ** Decide if the page pPage needs to be balanced. If balancing is 5464 ** required, call the appropriate balancing routine. 5465 */ 5466 static int balance(MemPage *pPage, int insert){ 5467 int rc = SQLITE_OK; 5468 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5469 if( pPage->pParent==0 ){ 5470 rc = sqlite3PagerWrite(pPage->pDbPage); 5471 if( rc==SQLITE_OK && pPage->nOverflow>0 ){ 5472 rc = balance_deeper(pPage); 5473 } 5474 if( rc==SQLITE_OK && pPage->nCell==0 ){ 5475 rc = balance_shallower(pPage); 5476 } 5477 }else{ 5478 if( pPage->nOverflow>0 || 5479 (!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){ 5480 rc = balance_nonroot(pPage); 5481 } 5482 } 5483 return rc; 5484 } 5485 5486 /* 5487 ** This routine checks all cursors that point to table pgnoRoot. 5488 ** If any of those cursors were opened with wrFlag==0 in a different 5489 ** database connection (a database connection that shares the pager 5490 ** cache with the current connection) and that other connection 5491 ** is not in the ReadUncommmitted state, then this routine returns 5492 ** SQLITE_LOCKED. 5493 ** 5494 ** In addition to checking for read-locks (where a read-lock 5495 ** means a cursor opened with wrFlag==0) this routine also moves 5496 ** all write cursors so that they are pointing to the 5497 ** first Cell on the root page. This is necessary because an insert 5498 ** or delete might change the number of cells on a page or delete 5499 ** a page entirely and we do not want to leave any cursors 5500 ** pointing to non-existant pages or cells. 5501 */ 5502 static int checkReadLocks(Btree *pBtree, Pgno pgnoRoot, BtCursor *pExclude){ 5503 BtCursor *p; 5504 BtShared *pBt = pBtree->pBt; 5505 sqlite3 *db = pBtree->pSqlite; 5506 assert( sqlite3BtreeHoldsMutex(pBtree) ); 5507 for(p=pBt->pCursor; p; p=p->pNext){ 5508 if( p==pExclude ) continue; 5509 if( p->eState!=CURSOR_VALID ) continue; 5510 if( p->pgnoRoot!=pgnoRoot ) continue; 5511 if( p->wrFlag==0 ){ 5512 sqlite3 *dbOther = p->pBtree->pSqlite; 5513 if( dbOther==0 || 5514 (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){ 5515 return SQLITE_LOCKED; 5516 } 5517 }else if( p->pPage->pgno!=p->pgnoRoot ){ 5518 moveToRoot(p); 5519 } 5520 } 5521 return SQLITE_OK; 5522 } 5523 5524 /* 5525 ** Insert a new record into the BTree. The key is given by (pKey,nKey) 5526 ** and the data is given by (pData,nData). The cursor is used only to 5527 ** define what table the record should be inserted into. The cursor 5528 ** is left pointing at a random location. 5529 ** 5530 ** For an INTKEY table, only the nKey value of the key is used. pKey is 5531 ** ignored. For a ZERODATA table, the pData and nData are both ignored. 5532 */ 5533 int sqlite3BtreeInsert( 5534 BtCursor *pCur, /* Insert data into the table of this cursor */ 5535 const void *pKey, i64 nKey, /* The key of the new record */ 5536 const void *pData, int nData, /* The data of the new record */ 5537 int nZero, /* Number of extra 0 bytes to append to data */ 5538 int appendBias /* True if this is likely an append */ 5539 ){ 5540 int rc; 5541 int loc; 5542 int szNew; 5543 MemPage *pPage; 5544 Btree *p = pCur->pBtree; 5545 BtShared *pBt = p->pBt; 5546 unsigned char *oldCell; 5547 unsigned char *newCell = 0; 5548 5549 assert( cursorHoldsMutex(pCur) ); 5550 if( pBt->inTransaction!=TRANS_WRITE ){ 5551 /* Must start a transaction before doing an insert */ 5552 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 5553 return rc; 5554 } 5555 assert( !pBt->readOnly ); 5556 if( !pCur->wrFlag ){ 5557 return SQLITE_PERM; /* Cursor not open for writing */ 5558 } 5559 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){ 5560 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 5561 } 5562 if( pCur->eState==CURSOR_FAULT ){ 5563 return pCur->skip; 5564 } 5565 5566 /* Save the positions of any other cursors open on this table */ 5567 clearCursorPosition(pCur); 5568 if( 5569 SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) || 5570 SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc)) 5571 ){ 5572 return rc; 5573 } 5574 5575 pPage = pCur->pPage; 5576 assert( pPage->intKey || nKey>=0 ); 5577 assert( pPage->leaf || !pPage->leafData ); 5578 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", 5579 pCur->pgnoRoot, nKey, nData, pPage->pgno, 5580 loc==0 ? "overwrite" : "new entry")); 5581 assert( pPage->isInit ); 5582 newCell = sqlite3_malloc( MX_CELL_SIZE(pBt) ); 5583 if( newCell==0 ) return SQLITE_NOMEM; 5584 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); 5585 if( rc ) goto end_insert; 5586 assert( szNew==cellSizePtr(pPage, newCell) ); 5587 assert( szNew<=MX_CELL_SIZE(pBt) ); 5588 if( loc==0 && CURSOR_VALID==pCur->eState ){ 5589 int szOld; 5590 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 5591 rc = sqlite3PagerWrite(pPage->pDbPage); 5592 if( rc ){ 5593 goto end_insert; 5594 } 5595 oldCell = findCell(pPage, pCur->idx); 5596 if( !pPage->leaf ){ 5597 memcpy(newCell, oldCell, 4); 5598 } 5599 szOld = cellSizePtr(pPage, oldCell); 5600 rc = clearCell(pPage, oldCell); 5601 if( rc ) goto end_insert; 5602 dropCell(pPage, pCur->idx, szOld); 5603 }else if( loc<0 && pPage->nCell>0 ){ 5604 assert( pPage->leaf ); 5605 pCur->idx++; 5606 pCur->info.nSize = 0; 5607 }else{ 5608 assert( pPage->leaf ); 5609 } 5610 rc = insertCell(pPage, pCur->idx, newCell, szNew, 0, 0); 5611 if( rc!=SQLITE_OK ) goto end_insert; 5612 rc = balance(pPage, 1); 5613 /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */ 5614 /* fflush(stdout); */ 5615 if( rc==SQLITE_OK ){ 5616 moveToRoot(pCur); 5617 } 5618 end_insert: 5619 sqlite3_free(newCell); 5620 return rc; 5621 } 5622 5623 /* 5624 ** Delete the entry that the cursor is pointing to. The cursor 5625 ** is left pointing at a random location. 5626 */ 5627 int sqlite3BtreeDelete(BtCursor *pCur){ 5628 MemPage *pPage = pCur->pPage; 5629 unsigned char *pCell; 5630 int rc; 5631 Pgno pgnoChild = 0; 5632 Btree *p = pCur->pBtree; 5633 BtShared *pBt = p->pBt; 5634 5635 assert( cursorHoldsMutex(pCur) ); 5636 assert( pPage->isInit ); 5637 if( pBt->inTransaction!=TRANS_WRITE ){ 5638 /* Must start a transaction before doing a delete */ 5639 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 5640 return rc; 5641 } 5642 assert( !pBt->readOnly ); 5643 if( pCur->eState==CURSOR_FAULT ){ 5644 return pCur->skip; 5645 } 5646 if( pCur->idx >= pPage->nCell ){ 5647 return SQLITE_ERROR; /* The cursor is not pointing to anything */ 5648 } 5649 if( !pCur->wrFlag ){ 5650 return SQLITE_PERM; /* Did not open this cursor for writing */ 5651 } 5652 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){ 5653 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 5654 } 5655 5656 /* Restore the current cursor position (a no-op if the cursor is not in 5657 ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors 5658 ** open on the same table. Then call sqlite3PagerWrite() on the page 5659 ** that the entry will be deleted from. 5660 */ 5661 if( 5662 (rc = restoreOrClearCursorPosition(pCur))!=0 || 5663 (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 || 5664 (rc = sqlite3PagerWrite(pPage->pDbPage))!=0 5665 ){ 5666 return rc; 5667 } 5668 5669 /* Locate the cell within it's page and leave pCell pointing to the 5670 ** data. The clearCell() call frees any overflow pages associated with the 5671 ** cell. The cell itself is still intact. 5672 */ 5673 pCell = findCell(pPage, pCur->idx); 5674 if( !pPage->leaf ){ 5675 pgnoChild = get4byte(pCell); 5676 } 5677 rc = clearCell(pPage, pCell); 5678 if( rc ){ 5679 return rc; 5680 } 5681 5682 if( !pPage->leaf ){ 5683 /* 5684 ** The entry we are about to delete is not a leaf so if we do not 5685 ** do something we will leave a hole on an internal page. 5686 ** We have to fill the hole by moving in a cell from a leaf. The 5687 ** next Cell after the one to be deleted is guaranteed to exist and 5688 ** to be a leaf so we can use it. 5689 */ 5690 BtCursor leafCur; 5691 unsigned char *pNext; 5692 int szNext; /* The compiler warning is wrong: szNext is always 5693 ** initialized before use. Adding an extra initialization 5694 ** to silence the compiler slows down the code. */ 5695 int notUsed; 5696 unsigned char *tempCell = 0; 5697 assert( !pPage->leafData ); 5698 sqlite3BtreeGetTempCursor(pCur, &leafCur); 5699 rc = sqlite3BtreeNext(&leafCur, ¬Used); 5700 if( rc==SQLITE_OK ){ 5701 rc = sqlite3PagerWrite(leafCur.pPage->pDbPage); 5702 } 5703 if( rc==SQLITE_OK ){ 5704 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n", 5705 pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno)); 5706 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell)); 5707 pNext = findCell(leafCur.pPage, leafCur.idx); 5708 szNext = cellSizePtr(leafCur.pPage, pNext); 5709 assert( MX_CELL_SIZE(pBt)>=szNext+4 ); 5710 tempCell = sqlite3_malloc( MX_CELL_SIZE(pBt) ); 5711 if( tempCell==0 ){ 5712 rc = SQLITE_NOMEM; 5713 } 5714 } 5715 if( rc==SQLITE_OK ){ 5716 rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0); 5717 } 5718 if( rc==SQLITE_OK ){ 5719 put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild); 5720 rc = balance(pPage, 0); 5721 } 5722 if( rc==SQLITE_OK ){ 5723 dropCell(leafCur.pPage, leafCur.idx, szNext); 5724 rc = balance(leafCur.pPage, 0); 5725 } 5726 sqlite3_free(tempCell); 5727 sqlite3BtreeReleaseTempCursor(&leafCur); 5728 }else{ 5729 TRACE(("DELETE: table=%d delete from leaf %d\n", 5730 pCur->pgnoRoot, pPage->pgno)); 5731 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell)); 5732 rc = balance(pPage, 0); 5733 } 5734 if( rc==SQLITE_OK ){ 5735 moveToRoot(pCur); 5736 } 5737 return rc; 5738 } 5739 5740 /* 5741 ** Create a new BTree table. Write into *piTable the page 5742 ** number for the root page of the new table. 5743 ** 5744 ** The type of type is determined by the flags parameter. Only the 5745 ** following values of flags are currently in use. Other values for 5746 ** flags might not work: 5747 ** 5748 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 5749 ** BTREE_ZERODATA Used for SQL indices 5750 */ 5751 static int btreeCreateTable(Btree *p, int *piTable, int flags){ 5752 BtShared *pBt = p->pBt; 5753 MemPage *pRoot; 5754 Pgno pgnoRoot; 5755 int rc; 5756 5757 assert( sqlite3BtreeHoldsMutex(p) ); 5758 if( pBt->inTransaction!=TRANS_WRITE ){ 5759 /* Must start a transaction first */ 5760 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 5761 return rc; 5762 } 5763 assert( !pBt->readOnly ); 5764 5765 #ifdef SQLITE_OMIT_AUTOVACUUM 5766 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 5767 if( rc ){ 5768 return rc; 5769 } 5770 #else 5771 if( pBt->autoVacuum ){ 5772 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 5773 MemPage *pPageMove; /* The page to move to. */ 5774 5775 /* Creating a new table may probably require moving an existing database 5776 ** to make room for the new tables root page. In case this page turns 5777 ** out to be an overflow page, delete all overflow page-map caches 5778 ** held by open cursors. 5779 */ 5780 invalidateAllOverflowCache(pBt); 5781 5782 /* Read the value of meta[3] from the database to determine where the 5783 ** root page of the new table should go. meta[3] is the largest root-page 5784 ** created so far, so the new root-page is (meta[3]+1). 5785 */ 5786 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot); 5787 if( rc!=SQLITE_OK ){ 5788 return rc; 5789 } 5790 pgnoRoot++; 5791 5792 /* The new root-page may not be allocated on a pointer-map page, or the 5793 ** PENDING_BYTE page. 5794 */ 5795 if( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 5796 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 5797 pgnoRoot++; 5798 } 5799 assert( pgnoRoot>=3 ); 5800 5801 /* Allocate a page. The page that currently resides at pgnoRoot will 5802 ** be moved to the allocated page (unless the allocated page happens 5803 ** to reside at pgnoRoot). 5804 */ 5805 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1); 5806 if( rc!=SQLITE_OK ){ 5807 return rc; 5808 } 5809 5810 if( pgnoMove!=pgnoRoot ){ 5811 /* pgnoRoot is the page that will be used for the root-page of 5812 ** the new table (assuming an error did not occur). But we were 5813 ** allocated pgnoMove. If required (i.e. if it was not allocated 5814 ** by extending the file), the current page at position pgnoMove 5815 ** is already journaled. 5816 */ 5817 u8 eType; 5818 Pgno iPtrPage; 5819 5820 releasePage(pPageMove); 5821 5822 /* Move the page currently at pgnoRoot to pgnoMove. */ 5823 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); 5824 if( rc!=SQLITE_OK ){ 5825 return rc; 5826 } 5827 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 5828 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 5829 releasePage(pRoot); 5830 return rc; 5831 } 5832 assert( eType!=PTRMAP_ROOTPAGE ); 5833 assert( eType!=PTRMAP_FREEPAGE ); 5834 rc = sqlite3PagerWrite(pRoot->pDbPage); 5835 if( rc!=SQLITE_OK ){ 5836 releasePage(pRoot); 5837 return rc; 5838 } 5839 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove); 5840 releasePage(pRoot); 5841 5842 /* Obtain the page at pgnoRoot */ 5843 if( rc!=SQLITE_OK ){ 5844 return rc; 5845 } 5846 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); 5847 if( rc!=SQLITE_OK ){ 5848 return rc; 5849 } 5850 rc = sqlite3PagerWrite(pRoot->pDbPage); 5851 if( rc!=SQLITE_OK ){ 5852 releasePage(pRoot); 5853 return rc; 5854 } 5855 }else{ 5856 pRoot = pPageMove; 5857 } 5858 5859 /* Update the pointer-map and meta-data with the new root-page number. */ 5860 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0); 5861 if( rc ){ 5862 releasePage(pRoot); 5863 return rc; 5864 } 5865 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 5866 if( rc ){ 5867 releasePage(pRoot); 5868 return rc; 5869 } 5870 5871 }else{ 5872 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 5873 if( rc ) return rc; 5874 } 5875 #endif 5876 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 5877 zeroPage(pRoot, flags | PTF_LEAF); 5878 sqlite3PagerUnref(pRoot->pDbPage); 5879 *piTable = (int)pgnoRoot; 5880 return SQLITE_OK; 5881 } 5882 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ 5883 int rc; 5884 sqlite3BtreeEnter(p); 5885 rc = btreeCreateTable(p, piTable, flags); 5886 sqlite3BtreeLeave(p); 5887 return rc; 5888 } 5889 5890 /* 5891 ** Erase the given database page and all its children. Return 5892 ** the page to the freelist. 5893 */ 5894 static int clearDatabasePage( 5895 BtShared *pBt, /* The BTree that contains the table */ 5896 Pgno pgno, /* Page number to clear */ 5897 MemPage *pParent, /* Parent page. NULL for the root */ 5898 int freePageFlag /* Deallocate page if true */ 5899 ){ 5900 MemPage *pPage = 0; 5901 int rc; 5902 unsigned char *pCell; 5903 int i; 5904 5905 assert( sqlite3_mutex_held(pBt->mutex) ); 5906 if( pgno>sqlite3PagerPagecount(pBt->pPager) ){ 5907 return SQLITE_CORRUPT_BKPT; 5908 } 5909 5910 rc = getAndInitPage(pBt, pgno, &pPage, pParent); 5911 if( rc ) goto cleardatabasepage_out; 5912 for(i=0; i<pPage->nCell; i++){ 5913 pCell = findCell(pPage, i); 5914 if( !pPage->leaf ){ 5915 rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1); 5916 if( rc ) goto cleardatabasepage_out; 5917 } 5918 rc = clearCell(pPage, pCell); 5919 if( rc ) goto cleardatabasepage_out; 5920 } 5921 if( !pPage->leaf ){ 5922 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1); 5923 if( rc ) goto cleardatabasepage_out; 5924 } 5925 if( freePageFlag ){ 5926 rc = freePage(pPage); 5927 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 5928 zeroPage(pPage, pPage->aData[0] | PTF_LEAF); 5929 } 5930 5931 cleardatabasepage_out: 5932 releasePage(pPage); 5933 return rc; 5934 } 5935 5936 /* 5937 ** Delete all information from a single table in the database. iTable is 5938 ** the page number of the root of the table. After this routine returns, 5939 ** the root page is empty, but still exists. 5940 ** 5941 ** This routine will fail with SQLITE_LOCKED if there are any open 5942 ** read cursors on the table. Open write cursors are moved to the 5943 ** root of the table. 5944 */ 5945 int sqlite3BtreeClearTable(Btree *p, int iTable){ 5946 int rc; 5947 BtShared *pBt = p->pBt; 5948 sqlite3BtreeEnter(p); 5949 if( p->inTrans!=TRANS_WRITE ){ 5950 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 5951 }else if( (rc = checkReadLocks(p, iTable, 0))!=SQLITE_OK ){ 5952 /* nothing to do */ 5953 }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){ 5954 /* nothing to do */ 5955 }else{ 5956 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0); 5957 } 5958 sqlite3BtreeLeave(p); 5959 return rc; 5960 } 5961 5962 /* 5963 ** Erase all information in a table and add the root of the table to 5964 ** the freelist. Except, the root of the principle table (the one on 5965 ** page 1) is never added to the freelist. 5966 ** 5967 ** This routine will fail with SQLITE_LOCKED if there are any open 5968 ** cursors on the table. 5969 ** 5970 ** If AUTOVACUUM is enabled and the page at iTable is not the last 5971 ** root page in the database file, then the last root page 5972 ** in the database file is moved into the slot formerly occupied by 5973 ** iTable and that last slot formerly occupied by the last root page 5974 ** is added to the freelist instead of iTable. In this say, all 5975 ** root pages are kept at the beginning of the database file, which 5976 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 5977 ** page number that used to be the last root page in the file before 5978 ** the move. If no page gets moved, *piMoved is set to 0. 5979 ** The last root page is recorded in meta[3] and the value of 5980 ** meta[3] is updated by this procedure. 5981 */ 5982 static int btreeDropTable(Btree *p, int iTable, int *piMoved){ 5983 int rc; 5984 MemPage *pPage = 0; 5985 BtShared *pBt = p->pBt; 5986 5987 assert( sqlite3BtreeHoldsMutex(p) ); 5988 if( p->inTrans!=TRANS_WRITE ){ 5989 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 5990 } 5991 5992 /* It is illegal to drop a table if any cursors are open on the 5993 ** database. This is because in auto-vacuum mode the backend may 5994 ** need to move another root-page to fill a gap left by the deleted 5995 ** root page. If an open cursor was using this page a problem would 5996 ** occur. 5997 */ 5998 if( pBt->pCursor ){ 5999 return SQLITE_LOCKED; 6000 } 6001 6002 rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 6003 if( rc ) return rc; 6004 rc = sqlite3BtreeClearTable(p, iTable); 6005 if( rc ){ 6006 releasePage(pPage); 6007 return rc; 6008 } 6009 6010 *piMoved = 0; 6011 6012 if( iTable>1 ){ 6013 #ifdef SQLITE_OMIT_AUTOVACUUM 6014 rc = freePage(pPage); 6015 releasePage(pPage); 6016 #else 6017 if( pBt->autoVacuum ){ 6018 Pgno maxRootPgno; 6019 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno); 6020 if( rc!=SQLITE_OK ){ 6021 releasePage(pPage); 6022 return rc; 6023 } 6024 6025 if( iTable==maxRootPgno ){ 6026 /* If the table being dropped is the table with the largest root-page 6027 ** number in the database, put the root page on the free list. 6028 */ 6029 rc = freePage(pPage); 6030 releasePage(pPage); 6031 if( rc!=SQLITE_OK ){ 6032 return rc; 6033 } 6034 }else{ 6035 /* The table being dropped does not have the largest root-page 6036 ** number in the database. So move the page that does into the 6037 ** gap left by the deleted root-page. 6038 */ 6039 MemPage *pMove; 6040 releasePage(pPage); 6041 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); 6042 if( rc!=SQLITE_OK ){ 6043 return rc; 6044 } 6045 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable); 6046 releasePage(pMove); 6047 if( rc!=SQLITE_OK ){ 6048 return rc; 6049 } 6050 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); 6051 if( rc!=SQLITE_OK ){ 6052 return rc; 6053 } 6054 rc = freePage(pMove); 6055 releasePage(pMove); 6056 if( rc!=SQLITE_OK ){ 6057 return rc; 6058 } 6059 *piMoved = maxRootPgno; 6060 } 6061 6062 /* Set the new 'max-root-page' value in the database header. This 6063 ** is the old value less one, less one more if that happens to 6064 ** be a root-page number, less one again if that is the 6065 ** PENDING_BYTE_PAGE. 6066 */ 6067 maxRootPgno--; 6068 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){ 6069 maxRootPgno--; 6070 } 6071 if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){ 6072 maxRootPgno--; 6073 } 6074 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 6075 6076 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 6077 }else{ 6078 rc = freePage(pPage); 6079 releasePage(pPage); 6080 } 6081 #endif 6082 }else{ 6083 /* If sqlite3BtreeDropTable was called on page 1. */ 6084 zeroPage(pPage, PTF_INTKEY|PTF_LEAF ); 6085 releasePage(pPage); 6086 } 6087 return rc; 6088 } 6089 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 6090 int rc; 6091 sqlite3BtreeEnter(p); 6092 rc = btreeDropTable(p, iTable, piMoved); 6093 sqlite3BtreeLeave(p); 6094 return rc; 6095 } 6096 6097 6098 /* 6099 ** Read the meta-information out of a database file. Meta[0] 6100 ** is the number of free pages currently in the database. Meta[1] 6101 ** through meta[15] are available for use by higher layers. Meta[0] 6102 ** is read-only, the others are read/write. 6103 ** 6104 ** The schema layer numbers meta values differently. At the schema 6105 ** layer (and the SetCookie and ReadCookie opcodes) the number of 6106 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 6107 */ 6108 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 6109 DbPage *pDbPage; 6110 int rc; 6111 unsigned char *pP1; 6112 BtShared *pBt = p->pBt; 6113 6114 sqlite3BtreeEnter(p); 6115 6116 /* Reading a meta-data value requires a read-lock on page 1 (and hence 6117 ** the sqlite_master table. We grab this lock regardless of whether or 6118 ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page 6119 ** 1 is treated as a special case by queryTableLock() and lockTable()). 6120 */ 6121 rc = queryTableLock(p, 1, READ_LOCK); 6122 if( rc!=SQLITE_OK ){ 6123 sqlite3BtreeLeave(p); 6124 return rc; 6125 } 6126 6127 assert( idx>=0 && idx<=15 ); 6128 rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage); 6129 if( rc ){ 6130 sqlite3BtreeLeave(p); 6131 return rc; 6132 } 6133 pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage); 6134 *pMeta = get4byte(&pP1[36 + idx*4]); 6135 sqlite3PagerUnref(pDbPage); 6136 6137 /* If autovacuumed is disabled in this build but we are trying to 6138 ** access an autovacuumed database, then make the database readonly. 6139 */ 6140 #ifdef SQLITE_OMIT_AUTOVACUUM 6141 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1; 6142 #endif 6143 6144 /* Grab the read-lock on page 1. */ 6145 rc = lockTable(p, 1, READ_LOCK); 6146 sqlite3BtreeLeave(p); 6147 return rc; 6148 } 6149 6150 /* 6151 ** Write meta-information back into the database. Meta[0] is 6152 ** read-only and may not be written. 6153 */ 6154 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 6155 BtShared *pBt = p->pBt; 6156 unsigned char *pP1; 6157 int rc; 6158 assert( idx>=1 && idx<=15 ); 6159 sqlite3BtreeEnter(p); 6160 if( p->inTrans!=TRANS_WRITE ){ 6161 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 6162 }else{ 6163 assert( pBt->pPage1!=0 ); 6164 pP1 = pBt->pPage1->aData; 6165 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 6166 if( rc==SQLITE_OK ){ 6167 put4byte(&pP1[36 + idx*4], iMeta); 6168 #ifndef SQLITE_OMIT_AUTOVACUUM 6169 if( idx==7 ){ 6170 assert( pBt->autoVacuum || iMeta==0 ); 6171 assert( iMeta==0 || iMeta==1 ); 6172 pBt->incrVacuum = iMeta; 6173 } 6174 #endif 6175 } 6176 } 6177 sqlite3BtreeLeave(p); 6178 return rc; 6179 } 6180 6181 /* 6182 ** Return the flag byte at the beginning of the page that the cursor 6183 ** is currently pointing to. 6184 */ 6185 int sqlite3BtreeFlags(BtCursor *pCur){ 6186 /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call 6187 ** restoreOrClearCursorPosition() here. 6188 */ 6189 MemPage *pPage = pCur->pPage; 6190 assert( cursorHoldsMutex(pCur) ); 6191 assert( pPage->pBt==pCur->pBt ); 6192 return pPage ? pPage->aData[pPage->hdrOffset] : 0; 6193 } 6194 6195 6196 /* 6197 ** Return the pager associated with a BTree. This routine is used for 6198 ** testing and debugging only. 6199 */ 6200 Pager *sqlite3BtreePager(Btree *p){ 6201 return p->pBt->pPager; 6202 } 6203 6204 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6205 /* 6206 ** Append a message to the error message string. 6207 */ 6208 static void checkAppendMsg( 6209 IntegrityCk *pCheck, 6210 char *zMsg1, 6211 const char *zFormat, 6212 ... 6213 ){ 6214 va_list ap; 6215 char *zMsg2; 6216 if( !pCheck->mxErr ) return; 6217 pCheck->mxErr--; 6218 pCheck->nErr++; 6219 va_start(ap, zFormat); 6220 zMsg2 = sqlite3VMPrintf(0, zFormat, ap); 6221 va_end(ap); 6222 if( zMsg1==0 ) zMsg1 = ""; 6223 if( pCheck->zErrMsg ){ 6224 char *zOld = pCheck->zErrMsg; 6225 pCheck->zErrMsg = 0; 6226 sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0); 6227 sqlite3_free(zOld); 6228 }else{ 6229 sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0); 6230 } 6231 sqlite3_free(zMsg2); 6232 } 6233 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6234 6235 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6236 /* 6237 ** Add 1 to the reference count for page iPage. If this is the second 6238 ** reference to the page, add an error message to pCheck->zErrMsg. 6239 ** Return 1 if there are 2 ore more references to the page and 0 if 6240 ** if this is the first reference to the page. 6241 ** 6242 ** Also check that the page number is in bounds. 6243 */ 6244 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){ 6245 if( iPage==0 ) return 1; 6246 if( iPage>pCheck->nPage || iPage<0 ){ 6247 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage); 6248 return 1; 6249 } 6250 if( pCheck->anRef[iPage]==1 ){ 6251 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage); 6252 return 1; 6253 } 6254 return (pCheck->anRef[iPage]++)>1; 6255 } 6256 6257 #ifndef SQLITE_OMIT_AUTOVACUUM 6258 /* 6259 ** Check that the entry in the pointer-map for page iChild maps to 6260 ** page iParent, pointer type ptrType. If not, append an error message 6261 ** to pCheck. 6262 */ 6263 static void checkPtrmap( 6264 IntegrityCk *pCheck, /* Integrity check context */ 6265 Pgno iChild, /* Child page number */ 6266 u8 eType, /* Expected pointer map type */ 6267 Pgno iParent, /* Expected pointer map parent page number */ 6268 char *zContext /* Context description (used for error msg) */ 6269 ){ 6270 int rc; 6271 u8 ePtrmapType; 6272 Pgno iPtrmapParent; 6273 6274 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 6275 if( rc!=SQLITE_OK ){ 6276 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild); 6277 return; 6278 } 6279 6280 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 6281 checkAppendMsg(pCheck, zContext, 6282 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 6283 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 6284 } 6285 } 6286 #endif 6287 6288 /* 6289 ** Check the integrity of the freelist or of an overflow page list. 6290 ** Verify that the number of pages on the list is N. 6291 */ 6292 static void checkList( 6293 IntegrityCk *pCheck, /* Integrity checking context */ 6294 int isFreeList, /* True for a freelist. False for overflow page list */ 6295 int iPage, /* Page number for first page in the list */ 6296 int N, /* Expected number of pages in the list */ 6297 char *zContext /* Context for error messages */ 6298 ){ 6299 int i; 6300 int expected = N; 6301 int iFirst = iPage; 6302 while( N-- > 0 && pCheck->mxErr ){ 6303 DbPage *pOvflPage; 6304 unsigned char *pOvflData; 6305 if( iPage<1 ){ 6306 checkAppendMsg(pCheck, zContext, 6307 "%d of %d pages missing from overflow list starting at %d", 6308 N+1, expected, iFirst); 6309 break; 6310 } 6311 if( checkRef(pCheck, iPage, zContext) ) break; 6312 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){ 6313 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage); 6314 break; 6315 } 6316 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 6317 if( isFreeList ){ 6318 int n = get4byte(&pOvflData[4]); 6319 #ifndef SQLITE_OMIT_AUTOVACUUM 6320 if( pCheck->pBt->autoVacuum ){ 6321 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext); 6322 } 6323 #endif 6324 if( n>pCheck->pBt->usableSize/4-8 ){ 6325 checkAppendMsg(pCheck, zContext, 6326 "freelist leaf count too big on page %d", iPage); 6327 N--; 6328 }else{ 6329 for(i=0; i<n; i++){ 6330 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 6331 #ifndef SQLITE_OMIT_AUTOVACUUM 6332 if( pCheck->pBt->autoVacuum ){ 6333 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext); 6334 } 6335 #endif 6336 checkRef(pCheck, iFreePage, zContext); 6337 } 6338 N -= n; 6339 } 6340 } 6341 #ifndef SQLITE_OMIT_AUTOVACUUM 6342 else{ 6343 /* If this database supports auto-vacuum and iPage is not the last 6344 ** page in this overflow list, check that the pointer-map entry for 6345 ** the following page matches iPage. 6346 */ 6347 if( pCheck->pBt->autoVacuum && N>0 ){ 6348 i = get4byte(pOvflData); 6349 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext); 6350 } 6351 } 6352 #endif 6353 iPage = get4byte(pOvflData); 6354 sqlite3PagerUnref(pOvflPage); 6355 } 6356 } 6357 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6358 6359 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6360 /* 6361 ** Do various sanity checks on a single page of a tree. Return 6362 ** the tree depth. Root pages return 0. Parents of root pages 6363 ** return 1, and so forth. 6364 ** 6365 ** These checks are done: 6366 ** 6367 ** 1. Make sure that cells and freeblocks do not overlap 6368 ** but combine to completely cover the page. 6369 ** NO 2. Make sure cell keys are in order. 6370 ** NO 3. Make sure no key is less than or equal to zLowerBound. 6371 ** NO 4. Make sure no key is greater than or equal to zUpperBound. 6372 ** 5. Check the integrity of overflow pages. 6373 ** 6. Recursively call checkTreePage on all children. 6374 ** 7. Verify that the depth of all children is the same. 6375 ** 8. Make sure this page is at least 33% full or else it is 6376 ** the root of the tree. 6377 */ 6378 static int checkTreePage( 6379 IntegrityCk *pCheck, /* Context for the sanity check */ 6380 int iPage, /* Page number of the page to check */ 6381 MemPage *pParent, /* Parent page */ 6382 char *zParentContext /* Parent context */ 6383 ){ 6384 MemPage *pPage; 6385 int i, rc, depth, d2, pgno, cnt; 6386 int hdr, cellStart; 6387 int nCell; 6388 u8 *data; 6389 BtShared *pBt; 6390 int usableSize; 6391 char zContext[100]; 6392 char *hit; 6393 6394 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage); 6395 6396 /* Check that the page exists 6397 */ 6398 pBt = pCheck->pBt; 6399 usableSize = pBt->usableSize; 6400 if( iPage==0 ) return 0; 6401 if( checkRef(pCheck, iPage, zParentContext) ) return 0; 6402 if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ 6403 checkAppendMsg(pCheck, zContext, 6404 "unable to get the page. error code=%d", rc); 6405 return 0; 6406 } 6407 if( (rc = sqlite3BtreeInitPage(pPage, pParent))!=0 ){ 6408 checkAppendMsg(pCheck, zContext, 6409 "sqlite3BtreeInitPage() returns error code %d", rc); 6410 releasePage(pPage); 6411 return 0; 6412 } 6413 6414 /* Check out all the cells. 6415 */ 6416 depth = 0; 6417 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){ 6418 u8 *pCell; 6419 int sz; 6420 CellInfo info; 6421 6422 /* Check payload overflow pages 6423 */ 6424 sqlite3_snprintf(sizeof(zContext), zContext, 6425 "On tree page %d cell %d: ", iPage, i); 6426 pCell = findCell(pPage,i); 6427 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 6428 sz = info.nData; 6429 if( !pPage->intKey ) sz += info.nKey; 6430 assert( sz==info.nPayload ); 6431 if( sz>info.nLocal ){ 6432 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); 6433 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); 6434 #ifndef SQLITE_OMIT_AUTOVACUUM 6435 if( pBt->autoVacuum ){ 6436 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext); 6437 } 6438 #endif 6439 checkList(pCheck, 0, pgnoOvfl, nPage, zContext); 6440 } 6441 6442 /* Check sanity of left child page. 6443 */ 6444 if( !pPage->leaf ){ 6445 pgno = get4byte(pCell); 6446 #ifndef SQLITE_OMIT_AUTOVACUUM 6447 if( pBt->autoVacuum ){ 6448 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); 6449 } 6450 #endif 6451 d2 = checkTreePage(pCheck,pgno,pPage,zContext); 6452 if( i>0 && d2!=depth ){ 6453 checkAppendMsg(pCheck, zContext, "Child page depth differs"); 6454 } 6455 depth = d2; 6456 } 6457 } 6458 if( !pPage->leaf ){ 6459 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 6460 sqlite3_snprintf(sizeof(zContext), zContext, 6461 "On page %d at right child: ", iPage); 6462 #ifndef SQLITE_OMIT_AUTOVACUUM 6463 if( pBt->autoVacuum ){ 6464 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0); 6465 } 6466 #endif 6467 checkTreePage(pCheck, pgno, pPage, zContext); 6468 } 6469 6470 /* Check for complete coverage of the page 6471 */ 6472 data = pPage->aData; 6473 hdr = pPage->hdrOffset; 6474 hit = sqlite3MallocZero( usableSize ); 6475 if( hit ){ 6476 memset(hit, 1, get2byte(&data[hdr+5])); 6477 nCell = get2byte(&data[hdr+3]); 6478 cellStart = hdr + 12 - 4*pPage->leaf; 6479 for(i=0; i<nCell; i++){ 6480 int pc = get2byte(&data[cellStart+i*2]); 6481 int size = cellSizePtr(pPage, &data[pc]); 6482 int j; 6483 if( (pc+size-1)>=usableSize || pc<0 ){ 6484 checkAppendMsg(pCheck, 0, 6485 "Corruption detected in cell %d on page %d",i,iPage,0); 6486 }else{ 6487 for(j=pc+size-1; j>=pc; j--) hit[j]++; 6488 } 6489 } 6490 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000; 6491 cnt++){ 6492 int size = get2byte(&data[i+2]); 6493 int j; 6494 if( (i+size-1)>=usableSize || i<0 ){ 6495 checkAppendMsg(pCheck, 0, 6496 "Corruption detected in cell %d on page %d",i,iPage,0); 6497 }else{ 6498 for(j=i+size-1; j>=i; j--) hit[j]++; 6499 } 6500 i = get2byte(&data[i]); 6501 } 6502 for(i=cnt=0; i<usableSize; i++){ 6503 if( hit[i]==0 ){ 6504 cnt++; 6505 }else if( hit[i]>1 ){ 6506 checkAppendMsg(pCheck, 0, 6507 "Multiple uses for byte %d of page %d", i, iPage); 6508 break; 6509 } 6510 } 6511 if( cnt!=data[hdr+7] ){ 6512 checkAppendMsg(pCheck, 0, 6513 "Fragmented space is %d byte reported as %d on page %d", 6514 cnt, data[hdr+7], iPage); 6515 } 6516 } 6517 sqlite3_free(hit); 6518 6519 releasePage(pPage); 6520 return depth+1; 6521 } 6522 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6523 6524 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6525 /* 6526 ** This routine does a complete check of the given BTree file. aRoot[] is 6527 ** an array of pages numbers were each page number is the root page of 6528 ** a table. nRoot is the number of entries in aRoot. 6529 ** 6530 ** If everything checks out, this routine returns NULL. If something is 6531 ** amiss, an error message is written into memory obtained from malloc() 6532 ** and a pointer to that error message is returned. The calling function 6533 ** is responsible for freeing the error message when it is done. 6534 */ 6535 char *sqlite3BtreeIntegrityCheck( 6536 Btree *p, /* The btree to be checked */ 6537 int *aRoot, /* An array of root pages numbers for individual trees */ 6538 int nRoot, /* Number of entries in aRoot[] */ 6539 int mxErr, /* Stop reporting errors after this many */ 6540 int *pnErr /* Write number of errors seen to this variable */ 6541 ){ 6542 int i; 6543 int nRef; 6544 IntegrityCk sCheck; 6545 BtShared *pBt = p->pBt; 6546 6547 sqlite3BtreeEnter(p); 6548 nRef = sqlite3PagerRefcount(pBt->pPager); 6549 if( lockBtreeWithRetry(p)!=SQLITE_OK ){ 6550 sqlite3BtreeLeave(p); 6551 return sqlite3StrDup("Unable to acquire a read lock on the database"); 6552 } 6553 sCheck.pBt = pBt; 6554 sCheck.pPager = pBt->pPager; 6555 sCheck.nPage = sqlite3PagerPagecount(sCheck.pPager); 6556 sCheck.mxErr = mxErr; 6557 sCheck.nErr = 0; 6558 *pnErr = 0; 6559 #ifndef SQLITE_OMIT_AUTOVACUUM 6560 if( pBt->nTrunc!=0 ){ 6561 sCheck.nPage = pBt->nTrunc; 6562 } 6563 #endif 6564 if( sCheck.nPage==0 ){ 6565 unlockBtreeIfUnused(pBt); 6566 sqlite3BtreeLeave(p); 6567 return 0; 6568 } 6569 sCheck.anRef = sqlite3_malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); 6570 if( !sCheck.anRef ){ 6571 unlockBtreeIfUnused(pBt); 6572 *pnErr = 1; 6573 sqlite3BtreeLeave(p); 6574 return sqlite3MPrintf(p->pSqlite, "Unable to malloc %d bytes", 6575 (sCheck.nPage+1)*sizeof(sCheck.anRef[0])); 6576 } 6577 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } 6578 i = PENDING_BYTE_PAGE(pBt); 6579 if( i<=sCheck.nPage ){ 6580 sCheck.anRef[i] = 1; 6581 } 6582 sCheck.zErrMsg = 0; 6583 6584 /* Check the integrity of the freelist 6585 */ 6586 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 6587 get4byte(&pBt->pPage1->aData[36]), "Main freelist: "); 6588 6589 /* Check all the tables. 6590 */ 6591 for(i=0; i<nRoot && sCheck.mxErr; i++){ 6592 if( aRoot[i]==0 ) continue; 6593 #ifndef SQLITE_OMIT_AUTOVACUUM 6594 if( pBt->autoVacuum && aRoot[i]>1 ){ 6595 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0); 6596 } 6597 #endif 6598 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: "); 6599 } 6600 6601 /* Make sure every page in the file is referenced 6602 */ 6603 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ 6604 #ifdef SQLITE_OMIT_AUTOVACUUM 6605 if( sCheck.anRef[i]==0 ){ 6606 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 6607 } 6608 #else 6609 /* If the database supports auto-vacuum, make sure no tables contain 6610 ** references to pointer-map pages. 6611 */ 6612 if( sCheck.anRef[i]==0 && 6613 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 6614 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 6615 } 6616 if( sCheck.anRef[i]!=0 && 6617 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 6618 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i); 6619 } 6620 #endif 6621 } 6622 6623 /* Make sure this analysis did not leave any unref() pages 6624 */ 6625 unlockBtreeIfUnused(pBt); 6626 if( nRef != sqlite3PagerRefcount(pBt->pPager) ){ 6627 checkAppendMsg(&sCheck, 0, 6628 "Outstanding page count goes from %d to %d during this analysis", 6629 nRef, sqlite3PagerRefcount(pBt->pPager) 6630 ); 6631 } 6632 6633 /* Clean up and report errors. 6634 */ 6635 sqlite3BtreeLeave(p); 6636 sqlite3_free(sCheck.anRef); 6637 *pnErr = sCheck.nErr; 6638 return sCheck.zErrMsg; 6639 } 6640 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6641 6642 /* 6643 ** Return the full pathname of the underlying database file. 6644 ** 6645 ** The pager filename is invariant as long as the pager is 6646 ** open so it is safe to access without the BtShared mutex. 6647 */ 6648 const char *sqlite3BtreeGetFilename(Btree *p){ 6649 assert( p->pBt->pPager!=0 ); 6650 return sqlite3PagerFilename(p->pBt->pPager); 6651 } 6652 6653 /* 6654 ** Return the pathname of the directory that contains the database file. 6655 ** 6656 ** The pager directory name is invariant as long as the pager is 6657 ** open so it is safe to access without the BtShared mutex. 6658 */ 6659 const char *sqlite3BtreeGetDirname(Btree *p){ 6660 assert( p->pBt->pPager!=0 ); 6661 return sqlite3PagerDirname(p->pBt->pPager); 6662 } 6663 6664 /* 6665 ** Return the pathname of the journal file for this database. The return 6666 ** value of this routine is the same regardless of whether the journal file 6667 ** has been created or not. 6668 ** 6669 ** The pager journal filename is invariant as long as the pager is 6670 ** open so it is safe to access without the BtShared mutex. 6671 */ 6672 const char *sqlite3BtreeGetJournalname(Btree *p){ 6673 assert( p->pBt->pPager!=0 ); 6674 return sqlite3PagerJournalname(p->pBt->pPager); 6675 } 6676 6677 #ifndef SQLITE_OMIT_VACUUM 6678 /* 6679 ** Copy the complete content of pBtFrom into pBtTo. A transaction 6680 ** must be active for both files. 6681 ** 6682 ** The size of file pBtFrom may be reduced by this operation. 6683 ** If anything goes wrong, the transaction on pBtFrom is rolled back. 6684 */ 6685 static int btreeCopyFile(Btree *pTo, Btree *pFrom){ 6686 int rc = SQLITE_OK; 6687 Pgno i, nPage, nToPage, iSkip; 6688 6689 BtShared *pBtTo = pTo->pBt; 6690 BtShared *pBtFrom = pFrom->pBt; 6691 6692 if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){ 6693 return SQLITE_ERROR; 6694 } 6695 if( pBtTo->pCursor ) return SQLITE_BUSY; 6696 nToPage = sqlite3PagerPagecount(pBtTo->pPager); 6697 nPage = sqlite3PagerPagecount(pBtFrom->pPager); 6698 iSkip = PENDING_BYTE_PAGE(pBtTo); 6699 for(i=1; rc==SQLITE_OK && i<=nPage; i++){ 6700 DbPage *pDbPage; 6701 if( i==iSkip ) continue; 6702 rc = sqlite3PagerGet(pBtFrom->pPager, i, &pDbPage); 6703 if( rc ) break; 6704 rc = sqlite3PagerOverwrite(pBtTo->pPager, i, sqlite3PagerGetData(pDbPage)); 6705 sqlite3PagerUnref(pDbPage); 6706 } 6707 6708 /* If the file is shrinking, journal the pages that are being truncated 6709 ** so that they can be rolled back if the commit fails. 6710 */ 6711 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){ 6712 DbPage *pDbPage; 6713 if( i==iSkip ) continue; 6714 rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage); 6715 if( rc ) break; 6716 rc = sqlite3PagerWrite(pDbPage); 6717 sqlite3PagerDontWrite(pDbPage); 6718 /* Yeah. It seems wierd to call DontWrite() right after Write(). But 6719 ** that is because the names of those procedures do not exactly 6720 ** represent what they do. Write() really means "put this page in the 6721 ** rollback journal and mark it as dirty so that it will be written 6722 ** to the database file later." DontWrite() undoes the second part of 6723 ** that and prevents the page from being written to the database. The 6724 ** page is still on the rollback journal, though. And that is the whole 6725 ** point of this loop: to put pages on the rollback journal. */ 6726 sqlite3PagerUnref(pDbPage); 6727 } 6728 if( !rc && nPage<nToPage ){ 6729 rc = sqlite3PagerTruncate(pBtTo->pPager, nPage); 6730 } 6731 6732 if( rc ){ 6733 sqlite3BtreeRollback(pTo); 6734 } 6735 return rc; 6736 } 6737 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){ 6738 int rc; 6739 sqlite3BtreeEnter(pTo); 6740 sqlite3BtreeEnter(pFrom); 6741 rc = btreeCopyFile(pTo, pFrom); 6742 sqlite3BtreeLeave(pFrom); 6743 sqlite3BtreeLeave(pTo); 6744 return rc; 6745 } 6746 6747 #endif /* SQLITE_OMIT_VACUUM */ 6748 6749 /* 6750 ** Return non-zero if a transaction is active. 6751 */ 6752 int sqlite3BtreeIsInTrans(Btree *p){ 6753 assert( p==0 || sqlite3_mutex_held(p->pSqlite->mutex) ); 6754 return (p && (p->inTrans==TRANS_WRITE)); 6755 } 6756 6757 /* 6758 ** Return non-zero if a statement transaction is active. 6759 */ 6760 int sqlite3BtreeIsInStmt(Btree *p){ 6761 assert( sqlite3BtreeHoldsMutex(p) ); 6762 return (p->pBt && p->pBt->inStmt); 6763 } 6764 6765 /* 6766 ** Return non-zero if a read (or write) transaction is active. 6767 */ 6768 int sqlite3BtreeIsInReadTrans(Btree *p){ 6769 assert( sqlite3_mutex_held(p->pSqlite->mutex) ); 6770 return (p && (p->inTrans!=TRANS_NONE)); 6771 } 6772 6773 /* 6774 ** This function returns a pointer to a blob of memory associated with 6775 ** a single shared-btree. The memory is used by client code for it's own 6776 ** purposes (for example, to store a high-level schema associated with 6777 ** the shared-btree). The btree layer manages reference counting issues. 6778 ** 6779 ** The first time this is called on a shared-btree, nBytes bytes of memory 6780 ** are allocated, zeroed, and returned to the caller. For each subsequent 6781 ** call the nBytes parameter is ignored and a pointer to the same blob 6782 ** of memory returned. 6783 ** 6784 ** Just before the shared-btree is closed, the function passed as the 6785 ** xFree argument when the memory allocation was made is invoked on the 6786 ** blob of allocated memory. This function should not call sqlite3_free() 6787 ** on the memory, the btree layer does that. 6788 */ 6789 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 6790 BtShared *pBt = p->pBt; 6791 sqlite3BtreeEnter(p); 6792 if( !pBt->pSchema ){ 6793 pBt->pSchema = sqlite3MallocZero(nBytes); 6794 pBt->xFreeSchema = xFree; 6795 } 6796 sqlite3BtreeLeave(p); 6797 return pBt->pSchema; 6798 } 6799 6800 /* 6801 ** Return true if another user of the same shared btree as the argument 6802 ** handle holds an exclusive lock on the sqlite_master table. 6803 */ 6804 int sqlite3BtreeSchemaLocked(Btree *p){ 6805 int rc; 6806 assert( sqlite3_mutex_held(p->pSqlite->mutex) ); 6807 sqlite3BtreeEnter(p); 6808 rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK); 6809 sqlite3BtreeLeave(p); 6810 return rc; 6811 } 6812 6813 6814 #ifndef SQLITE_OMIT_SHARED_CACHE 6815 /* 6816 ** Obtain a lock on the table whose root page is iTab. The 6817 ** lock is a write lock if isWritelock is true or a read lock 6818 ** if it is false. 6819 */ 6820 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 6821 int rc = SQLITE_OK; 6822 u8 lockType = (isWriteLock?WRITE_LOCK:READ_LOCK); 6823 sqlite3BtreeEnter(p); 6824 rc = queryTableLock(p, iTab, lockType); 6825 if( rc==SQLITE_OK ){ 6826 rc = lockTable(p, iTab, lockType); 6827 } 6828 sqlite3BtreeLeave(p); 6829 return rc; 6830 } 6831 #endif 6832 6833 #ifndef SQLITE_OMIT_INCRBLOB 6834 /* 6835 ** Argument pCsr must be a cursor opened for writing on an 6836 ** INTKEY table currently pointing at a valid table entry. 6837 ** This function modifies the data stored as part of that entry. 6838 ** Only the data content may only be modified, it is not possible 6839 ** to change the length of the data stored. 6840 */ 6841 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 6842 assert( cursorHoldsMutex(pCsr) ); 6843 assert( sqlite3_mutex_held(pCsr->pBtree->pSqlite->mutex) ); 6844 assert(pCsr->isIncrblobHandle); 6845 if( pCsr->eState>=CURSOR_REQUIRESEEK ){ 6846 if( pCsr->eState==CURSOR_FAULT ){ 6847 return pCsr->skip; 6848 }else{ 6849 return SQLITE_ABORT; 6850 } 6851 } 6852 6853 /* Check some preconditions: 6854 ** (a) the cursor is open for writing, 6855 ** (b) there is no read-lock on the table being modified and 6856 ** (c) the cursor points at a valid row of an intKey table. 6857 */ 6858 if( !pCsr->wrFlag ){ 6859 return SQLITE_READONLY; 6860 } 6861 assert( !pCsr->pBt->readOnly 6862 && pCsr->pBt->inTransaction==TRANS_WRITE ); 6863 if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr) ){ 6864 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 6865 } 6866 if( pCsr->eState==CURSOR_INVALID || !pCsr->pPage->intKey ){ 6867 return SQLITE_ERROR; 6868 } 6869 6870 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1); 6871 } 6872 6873 /* 6874 ** Set a flag on this cursor to cache the locations of pages from the 6875 ** overflow list for the current row. This is used by cursors opened 6876 ** for incremental blob IO only. 6877 ** 6878 ** This function sets a flag only. The actual page location cache 6879 ** (stored in BtCursor.aOverflow[]) is allocated and used by function 6880 ** accessPayload() (the worker function for sqlite3BtreeData() and 6881 ** sqlite3BtreePutData()). 6882 */ 6883 void sqlite3BtreeCacheOverflow(BtCursor *pCur){ 6884 assert( cursorHoldsMutex(pCur) ); 6885 assert( sqlite3_mutex_held(pCur->pBtree->pSqlite->mutex) ); 6886 assert(!pCur->isIncrblobHandle); 6887 assert(!pCur->aOverflow); 6888 pCur->isIncrblobHandle = 1; 6889 } 6890 #endif 6891