1 /* 2 ** 2004 April 6 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** $Id: btree.c,v 1.492 2008/07/28 19:34:53 drh Exp $ 13 ** 14 ** This file implements a external (disk-based) database using BTrees. 15 ** See the header comment on "btreeInt.h" for additional information. 16 ** Including a description of file format and an overview of operation. 17 */ 18 #include "btreeInt.h" 19 20 /* 21 ** The header string that appears at the beginning of every 22 ** SQLite database. 23 */ 24 static const char zMagicHeader[] = SQLITE_FILE_HEADER; 25 26 /* 27 ** Set this global variable to 1 to enable tracing using the TRACE 28 ** macro. 29 */ 30 #if 0 31 int sqlite3BtreeTrace=0; /* True to enable tracing */ 32 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} 33 #else 34 # define TRACE(X) 35 #endif 36 37 38 39 #ifndef SQLITE_OMIT_SHARED_CACHE 40 /* 41 ** A flag to indicate whether or not shared cache is enabled. Also, 42 ** a list of BtShared objects that are eligible for participation 43 ** in shared cache. The variables have file scope during normal builds, 44 ** but the test harness needs to access these variables so we make them 45 ** global for test builds. 46 */ 47 #ifdef SQLITE_TEST 48 BtShared *sqlite3SharedCacheList = 0; 49 int sqlite3SharedCacheEnabled = 0; 50 #else 51 static BtShared *sqlite3SharedCacheList = 0; 52 static int sqlite3SharedCacheEnabled = 0; 53 #endif 54 #endif /* SQLITE_OMIT_SHARED_CACHE */ 55 56 #ifndef SQLITE_OMIT_SHARED_CACHE 57 /* 58 ** Enable or disable the shared pager and schema features. 59 ** 60 ** This routine has no effect on existing database connections. 
61 ** The shared cache setting effects only future calls to 62 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). 63 */ 64 int sqlite3_enable_shared_cache(int enable){ 65 sqlite3SharedCacheEnabled = enable; 66 return SQLITE_OK; 67 } 68 #endif 69 70 71 /* 72 ** Forward declaration 73 */ 74 static int checkReadLocks(Btree*, Pgno, BtCursor*, i64); 75 76 77 #ifdef SQLITE_OMIT_SHARED_CACHE 78 /* 79 ** The functions queryTableLock(), lockTable() and unlockAllTables() 80 ** manipulate entries in the BtShared.pLock linked list used to store 81 ** shared-cache table level locks. If the library is compiled with the 82 ** shared-cache feature disabled, then there is only ever one user 83 ** of each BtShared structure and so this locking is not necessary. 84 ** So define the lock related functions as no-ops. 85 */ 86 #define queryTableLock(a,b,c) SQLITE_OK 87 #define lockTable(a,b,c) SQLITE_OK 88 #define unlockAllTables(a) 89 #endif 90 91 #ifndef SQLITE_OMIT_SHARED_CACHE 92 /* 93 ** Query to see if btree handle p may obtain a lock of type eLock 94 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return 95 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or 96 ** SQLITE_LOCKED if not. 97 */ 98 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){ 99 BtShared *pBt = p->pBt; 100 BtLock *pIter; 101 102 assert( sqlite3BtreeHoldsMutex(p) ); 103 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 104 assert( p->db!=0 ); 105 106 /* This is a no-op if the shared-cache is not enabled */ 107 if( !p->sharable ){ 108 return SQLITE_OK; 109 } 110 111 /* If some other connection is holding an exclusive lock, the 112 ** requested lock may not be obtained. 113 */ 114 if( pBt->pExclusive && pBt->pExclusive!=p ){ 115 return SQLITE_LOCKED; 116 } 117 118 /* This (along with lockTable()) is where the ReadUncommitted flag is 119 ** dealt with. 
If the caller is querying for a read-lock and the flag is 120 ** set, it is unconditionally granted - even if there are write-locks 121 ** on the table. If a write-lock is requested, the ReadUncommitted flag 122 ** is not considered. 123 ** 124 ** In function lockTable(), if a read-lock is demanded and the 125 ** ReadUncommitted flag is set, no entry is added to the locks list 126 ** (BtShared.pLock). 127 ** 128 ** To summarize: If the ReadUncommitted flag is set, then read cursors do 129 ** not create or respect table locks. The locking procedure for a 130 ** write-cursor does not change. 131 */ 132 if( 133 0==(p->db->flags&SQLITE_ReadUncommitted) || 134 eLock==WRITE_LOCK || 135 iTab==MASTER_ROOT 136 ){ 137 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 138 if( pIter->pBtree!=p && pIter->iTable==iTab && 139 (pIter->eLock!=eLock || eLock!=READ_LOCK) ){ 140 return SQLITE_LOCKED; 141 } 142 } 143 } 144 return SQLITE_OK; 145 } 146 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 147 148 #ifndef SQLITE_OMIT_SHARED_CACHE 149 /* 150 ** Add a lock on the table with root-page iTable to the shared-btree used 151 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 152 ** WRITE_LOCK. 153 ** 154 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and 155 ** SQLITE_NOMEM may also be returned. 156 */ 157 static int lockTable(Btree *p, Pgno iTable, u8 eLock){ 158 BtShared *pBt = p->pBt; 159 BtLock *pLock = 0; 160 BtLock *pIter; 161 162 assert( sqlite3BtreeHoldsMutex(p) ); 163 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 164 assert( p->db!=0 ); 165 166 /* This is a no-op if the shared-cache is not enabled */ 167 if( !p->sharable ){ 168 return SQLITE_OK; 169 } 170 171 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) ); 172 173 /* If the read-uncommitted flag is set and a read-lock is requested, 174 ** return early without adding an entry to the BtShared.pLock list. 
See 175 ** comment in function queryTableLock() for more info on handling 176 ** the ReadUncommitted flag. 177 */ 178 if( 179 (p->db->flags&SQLITE_ReadUncommitted) && 180 (eLock==READ_LOCK) && 181 iTable!=MASTER_ROOT 182 ){ 183 return SQLITE_OK; 184 } 185 186 /* First search the list for an existing lock on this table. */ 187 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 188 if( pIter->iTable==iTable && pIter->pBtree==p ){ 189 pLock = pIter; 190 break; 191 } 192 } 193 194 /* If the above search did not find a BtLock struct associating Btree p 195 ** with table iTable, allocate one and link it into the list. 196 */ 197 if( !pLock ){ 198 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 199 if( !pLock ){ 200 return SQLITE_NOMEM; 201 } 202 pLock->iTable = iTable; 203 pLock->pBtree = p; 204 pLock->pNext = pBt->pLock; 205 pBt->pLock = pLock; 206 } 207 208 /* Set the BtLock.eLock variable to the maximum of the current lock 209 ** and the requested lock. This means if a write-lock was already held 210 ** and a read-lock requested, we don't incorrectly downgrade the lock. 211 */ 212 assert( WRITE_LOCK>READ_LOCK ); 213 if( eLock>pLock->eLock ){ 214 pLock->eLock = eLock; 215 } 216 217 return SQLITE_OK; 218 } 219 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 220 221 #ifndef SQLITE_OMIT_SHARED_CACHE 222 /* 223 ** Release all the table locks (locks obtained via calls to the lockTable() 224 ** procedure) held by Btree handle p. 
225 */ 226 static void unlockAllTables(Btree *p){ 227 BtShared *pBt = p->pBt; 228 BtLock **ppIter = &pBt->pLock; 229 230 assert( sqlite3BtreeHoldsMutex(p) ); 231 assert( p->sharable || 0==*ppIter ); 232 233 while( *ppIter ){ 234 BtLock *pLock = *ppIter; 235 assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree ); 236 if( pLock->pBtree==p ){ 237 *ppIter = pLock->pNext; 238 sqlite3_free(pLock); 239 }else{ 240 ppIter = &pLock->pNext; 241 } 242 } 243 244 if( pBt->pExclusive==p ){ 245 pBt->pExclusive = 0; 246 } 247 } 248 #endif /* SQLITE_OMIT_SHARED_CACHE */ 249 250 static void releasePage(MemPage *pPage); /* Forward reference */ 251 252 /* 253 ** Verify that the cursor holds a mutex on the BtShared 254 */ 255 #ifndef NDEBUG 256 static int cursorHoldsMutex(BtCursor *p){ 257 return sqlite3_mutex_held(p->pBt->mutex); 258 } 259 #endif 260 261 262 #ifndef SQLITE_OMIT_INCRBLOB 263 /* 264 ** Invalidate the overflow page-list cache for cursor pCur, if any. 265 */ 266 static void invalidateOverflowCache(BtCursor *pCur){ 267 assert( cursorHoldsMutex(pCur) ); 268 sqlite3_free(pCur->aOverflow); 269 pCur->aOverflow = 0; 270 } 271 272 /* 273 ** Invalidate the overflow page-list cache for all cursors opened 274 ** on the shared btree structure pBt. 275 */ 276 static void invalidateAllOverflowCache(BtShared *pBt){ 277 BtCursor *p; 278 assert( sqlite3_mutex_held(pBt->mutex) ); 279 for(p=pBt->pCursor; p; p=p->pNext){ 280 invalidateOverflowCache(p); 281 } 282 } 283 #else 284 #define invalidateOverflowCache(x) 285 #define invalidateAllOverflowCache(x) 286 #endif 287 288 /* 289 ** Save the current cursor position in the variables BtCursor.nKey 290 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 
291 */ 292 static int saveCursorPosition(BtCursor *pCur){ 293 int rc; 294 295 assert( CURSOR_VALID==pCur->eState ); 296 assert( 0==pCur->pKey ); 297 assert( cursorHoldsMutex(pCur) ); 298 299 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey); 300 301 /* If this is an intKey table, then the above call to BtreeKeySize() 302 ** stores the integer key in pCur->nKey. In this case this value is 303 ** all that is required. Otherwise, if pCur is not open on an intKey 304 ** table, then malloc space for and store the pCur->nKey bytes of key 305 ** data. 306 */ 307 if( rc==SQLITE_OK && 0==pCur->pPage->intKey){ 308 void *pKey = sqlite3Malloc(pCur->nKey); 309 if( pKey ){ 310 rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey); 311 if( rc==SQLITE_OK ){ 312 pCur->pKey = pKey; 313 }else{ 314 sqlite3_free(pKey); 315 } 316 }else{ 317 rc = SQLITE_NOMEM; 318 } 319 } 320 assert( !pCur->pPage->intKey || !pCur->pKey ); 321 322 if( rc==SQLITE_OK ){ 323 releasePage(pCur->pPage); 324 pCur->pPage = 0; 325 pCur->eState = CURSOR_REQUIRESEEK; 326 } 327 328 invalidateOverflowCache(pCur); 329 return rc; 330 } 331 332 /* 333 ** Save the positions of all cursors except pExcept open on the table 334 ** with root-page iRoot. Usually, this is called just before cursor 335 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()). 336 */ 337 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 338 BtCursor *p; 339 assert( sqlite3_mutex_held(pBt->mutex) ); 340 assert( pExcept==0 || pExcept->pBt==pBt ); 341 for(p=pBt->pCursor; p; p=p->pNext){ 342 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 343 p->eState==CURSOR_VALID ){ 344 int rc = saveCursorPosition(p); 345 if( SQLITE_OK!=rc ){ 346 return rc; 347 } 348 } 349 } 350 return SQLITE_OK; 351 } 352 353 /* 354 ** Clear the current cursor position. 
355 */ 356 static void clearCursorPosition(BtCursor *pCur){ 357 assert( cursorHoldsMutex(pCur) ); 358 sqlite3_free(pCur->pKey); 359 pCur->pKey = 0; 360 pCur->eState = CURSOR_INVALID; 361 } 362 363 /* 364 ** Restore the cursor to the position it was in (or as close to as possible) 365 ** when saveCursorPosition() was called. Note that this call deletes the 366 ** saved position info stored by saveCursorPosition(), so there can be 367 ** at most one effective restoreCursorPosition() call after each 368 ** saveCursorPosition(). 369 */ 370 int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){ 371 int rc; 372 assert( cursorHoldsMutex(pCur) ); 373 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 374 if( pCur->eState==CURSOR_FAULT ){ 375 return pCur->skip; 376 } 377 pCur->eState = CURSOR_INVALID; 378 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, 0, pCur->nKey, 0, &pCur->skip); 379 if( rc==SQLITE_OK ){ 380 sqlite3_free(pCur->pKey); 381 pCur->pKey = 0; 382 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 383 } 384 return rc; 385 } 386 387 #define restoreCursorPosition(p) \ 388 (p->eState>=CURSOR_REQUIRESEEK ? \ 389 sqlite3BtreeRestoreCursorPosition(p) : \ 390 SQLITE_OK) 391 392 /* 393 ** Determine whether or not a cursor has moved from the position it 394 ** was last placed at. Cursor can move when the row they are pointing 395 ** at is deleted out from under them. 396 ** 397 ** This routine returns an error code if something goes wrong. The 398 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not. 
399 */ 400 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){ 401 int rc; 402 403 rc = restoreCursorPosition(pCur); 404 if( rc ){ 405 *pHasMoved = 1; 406 return rc; 407 } 408 if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){ 409 *pHasMoved = 1; 410 }else{ 411 *pHasMoved = 0; 412 } 413 return SQLITE_OK; 414 } 415 416 #ifndef SQLITE_OMIT_AUTOVACUUM 417 /* 418 ** Given a page number of a regular database page, return the page 419 ** number for the pointer-map page that contains the entry for the 420 ** input page number. 421 */ 422 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 423 int nPagesPerMapPage, iPtrMap, ret; 424 assert( sqlite3_mutex_held(pBt->mutex) ); 425 nPagesPerMapPage = (pBt->usableSize/5)+1; 426 iPtrMap = (pgno-2)/nPagesPerMapPage; 427 ret = (iPtrMap*nPagesPerMapPage) + 2; 428 if( ret==PENDING_BYTE_PAGE(pBt) ){ 429 ret++; 430 } 431 return ret; 432 } 433 434 /* 435 ** Write an entry into the pointer map. 436 ** 437 ** This routine updates the pointer map entry for page number 'key' 438 ** so that it maps to type 'eType' and parent page number 'pgno'. 439 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 
440 */ 441 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){ 442 DbPage *pDbPage; /* The pointer map page */ 443 u8 *pPtrmap; /* The pointer map data */ 444 Pgno iPtrmap; /* The pointer map page number */ 445 int offset; /* Offset in pointer map page */ 446 int rc; 447 448 assert( sqlite3_mutex_held(pBt->mutex) ); 449 /* The master-journal page number must never be used as a pointer map page */ 450 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 451 452 assert( pBt->autoVacuum ); 453 if( key==0 ){ 454 return SQLITE_CORRUPT_BKPT; 455 } 456 iPtrmap = PTRMAP_PAGENO(pBt, key); 457 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 458 if( rc!=SQLITE_OK ){ 459 return rc; 460 } 461 offset = PTRMAP_PTROFFSET(iPtrmap, key); 462 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 463 464 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 465 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); 466 rc = sqlite3PagerWrite(pDbPage); 467 if( rc==SQLITE_OK ){ 468 pPtrmap[offset] = eType; 469 put4byte(&pPtrmap[offset+1], parent); 470 } 471 } 472 473 sqlite3PagerUnref(pDbPage); 474 return rc; 475 } 476 477 /* 478 ** Read an entry from the pointer map. 479 ** 480 ** This routine retrieves the pointer map entry for page 'key', writing 481 ** the type and parent page number to *pEType and *pPgno respectively. 482 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 
483 */ 484 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 485 DbPage *pDbPage; /* The pointer map page */ 486 int iPtrmap; /* Pointer map page index */ 487 u8 *pPtrmap; /* Pointer map page data */ 488 int offset; /* Offset of entry in pointer map */ 489 int rc; 490 491 assert( sqlite3_mutex_held(pBt->mutex) ); 492 493 iPtrmap = PTRMAP_PAGENO(pBt, key); 494 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 495 if( rc!=0 ){ 496 return rc; 497 } 498 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 499 500 offset = PTRMAP_PTROFFSET(iPtrmap, key); 501 assert( pEType!=0 ); 502 *pEType = pPtrmap[offset]; 503 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 504 505 sqlite3PagerUnref(pDbPage); 506 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; 507 return SQLITE_OK; 508 } 509 510 #else /* if defined SQLITE_OMIT_AUTOVACUUM */ 511 #define ptrmapPut(w,x,y,z) SQLITE_OK 512 #define ptrmapGet(w,x,y,z) SQLITE_OK 513 #define ptrmapPutOvfl(y,z) SQLITE_OK 514 #endif 515 516 /* 517 ** Given a btree page and a cell index (0 means the first cell on 518 ** the page, 1 means the second cell, and so forth) return a pointer 519 ** to the cell content. 520 ** 521 ** This routine works only for pages that do not contain overflow cells. 522 */ 523 #define findCell(P,I) \ 524 ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)]))) 525 526 /* 527 ** This a more complex version of findCell() that works for 528 ** pages that do contain overflow cells. See insert 529 */ 530 static u8 *findOverflowCell(MemPage *pPage, int iCell){ 531 int i; 532 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 533 for(i=pPage->nOverflow-1; i>=0; i--){ 534 int k; 535 struct _OvflCell *pOvfl; 536 pOvfl = &pPage->aOvfl[i]; 537 k = pOvfl->idx; 538 if( k<=iCell ){ 539 if( k==iCell ){ 540 return pOvfl->pCell; 541 } 542 iCell--; 543 } 544 } 545 return findCell(pPage, iCell); 546 } 547 548 /* 549 ** Parse a cell content block and fill in the CellInfo structure. 
There 550 ** are two versions of this function. sqlite3BtreeParseCell() takes a 551 ** cell index as the second argument and sqlite3BtreeParseCellPtr() 552 ** takes a pointer to the body of the cell as its second argument. 553 ** 554 ** Within this file, the parseCell() macro can be called instead of 555 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster. 556 */ 557 void sqlite3BtreeParseCellPtr( 558 MemPage *pPage, /* Page containing the cell */ 559 u8 *pCell, /* Pointer to the cell text. */ 560 CellInfo *pInfo /* Fill in this structure */ 561 ){ 562 int n; /* Number bytes in cell content header */ 563 u32 nPayload; /* Number of bytes of cell payload */ 564 565 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 566 567 pInfo->pCell = pCell; 568 assert( pPage->leaf==0 || pPage->leaf==1 ); 569 n = pPage->childPtrSize; 570 assert( n==4-4*pPage->leaf ); 571 if( pPage->intKey ){ 572 if( pPage->hasData ){ 573 n += getVarint32(&pCell[n], nPayload); 574 }else{ 575 nPayload = 0; 576 } 577 n += getVarint(&pCell[n], (u64*)&pInfo->nKey); 578 pInfo->nData = nPayload; 579 }else{ 580 pInfo->nData = 0; 581 n += getVarint32(&pCell[n], nPayload); 582 pInfo->nKey = nPayload; 583 } 584 pInfo->nPayload = nPayload; 585 pInfo->nHeader = n; 586 if( likely(nPayload<=pPage->maxLocal) ){ 587 /* This is the (easy) common case where the entire payload fits 588 ** on the local page. No overflow is required. 589 */ 590 int nSize; /* Total size of cell content in bytes */ 591 nSize = nPayload + n; 592 pInfo->nLocal = nPayload; 593 pInfo->iOverflow = 0; 594 if( (nSize & ~3)==0 ){ 595 nSize = 4; /* Minimum cell size is 4 */ 596 } 597 pInfo->nSize = nSize; 598 }else{ 599 /* If the payload will not fit completely on the local page, we have 600 ** to decide how much to store locally and how much to spill onto 601 ** overflow pages. 
The strategy is to minimize the amount of unused 602 ** space on overflow pages while keeping the amount of local storage 603 ** in between minLocal and maxLocal. 604 ** 605 ** Warning: changing the way overflow payload is distributed in any 606 ** way will result in an incompatible file format. 607 */ 608 int minLocal; /* Minimum amount of payload held locally */ 609 int maxLocal; /* Maximum amount of payload held locally */ 610 int surplus; /* Overflow payload available for local storage */ 611 612 minLocal = pPage->minLocal; 613 maxLocal = pPage->maxLocal; 614 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4); 615 if( surplus <= maxLocal ){ 616 pInfo->nLocal = surplus; 617 }else{ 618 pInfo->nLocal = minLocal; 619 } 620 pInfo->iOverflow = pInfo->nLocal + n; 621 pInfo->nSize = pInfo->iOverflow + 4; 622 } 623 } 624 #define parseCell(pPage, iCell, pInfo) \ 625 sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) 626 void sqlite3BtreeParseCell( 627 MemPage *pPage, /* Page containing the cell */ 628 int iCell, /* The cell index. First cell is 0 */ 629 CellInfo *pInfo /* Fill in this structure */ 630 ){ 631 parseCell(pPage, iCell, pInfo); 632 } 633 634 /* 635 ** Compute the total number of bytes that a Cell needs in the cell 636 ** data area of the btree-page. The return number includes the cell 637 ** data header and the local payload, but not any overflow page or 638 ** the space used by the cell pointer. 
639 */ 640 #ifndef NDEBUG 641 static u16 cellSize(MemPage *pPage, int iCell){ 642 CellInfo info; 643 sqlite3BtreeParseCell(pPage, iCell, &info); 644 return info.nSize; 645 } 646 #endif 647 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ 648 CellInfo info; 649 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 650 return info.nSize; 651 } 652 653 #ifndef SQLITE_OMIT_AUTOVACUUM 654 /* 655 ** If the cell pCell, part of page pPage contains a pointer 656 ** to an overflow page, insert an entry into the pointer-map 657 ** for the overflow page. 658 */ 659 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){ 660 CellInfo info; 661 assert( pCell!=0 ); 662 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 663 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); 664 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ 665 Pgno ovfl = get4byte(&pCell[info.iOverflow]); 666 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno); 667 } 668 return SQLITE_OK; 669 } 670 /* 671 ** If the cell with index iCell on page pPage contains a pointer 672 ** to an overflow page, insert an entry into the pointer-map 673 ** for the overflow page. 674 */ 675 static int ptrmapPutOvfl(MemPage *pPage, int iCell){ 676 u8 *pCell; 677 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 678 pCell = findOverflowCell(pPage, iCell); 679 return ptrmapPutOvflPtr(pPage, pCell); 680 } 681 #endif 682 683 684 /* 685 ** Defragment the page given. All Cells are moved to the 686 ** end of the page and all free space is collected into one 687 ** big FreeBlk that occurs in between the header and cell 688 ** pointer array and the cell content area. 
689 */ 690 static void defragmentPage(MemPage *pPage){ 691 int i; /* Loop counter */ 692 int pc; /* Address of a i-th cell */ 693 int addr; /* Offset of first byte after cell pointer array */ 694 int hdr; /* Offset to the page header */ 695 int size; /* Size of a cell */ 696 int usableSize; /* Number of usable bytes on a page */ 697 int cellOffset; /* Offset to the cell pointer array */ 698 int brk; /* Offset to the cell content area */ 699 int nCell; /* Number of cells on the page */ 700 unsigned char *data; /* The page data */ 701 unsigned char *temp; /* Temp area for cell content */ 702 703 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 704 assert( pPage->pBt!=0 ); 705 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); 706 assert( pPage->nOverflow==0 ); 707 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 708 temp = sqlite3PagerTempSpace(pPage->pBt->pPager); 709 data = pPage->aData; 710 hdr = pPage->hdrOffset; 711 cellOffset = pPage->cellOffset; 712 nCell = pPage->nCell; 713 assert( nCell==get2byte(&data[hdr+3]) ); 714 usableSize = pPage->pBt->usableSize; 715 brk = get2byte(&data[hdr+5]); 716 memcpy(&temp[brk], &data[brk], usableSize - brk); 717 brk = usableSize; 718 for(i=0; i<nCell; i++){ 719 u8 *pAddr; /* The i-th cell pointer */ 720 pAddr = &data[cellOffset + i*2]; 721 pc = get2byte(pAddr); 722 assert( pc<pPage->pBt->usableSize ); 723 size = cellSizePtr(pPage, &temp[pc]); 724 brk -= size; 725 memcpy(&data[brk], &temp[pc], size); 726 put2byte(pAddr, brk); 727 } 728 assert( brk>=cellOffset+2*nCell ); 729 put2byte(&data[hdr+5], brk); 730 data[hdr+1] = 0; 731 data[hdr+2] = 0; 732 data[hdr+7] = 0; 733 addr = cellOffset+2*nCell; 734 memset(&data[addr], 0, brk-addr); 735 } 736 737 /* 738 ** Allocate nByte bytes of space on a page. 739 ** 740 ** Return the index into pPage->aData[] of the first byte of 741 ** the new allocation. The caller guarantees that there is enough 742 ** space. This routine will never fail. 
743 ** 744 ** If the page contains nBytes of free space but does not contain 745 ** nBytes of contiguous free space, then this routine automatically 746 ** calls defragementPage() to consolidate all free space before 747 ** allocating the new chunk. 748 */ 749 static int allocateSpace(MemPage *pPage, int nByte){ 750 int addr, pc, hdr; 751 int size; 752 int nFrag; 753 int top; 754 int nCell; 755 int cellOffset; 756 unsigned char *data; 757 758 data = pPage->aData; 759 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 760 assert( pPage->pBt ); 761 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 762 assert( nByte>=0 ); /* Minimum cell size is 4 */ 763 assert( pPage->nFree>=nByte ); 764 assert( pPage->nOverflow==0 ); 765 pPage->nFree -= nByte; 766 hdr = pPage->hdrOffset; 767 768 nFrag = data[hdr+7]; 769 if( nFrag<60 ){ 770 /* Search the freelist looking for a slot big enough to satisfy the 771 ** space request. */ 772 addr = hdr+1; 773 while( (pc = get2byte(&data[addr]))>0 ){ 774 size = get2byte(&data[pc+2]); 775 if( size>=nByte ){ 776 if( size<nByte+4 ){ 777 memcpy(&data[addr], &data[pc], 2); 778 data[hdr+7] = nFrag + size - nByte; 779 return pc; 780 }else{ 781 put2byte(&data[pc+2], size-nByte); 782 return pc + size - nByte; 783 } 784 } 785 addr = pc; 786 } 787 } 788 789 /* Allocate memory from the gap in between the cell pointer array 790 ** and the cell content area. 791 */ 792 top = get2byte(&data[hdr+5]); 793 nCell = get2byte(&data[hdr+3]); 794 cellOffset = pPage->cellOffset; 795 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){ 796 defragmentPage(pPage); 797 top = get2byte(&data[hdr+5]); 798 } 799 top -= nByte; 800 assert( cellOffset + 2*nCell <= top ); 801 put2byte(&data[hdr+5], top); 802 return top; 803 } 804 805 /* 806 ** Return a section of the pPage->aData to the freelist. 807 ** The first byte of the new free block is pPage->aDisk[start] 808 ** and the size of the block is "size" bytes. 
809 ** 810 ** Most of the effort here is involved in coalesing adjacent 811 ** free blocks into a single big free block. 812 */ 813 static void freeSpace(MemPage *pPage, int start, int size){ 814 int addr, pbegin, hdr; 815 unsigned char *data = pPage->aData; 816 817 assert( pPage->pBt!=0 ); 818 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 819 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) ); 820 assert( (start + size)<=pPage->pBt->usableSize ); 821 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 822 assert( size>=0 ); /* Minimum cell size is 4 */ 823 824 #ifdef SQLITE_SECURE_DELETE 825 /* Overwrite deleted information with zeros when the SECURE_DELETE 826 ** option is enabled at compile-time */ 827 memset(&data[start], 0, size); 828 #endif 829 830 /* Add the space back into the linked list of freeblocks */ 831 hdr = pPage->hdrOffset; 832 addr = hdr + 1; 833 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){ 834 assert( pbegin<=pPage->pBt->usableSize-4 ); 835 assert( pbegin>addr ); 836 addr = pbegin; 837 } 838 assert( pbegin<=pPage->pBt->usableSize-4 ); 839 assert( pbegin>addr || pbegin==0 ); 840 put2byte(&data[addr], start); 841 put2byte(&data[start], pbegin); 842 put2byte(&data[start+2], size); 843 pPage->nFree += size; 844 845 /* Coalesce adjacent free blocks */ 846 addr = pPage->hdrOffset + 1; 847 while( (pbegin = get2byte(&data[addr]))>0 ){ 848 int pnext, psize; 849 assert( pbegin>addr ); 850 assert( pbegin<=pPage->pBt->usableSize-4 ); 851 pnext = get2byte(&data[pbegin]); 852 psize = get2byte(&data[pbegin+2]); 853 if( pbegin + psize + 3 >= pnext && pnext>0 ){ 854 int frag = pnext - (pbegin+psize); 855 assert( frag<=data[pPage->hdrOffset+7] ); 856 data[pPage->hdrOffset+7] -= frag; 857 put2byte(&data[pbegin], get2byte(&data[pnext])); 858 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin); 859 }else{ 860 addr = pbegin; 861 } 862 } 863 864 /* If the cell content area begins with a freeblock, remove it. 
*/ 865 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){ 866 int top; 867 pbegin = get2byte(&data[hdr+1]); 868 memcpy(&data[hdr+1], &data[pbegin], 2); 869 top = get2byte(&data[hdr+5]); 870 put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2])); 871 } 872 } 873 874 /* 875 ** Decode the flags byte (the first byte of the header) for a page 876 ** and initialize fields of the MemPage structure accordingly. 877 ** 878 ** Only the following combinations are supported. Anything different 879 ** indicates a corrupt database files: 880 ** 881 ** PTF_ZERODATA 882 ** PTF_ZERODATA | PTF_LEAF 883 ** PTF_LEAFDATA | PTF_INTKEY 884 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF 885 */ 886 static int decodeFlags(MemPage *pPage, int flagByte){ 887 BtShared *pBt; /* A copy of pPage->pBt */ 888 889 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) ); 890 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 891 pPage->leaf = flagByte>>3; assert( PTF_LEAF == 1<<3 ); 892 flagByte &= ~PTF_LEAF; 893 pPage->childPtrSize = 4-4*pPage->leaf; 894 pBt = pPage->pBt; 895 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ 896 pPage->intKey = 1; 897 pPage->hasData = pPage->leaf; 898 pPage->maxLocal = pBt->maxLeaf; 899 pPage->minLocal = pBt->minLeaf; 900 }else if( flagByte==PTF_ZERODATA ){ 901 pPage->intKey = 0; 902 pPage->hasData = 0; 903 pPage->maxLocal = pBt->maxLocal; 904 pPage->minLocal = pBt->minLocal; 905 }else{ 906 return SQLITE_CORRUPT_BKPT; 907 } 908 return SQLITE_OK; 909 } 910 911 /* 912 ** Initialize the auxiliary information for a disk block. 913 ** 914 ** The pParent parameter must be a pointer to the MemPage which 915 ** is the parent of the page being initialized. The root of a 916 ** BTree has no parent and so for that page, pParent==NULL. 917 ** 918 ** Return SQLITE_OK on success. If we see that the page does 919 ** not contain a well-formed database page, then return 920 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 921 ** guarantee that the page is well-formed. 
It only shows that 922 ** we failed to detect any corruption. 923 */ 924 int sqlite3BtreeInitPage( 925 MemPage *pPage, /* The page to be initialized */ 926 MemPage *pParent /* The parent. Might be NULL */ 927 ){ 928 int pc; /* Address of a freeblock within pPage->aData[] */ 929 int hdr; /* Offset to beginning of page header */ 930 u8 *data; /* Equal to pPage->aData */ 931 BtShared *pBt; /* The main btree structure */ 932 int usableSize; /* Amount of usable space on each page */ 933 int cellOffset; /* Offset from start of page to first cell pointer */ 934 int nFree; /* Number of unused bytes on the page */ 935 int top; /* First byte of the cell content area */ 936 937 pBt = pPage->pBt; 938 assert( pBt!=0 ); 939 assert( pParent==0 || pParent->pBt==pBt ); 940 assert( sqlite3_mutex_held(pBt->mutex) ); 941 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 942 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 943 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 944 if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){ 945 /* The parent page should never change unless the file is corrupt */ 946 return SQLITE_CORRUPT_BKPT; 947 } 948 if( pPage->isInit ) return SQLITE_OK; 949 if( pPage->pParent==0 && pParent!=0 ){ 950 pPage->pParent = pParent; 951 sqlite3PagerRef(pParent->pDbPage); 952 } 953 hdr = pPage->hdrOffset; 954 data = pPage->aData; 955 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT; 956 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 ); 957 pPage->maskPage = pBt->pageSize - 1; 958 pPage->nOverflow = 0; 959 pPage->idxShift = 0; 960 usableSize = pBt->usableSize; 961 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; 962 top = get2byte(&data[hdr+5]); 963 pPage->nCell = get2byte(&data[hdr+3]); 964 if( pPage->nCell>MX_CELL(pBt) ){ 965 /* To many cells for a single page. 
The page must be corrupt */ 966 return SQLITE_CORRUPT_BKPT; 967 } 968 if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){ 969 /* All pages must have at least one cell, except for root pages */ 970 return SQLITE_CORRUPT_BKPT; 971 } 972 973 /* Compute the total free space on the page */ 974 pc = get2byte(&data[hdr+1]); 975 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell); 976 while( pc>0 ){ 977 int next, size; 978 if( pc>usableSize-4 ){ 979 /* Free block is off the page */ 980 return SQLITE_CORRUPT_BKPT; 981 } 982 next = get2byte(&data[pc]); 983 size = get2byte(&data[pc+2]); 984 if( next>0 && next<=pc+size+3 ){ 985 /* Free blocks must be in accending order */ 986 return SQLITE_CORRUPT_BKPT; 987 } 988 nFree += size; 989 pc = next; 990 } 991 pPage->nFree = nFree; 992 if( nFree>=usableSize ){ 993 /* Free space cannot exceed total page size */ 994 return SQLITE_CORRUPT_BKPT; 995 } 996 997 #if 0 998 /* Check that all the offsets in the cell offset array are within range. 999 ** 1000 ** Omitting this consistency check and using the pPage->maskPage mask 1001 ** to prevent overrunning the page buffer in findCell() results in a 1002 ** 2.5% performance gain. 1003 */ 1004 { 1005 u8 *pOff; /* Iterator used to check all cell offsets are in range */ 1006 u8 *pEnd; /* Pointer to end of cell offset array */ 1007 u8 mask; /* Mask of bits that must be zero in MSB of cell offsets */ 1008 mask = ~(((u8)(pBt->pageSize>>8))-1); 1009 pEnd = &data[cellOffset + pPage->nCell*2]; 1010 for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2); 1011 if( pOff!=pEnd ){ 1012 return SQLITE_CORRUPT_BKPT; 1013 } 1014 } 1015 #endif 1016 1017 pPage->isInit = 1; 1018 return SQLITE_OK; 1019 } 1020 1021 /* 1022 ** Set up a raw page so that it looks like a database page holding 1023 ** no entries. 
1024 */ 1025 static void zeroPage(MemPage *pPage, int flags){ 1026 unsigned char *data = pPage->aData; 1027 BtShared *pBt = pPage->pBt; 1028 int hdr = pPage->hdrOffset; 1029 int first; 1030 1031 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); 1032 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1033 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 1034 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1035 assert( sqlite3_mutex_held(pBt->mutex) ); 1036 /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/ 1037 data[hdr] = flags; 1038 first = hdr + 8 + 4*((flags&PTF_LEAF)==0); 1039 memset(&data[hdr+1], 0, 4); 1040 data[hdr+7] = 0; 1041 put2byte(&data[hdr+5], pBt->usableSize); 1042 pPage->nFree = pBt->usableSize - first; 1043 decodeFlags(pPage, flags); 1044 pPage->hdrOffset = hdr; 1045 pPage->cellOffset = first; 1046 pPage->nOverflow = 0; 1047 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 ); 1048 pPage->maskPage = pBt->pageSize - 1; 1049 pPage->idxShift = 0; 1050 pPage->nCell = 0; 1051 pPage->isInit = 1; 1052 } 1053 1054 /* 1055 ** Get a page from the pager. Initialize the MemPage.pBt and 1056 ** MemPage.aData elements if needed. 1057 ** 1058 ** If the noContent flag is set, it means that we do not care about 1059 ** the content of the page at this time. So do not go to the disk 1060 ** to fetch the content. Just fill in the content with zeros for now. 1061 ** If in the future we call sqlite3PagerWrite() on this page, that 1062 ** means we have started to be concerned about content and the disk 1063 ** read should occur at that point. 
1064 */ 1065 int sqlite3BtreeGetPage( 1066 BtShared *pBt, /* The btree */ 1067 Pgno pgno, /* Number of the page to fetch */ 1068 MemPage **ppPage, /* Return the page in this parameter */ 1069 int noContent /* Do not load page content if true */ 1070 ){ 1071 int rc; 1072 MemPage *pPage; 1073 DbPage *pDbPage; 1074 1075 assert( sqlite3_mutex_held(pBt->mutex) ); 1076 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent); 1077 if( rc ) return rc; 1078 pPage = (MemPage *)sqlite3PagerGetExtra(pDbPage); 1079 pPage->aData = sqlite3PagerGetData(pDbPage); 1080 pPage->pDbPage = pDbPage; 1081 pPage->pBt = pBt; 1082 pPage->pgno = pgno; 1083 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; 1084 *ppPage = pPage; 1085 return SQLITE_OK; 1086 } 1087 1088 /* 1089 ** Get a page from the pager and initialize it. This routine 1090 ** is just a convenience wrapper around separate calls to 1091 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage(). 1092 */ 1093 static int getAndInitPage( 1094 BtShared *pBt, /* The database file */ 1095 Pgno pgno, /* Number of the page to get */ 1096 MemPage **ppPage, /* Write the page pointer here */ 1097 MemPage *pParent /* Parent of the page */ 1098 ){ 1099 int rc; 1100 assert( sqlite3_mutex_held(pBt->mutex) ); 1101 if( pgno==0 ){ 1102 return SQLITE_CORRUPT_BKPT; 1103 } 1104 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0); 1105 if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){ 1106 rc = sqlite3BtreeInitPage(*ppPage, pParent); 1107 if( rc!=SQLITE_OK ){ 1108 releasePage(*ppPage); 1109 *ppPage = 0; 1110 } 1111 } 1112 return rc; 1113 } 1114 1115 /* 1116 ** Release a MemPage. This should be called once for each prior 1117 ** call to sqlite3BtreeGetPage. 
1118 */ 1119 static void releasePage(MemPage *pPage){ 1120 if( pPage ){ 1121 assert( pPage->aData ); 1122 assert( pPage->pBt ); 1123 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1124 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 1125 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1126 sqlite3PagerUnref(pPage->pDbPage); 1127 } 1128 } 1129 1130 /* 1131 ** This routine is called when the reference count for a page 1132 ** reaches zero. We need to unref the pParent pointer when that 1133 ** happens. 1134 */ 1135 static void pageDestructor(DbPage *pData, int pageSize){ 1136 MemPage *pPage; 1137 assert( (pageSize & 7)==0 ); 1138 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 1139 assert( pPage->isInit==0 || sqlite3_mutex_held(pPage->pBt->mutex) ); 1140 if( pPage->pParent ){ 1141 MemPage *pParent = pPage->pParent; 1142 assert( pParent->pBt==pPage->pBt ); 1143 pPage->pParent = 0; 1144 releasePage(pParent); 1145 } 1146 pPage->isInit = 0; 1147 } 1148 1149 /* 1150 ** During a rollback, when the pager reloads information into the cache 1151 ** so that the cache is restored to its original state at the start of 1152 ** the transaction, for each page restored this routine is called. 1153 ** 1154 ** This routine needs to reset the extra data section at the end of the 1155 ** page to agree with the restored data. 1156 */ 1157 static void pageReinit(DbPage *pData, int pageSize){ 1158 MemPage *pPage; 1159 assert( (pageSize & 7)==0 ); 1160 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 1161 if( pPage->isInit ){ 1162 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1163 pPage->isInit = 0; 1164 sqlite3BtreeInitPage(pPage, pPage->pParent); 1165 } 1166 } 1167 1168 /* 1169 ** Invoke the busy handler for a btree. 
1170 */ 1171 static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){ 1172 BtShared *pBt = (BtShared*)pArg; 1173 assert( pBt->db ); 1174 assert( sqlite3_mutex_held(pBt->db->mutex) ); 1175 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); 1176 } 1177 1178 /* 1179 ** Open a database file. 1180 ** 1181 ** zFilename is the name of the database file. If zFilename is NULL 1182 ** a new database with a random name is created. This randomly named 1183 ** database file will be deleted when sqlite3BtreeClose() is called. 1184 ** If zFilename is ":memory:" then an in-memory database is created 1185 ** that is automatically destroyed when it is closed. 1186 */ 1187 int sqlite3BtreeOpen( 1188 const char *zFilename, /* Name of the file containing the BTree database */ 1189 sqlite3 *db, /* Associated database handle */ 1190 Btree **ppBtree, /* Pointer to new Btree object written here */ 1191 int flags, /* Options */ 1192 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ 1193 ){ 1194 sqlite3_vfs *pVfs; /* The VFS to use for this btree */ 1195 BtShared *pBt = 0; /* Shared part of btree structure */ 1196 Btree *p; /* Handle to return */ 1197 int rc = SQLITE_OK; 1198 int nReserve; 1199 unsigned char zDbHeader[100]; 1200 1201 /* Set the variable isMemdb to true for an in-memory database, or 1202 ** false for a file-based database. This symbol is only required if 1203 ** either of the shared-data or autovacuum features are compiled 1204 ** into the library. 
1205 */ 1206 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM) 1207 #ifdef SQLITE_OMIT_MEMORYDB 1208 const int isMemdb = 0; 1209 #else 1210 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:"); 1211 #endif 1212 #endif 1213 1214 assert( db!=0 ); 1215 assert( sqlite3_mutex_held(db->mutex) ); 1216 1217 pVfs = db->pVfs; 1218 p = sqlite3MallocZero(sizeof(Btree)); 1219 if( !p ){ 1220 return SQLITE_NOMEM; 1221 } 1222 p->inTrans = TRANS_NONE; 1223 p->db = db; 1224 1225 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1226 /* 1227 ** If this Btree is a candidate for shared cache, try to find an 1228 ** existing BtShared object that we can share with 1229 */ 1230 if( isMemdb==0 1231 && (db->flags & SQLITE_Vtab)==0 1232 && zFilename && zFilename[0] 1233 ){ 1234 if( sqlite3SharedCacheEnabled ){ 1235 int nFullPathname = pVfs->mxPathname+1; 1236 char *zFullPathname = sqlite3Malloc(nFullPathname); 1237 sqlite3_mutex *mutexShared; 1238 p->sharable = 1; 1239 db->flags |= SQLITE_SharedCache; 1240 if( !zFullPathname ){ 1241 sqlite3_free(p); 1242 return SQLITE_NOMEM; 1243 } 1244 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname); 1245 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1246 sqlite3_mutex_enter(mutexShared); 1247 for(pBt=sqlite3SharedCacheList; pBt; pBt=pBt->pNext){ 1248 assert( pBt->nRef>0 ); 1249 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager)) 1250 && sqlite3PagerVfs(pBt->pPager)==pVfs ){ 1251 p->pBt = pBt; 1252 pBt->nRef++; 1253 break; 1254 } 1255 } 1256 sqlite3_mutex_leave(mutexShared); 1257 sqlite3_free(zFullPathname); 1258 } 1259 #ifdef SQLITE_DEBUG 1260 else{ 1261 /* In debug mode, we mark all persistent databases as sharable 1262 ** even when they are not. This exercises the locking code and 1263 ** gives more opportunity for asserts(sqlite3_mutex_held()) 1264 ** statements to find locking problems. 
1265 */ 1266 p->sharable = 1; 1267 } 1268 #endif 1269 } 1270 #endif 1271 if( pBt==0 ){ 1272 /* 1273 ** The following asserts make sure that structures used by the btree are 1274 ** the right size. This is to guard against size changes that result 1275 ** when compiling on a different architecture. 1276 */ 1277 assert( sizeof(i64)==8 || sizeof(i64)==4 ); 1278 assert( sizeof(u64)==8 || sizeof(u64)==4 ); 1279 assert( sizeof(u32)==4 ); 1280 assert( sizeof(u16)==2 ); 1281 assert( sizeof(Pgno)==4 ); 1282 1283 pBt = sqlite3MallocZero( sizeof(*pBt) ); 1284 if( pBt==0 ){ 1285 rc = SQLITE_NOMEM; 1286 goto btree_open_out; 1287 } 1288 pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler; 1289 pBt->busyHdr.pArg = pBt; 1290 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, 1291 EXTRA_SIZE, flags, vfsFlags); 1292 if( rc==SQLITE_OK ){ 1293 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); 1294 } 1295 if( rc!=SQLITE_OK ){ 1296 goto btree_open_out; 1297 } 1298 sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr); 1299 p->pBt = pBt; 1300 1301 sqlite3PagerSetDestructor(pBt->pPager, pageDestructor); 1302 sqlite3PagerSetReiniter(pBt->pPager, pageReinit); 1303 pBt->pCursor = 0; 1304 pBt->pPage1 = 0; 1305 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager); 1306 pBt->pageSize = get2byte(&zDbHeader[16]); 1307 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE 1308 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ 1309 pBt->pageSize = 0; 1310 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 1311 #ifndef SQLITE_OMIT_AUTOVACUUM 1312 /* If the magic name ":memory:" will create an in-memory database, then 1313 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if 1314 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if 1315 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a 1316 ** regular file-name. In this case the auto-vacuum applies as per normal. 
1317 */ 1318 if( zFilename && !isMemdb ){ 1319 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); 1320 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); 1321 } 1322 #endif 1323 nReserve = 0; 1324 }else{ 1325 nReserve = zDbHeader[20]; 1326 pBt->pageSizeFixed = 1; 1327 #ifndef SQLITE_OMIT_AUTOVACUUM 1328 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); 1329 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); 1330 #endif 1331 } 1332 pBt->usableSize = pBt->pageSize - nReserve; 1333 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ 1334 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 1335 1336 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1337 /* Add the new BtShared object to the linked list sharable BtShareds. 1338 */ 1339 if( p->sharable ){ 1340 sqlite3_mutex *mutexShared; 1341 pBt->nRef = 1; 1342 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1343 if( SQLITE_THREADSAFE && sqlite3Config.bCoreMutex ){ 1344 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST); 1345 if( pBt->mutex==0 ){ 1346 rc = SQLITE_NOMEM; 1347 db->mallocFailed = 0; 1348 goto btree_open_out; 1349 } 1350 } 1351 sqlite3_mutex_enter(mutexShared); 1352 pBt->pNext = sqlite3SharedCacheList; 1353 sqlite3SharedCacheList = pBt; 1354 sqlite3_mutex_leave(mutexShared); 1355 } 1356 #endif 1357 } 1358 1359 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1360 /* If the new Btree uses a sharable pBtShared, then link the new 1361 ** Btree into the list of all sharable Btrees for the same connection. 1362 ** The list is kept in ascending order by pBt address. 
1363 */ 1364 if( p->sharable ){ 1365 int i; 1366 Btree *pSib; 1367 for(i=0; i<db->nDb; i++){ 1368 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ 1369 while( pSib->pPrev ){ pSib = pSib->pPrev; } 1370 if( p->pBt<pSib->pBt ){ 1371 p->pNext = pSib; 1372 p->pPrev = 0; 1373 pSib->pPrev = p; 1374 }else{ 1375 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){ 1376 pSib = pSib->pNext; 1377 } 1378 p->pNext = pSib->pNext; 1379 p->pPrev = pSib; 1380 if( p->pNext ){ 1381 p->pNext->pPrev = p; 1382 } 1383 pSib->pNext = p; 1384 } 1385 break; 1386 } 1387 } 1388 } 1389 #endif 1390 *ppBtree = p; 1391 1392 btree_open_out: 1393 if( rc!=SQLITE_OK ){ 1394 if( pBt && pBt->pPager ){ 1395 sqlite3PagerClose(pBt->pPager); 1396 } 1397 sqlite3_free(pBt); 1398 sqlite3_free(p); 1399 *ppBtree = 0; 1400 } 1401 return rc; 1402 } 1403 1404 /* 1405 ** Decrement the BtShared.nRef counter. When it reaches zero, 1406 ** remove the BtShared structure from the sharing list. Return 1407 ** true if the BtShared.nRef counter reaches zero and return 1408 ** false if it is still positive. 
1409 */ 1410 static int removeFromSharingList(BtShared *pBt){ 1411 #ifndef SQLITE_OMIT_SHARED_CACHE 1412 sqlite3_mutex *pMaster; 1413 BtShared *pList; 1414 int removed = 0; 1415 1416 assert( sqlite3_mutex_notheld(pBt->mutex) ); 1417 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1418 sqlite3_mutex_enter(pMaster); 1419 pBt->nRef--; 1420 if( pBt->nRef<=0 ){ 1421 if( sqlite3SharedCacheList==pBt ){ 1422 sqlite3SharedCacheList = pBt->pNext; 1423 }else{ 1424 pList = sqlite3SharedCacheList; 1425 while( ALWAYS(pList) && pList->pNext!=pBt ){ 1426 pList=pList->pNext; 1427 } 1428 if( ALWAYS(pList) ){ 1429 pList->pNext = pBt->pNext; 1430 } 1431 } 1432 if( SQLITE_THREADSAFE ){ 1433 sqlite3_mutex_free(pBt->mutex); 1434 } 1435 removed = 1; 1436 } 1437 sqlite3_mutex_leave(pMaster); 1438 return removed; 1439 #else 1440 return 1; 1441 #endif 1442 } 1443 1444 /* 1445 ** Make sure pBt->pTmpSpace points to an allocation of 1446 ** MX_CELL_SIZE(pBt) bytes. 1447 */ 1448 static void allocateTempSpace(BtShared *pBt){ 1449 if( !pBt->pTmpSpace ){ 1450 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); 1451 } 1452 } 1453 1454 /* 1455 ** Free the pBt->pTmpSpace allocation 1456 */ 1457 static void freeTempSpace(BtShared *pBt){ 1458 sqlite3PageFree( pBt->pTmpSpace); 1459 pBt->pTmpSpace = 0; 1460 } 1461 1462 /* 1463 ** Close an open database and invalidate all cursors. 1464 */ 1465 int sqlite3BtreeClose(Btree *p){ 1466 BtShared *pBt = p->pBt; 1467 BtCursor *pCur; 1468 1469 /* Close all cursors opened via this handle. */ 1470 assert( sqlite3_mutex_held(p->db->mutex) ); 1471 sqlite3BtreeEnter(p); 1472 pBt->db = p->db; 1473 pCur = pBt->pCursor; 1474 while( pCur ){ 1475 BtCursor *pTmp = pCur; 1476 pCur = pCur->pNext; 1477 if( pTmp->pBtree==p ){ 1478 sqlite3BtreeCloseCursor(pTmp); 1479 } 1480 } 1481 1482 /* Rollback any active transaction and free the handle structure. 1483 ** The call to sqlite3BtreeRollback() drops any table-locks held by 1484 ** this handle. 
1485 */ 1486 sqlite3BtreeRollback(p); 1487 sqlite3BtreeLeave(p); 1488 1489 /* If there are still other outstanding references to the shared-btree 1490 ** structure, return now. The remainder of this procedure cleans 1491 ** up the shared-btree. 1492 */ 1493 assert( p->wantToLock==0 && p->locked==0 ); 1494 if( !p->sharable || removeFromSharingList(pBt) ){ 1495 /* The pBt is no longer on the sharing list, so we can access 1496 ** it without having to hold the mutex. 1497 ** 1498 ** Clean out and delete the BtShared object. 1499 */ 1500 assert( !pBt->pCursor ); 1501 sqlite3PagerClose(pBt->pPager); 1502 if( pBt->xFreeSchema && pBt->pSchema ){ 1503 pBt->xFreeSchema(pBt->pSchema); 1504 } 1505 sqlite3_free(pBt->pSchema); 1506 freeTempSpace(pBt); 1507 sqlite3_free(pBt); 1508 } 1509 1510 #ifndef SQLITE_OMIT_SHARED_CACHE 1511 assert( p->wantToLock==0 ); 1512 assert( p->locked==0 ); 1513 if( p->pPrev ) p->pPrev->pNext = p->pNext; 1514 if( p->pNext ) p->pNext->pPrev = p->pPrev; 1515 #endif 1516 1517 sqlite3_free(p); 1518 return SQLITE_OK; 1519 } 1520 1521 /* 1522 ** Change the limit on the number of pages allowed in the cache. 1523 ** 1524 ** The maximum number of cache pages is set to the absolute 1525 ** value of mxPage. If mxPage is negative, the pager will 1526 ** operate asynchronously - it will not stop to do fsync()s 1527 ** to insure data is written to the disk surface before 1528 ** continuing. Transactions still work if synchronous is off, 1529 ** and the database cannot be corrupted if this program 1530 ** crashes. But if the operating system crashes or there is 1531 ** an abrupt power failure when synchronous is off, the database 1532 ** could be left in an inconsistent and unrecoverable state. 1533 ** Synchronous is on by default so database corruption is not 1534 ** normally a worry. 
1535 */ 1536 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 1537 BtShared *pBt = p->pBt; 1538 assert( sqlite3_mutex_held(p->db->mutex) ); 1539 sqlite3BtreeEnter(p); 1540 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 1541 sqlite3BtreeLeave(p); 1542 return SQLITE_OK; 1543 } 1544 1545 /* 1546 ** Change the way data is synced to disk in order to increase or decrease 1547 ** how well the database resists damage due to OS crashes and power 1548 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 1549 ** there is a high probability of damage) Level 2 is the default. There 1550 ** is a very low but non-zero probability of damage. Level 3 reduces the 1551 ** probability of damage to near zero but with a write performance reduction. 1552 */ 1553 #ifndef SQLITE_OMIT_PAGER_PRAGMAS 1554 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){ 1555 BtShared *pBt = p->pBt; 1556 assert( sqlite3_mutex_held(p->db->mutex) ); 1557 sqlite3BtreeEnter(p); 1558 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync); 1559 sqlite3BtreeLeave(p); 1560 return SQLITE_OK; 1561 } 1562 #endif 1563 1564 /* 1565 ** Return TRUE if the given btree is set to safety level 1. In other 1566 ** words, return TRUE if no sync() occurs on the disk files. 1567 */ 1568 int sqlite3BtreeSyncDisabled(Btree *p){ 1569 BtShared *pBt = p->pBt; 1570 int rc; 1571 assert( sqlite3_mutex_held(p->db->mutex) ); 1572 sqlite3BtreeEnter(p); 1573 assert( pBt && pBt->pPager ); 1574 rc = sqlite3PagerNosync(pBt->pPager); 1575 sqlite3BtreeLeave(p); 1576 return rc; 1577 } 1578 1579 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) 1580 /* 1581 ** Change the default pages size and the number of reserved bytes per page. 1582 ** 1583 ** The page size must be a power of 2 between 512 and 65536. If the page 1584 ** size supplied does not meet this constraint then the page size is not 1585 ** changed. 
1586 ** 1587 ** Page sizes are constrained to be a power of two so that the region 1588 ** of the database file used for locking (beginning at PENDING_BYTE, 1589 ** the first byte past the 1GB boundary, 0x40000000) needs to occur 1590 ** at the beginning of a page. 1591 ** 1592 ** If parameter nReserve is less than zero, then the number of reserved 1593 ** bytes per page is left unchanged. 1594 */ 1595 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){ 1596 int rc = SQLITE_OK; 1597 BtShared *pBt = p->pBt; 1598 sqlite3BtreeEnter(p); 1599 if( pBt->pageSizeFixed ){ 1600 sqlite3BtreeLeave(p); 1601 return SQLITE_READONLY; 1602 } 1603 if( nReserve<0 ){ 1604 nReserve = pBt->pageSize - pBt->usableSize; 1605 } 1606 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 1607 ((pageSize-1)&pageSize)==0 ){ 1608 assert( (pageSize & 7)==0 ); 1609 assert( !pBt->pPage1 && !pBt->pCursor ); 1610 pBt->pageSize = pageSize; 1611 freeTempSpace(pBt); 1612 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 1613 } 1614 pBt->usableSize = pBt->pageSize - nReserve; 1615 sqlite3BtreeLeave(p); 1616 return rc; 1617 } 1618 1619 /* 1620 ** Return the currently defined page size 1621 */ 1622 int sqlite3BtreeGetPageSize(Btree *p){ 1623 return p->pBt->pageSize; 1624 } 1625 int sqlite3BtreeGetReserve(Btree *p){ 1626 int n; 1627 sqlite3BtreeEnter(p); 1628 n = p->pBt->pageSize - p->pBt->usableSize; 1629 sqlite3BtreeLeave(p); 1630 return n; 1631 } 1632 1633 /* 1634 ** Set the maximum page count for a database if mxPage is positive. 1635 ** No changes are made if mxPage is 0 or negative. 1636 ** Regardless of the value of mxPage, return the maximum page count. 
1637 */ 1638 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){ 1639 int n; 1640 sqlite3BtreeEnter(p); 1641 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 1642 sqlite3BtreeLeave(p); 1643 return n; 1644 } 1645 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */ 1646 1647 /* 1648 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 1649 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 1650 ** is disabled. The default value for the auto-vacuum property is 1651 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 1652 */ 1653 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 1654 #ifdef SQLITE_OMIT_AUTOVACUUM 1655 return SQLITE_READONLY; 1656 #else 1657 BtShared *pBt = p->pBt; 1658 int rc = SQLITE_OK; 1659 int av = (autoVacuum?1:0); 1660 1661 sqlite3BtreeEnter(p); 1662 if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){ 1663 rc = SQLITE_READONLY; 1664 }else{ 1665 pBt->autoVacuum = av; 1666 } 1667 sqlite3BtreeLeave(p); 1668 return rc; 1669 #endif 1670 } 1671 1672 /* 1673 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 1674 ** enabled 1 is returned. Otherwise 0. 1675 */ 1676 int sqlite3BtreeGetAutoVacuum(Btree *p){ 1677 #ifdef SQLITE_OMIT_AUTOVACUUM 1678 return BTREE_AUTOVACUUM_NONE; 1679 #else 1680 int rc; 1681 sqlite3BtreeEnter(p); 1682 rc = ( 1683 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 1684 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 1685 BTREE_AUTOVACUUM_INCR 1686 ); 1687 sqlite3BtreeLeave(p); 1688 return rc; 1689 #endif 1690 } 1691 1692 1693 /* 1694 ** Get a reference to pPage1 of the database file. This will 1695 ** also acquire a readlock on that file. 1696 ** 1697 ** SQLITE_OK is returned on success. If the file is not a 1698 ** well-formed database file, then SQLITE_CORRUPT is returned. 1699 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 1700 ** is returned if we run out of memory. 
1701 */ 1702 static int lockBtree(BtShared *pBt){ 1703 int rc; 1704 MemPage *pPage1; 1705 int nPage; 1706 1707 assert( sqlite3_mutex_held(pBt->mutex) ); 1708 if( pBt->pPage1 ) return SQLITE_OK; 1709 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0); 1710 if( rc!=SQLITE_OK ) return rc; 1711 1712 /* Do some checking to help insure the file we opened really is 1713 ** a valid database file. 1714 */ 1715 rc = sqlite3PagerPagecount(pBt->pPager, &nPage); 1716 if( rc!=SQLITE_OK ){ 1717 goto page1_init_failed; 1718 }else if( nPage>0 ){ 1719 int pageSize; 1720 int usableSize; 1721 u8 *page1 = pPage1->aData; 1722 rc = SQLITE_NOTADB; 1723 if( memcmp(page1, zMagicHeader, 16)!=0 ){ 1724 goto page1_init_failed; 1725 } 1726 if( page1[18]>1 ){ 1727 pBt->readOnly = 1; 1728 } 1729 if( page1[19]>1 ){ 1730 goto page1_init_failed; 1731 } 1732 1733 /* The maximum embedded fraction must be exactly 25%. And the minimum 1734 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data. 1735 ** The original design allowed these amounts to vary, but as of 1736 ** version 3.6.0, we require them to be fixed. 1737 */ 1738 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ 1739 goto page1_init_failed; 1740 } 1741 pageSize = get2byte(&page1[16]); 1742 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 || 1743 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE) 1744 ){ 1745 goto page1_init_failed; 1746 } 1747 assert( (pageSize & 7)==0 ); 1748 usableSize = pageSize - page1[20]; 1749 if( pageSize!=pBt->pageSize ){ 1750 /* After reading the first page of the database assuming a page size 1751 ** of BtShared.pageSize, we have discovered that the page-size is 1752 ** actually pageSize. Unlock the database, leave pBt->pPage1 at 1753 ** zero and return SQLITE_OK. The caller will call this function 1754 ** again with the correct page-size. 
1755 */ 1756 releasePage(pPage1); 1757 pBt->usableSize = usableSize; 1758 pBt->pageSize = pageSize; 1759 freeTempSpace(pBt); 1760 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); 1761 return SQLITE_OK; 1762 } 1763 if( usableSize<500 ){ 1764 goto page1_init_failed; 1765 } 1766 pBt->pageSize = pageSize; 1767 pBt->usableSize = usableSize; 1768 #ifndef SQLITE_OMIT_AUTOVACUUM 1769 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); 1770 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); 1771 #endif 1772 } 1773 1774 /* maxLocal is the maximum amount of payload to store locally for 1775 ** a cell. Make sure it is small enough so that at least minFanout 1776 ** cells can will fit on one page. We assume a 10-byte page header. 1777 ** Besides the payload, the cell must store: 1778 ** 2-byte pointer to the cell 1779 ** 4-byte child pointer 1780 ** 9-byte nKey value 1781 ** 4-byte nData value 1782 ** 4-byte overflow page pointer 1783 ** So a cell consists of a 2-byte poiner, a header which is as much as 1784 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow 1785 ** page pointer. 1786 */ 1787 pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23; 1788 pBt->minLocal = (pBt->usableSize-12)*32/255 - 23; 1789 pBt->maxLeaf = pBt->usableSize - 35; 1790 pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23; 1791 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); 1792 pBt->pPage1 = pPage1; 1793 return SQLITE_OK; 1794 1795 page1_init_failed: 1796 releasePage(pPage1); 1797 pBt->pPage1 = 0; 1798 return rc; 1799 } 1800 1801 /* 1802 ** This routine works like lockBtree() except that it also invokes the 1803 ** busy callback if there is lock contention. 
1804 */ 1805 static int lockBtreeWithRetry(Btree *pRef){ 1806 int rc = SQLITE_OK; 1807 1808 assert( sqlite3BtreeHoldsMutex(pRef) ); 1809 if( pRef->inTrans==TRANS_NONE ){ 1810 u8 inTransaction = pRef->pBt->inTransaction; 1811 btreeIntegrity(pRef); 1812 rc = sqlite3BtreeBeginTrans(pRef, 0); 1813 pRef->pBt->inTransaction = inTransaction; 1814 pRef->inTrans = TRANS_NONE; 1815 if( rc==SQLITE_OK ){ 1816 pRef->pBt->nTransaction--; 1817 } 1818 btreeIntegrity(pRef); 1819 } 1820 return rc; 1821 } 1822 1823 1824 /* 1825 ** If there are no outstanding cursors and we are not in the middle 1826 ** of a transaction but there is a read lock on the database, then 1827 ** this routine unrefs the first page of the database file which 1828 ** has the effect of releasing the read lock. 1829 ** 1830 ** If there are any outstanding cursors, this routine is a no-op. 1831 ** 1832 ** If there is a transaction in progress, this routine is a no-op. 1833 */ 1834 static void unlockBtreeIfUnused(BtShared *pBt){ 1835 assert( sqlite3_mutex_held(pBt->mutex) ); 1836 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){ 1837 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){ 1838 assert( pBt->pPage1->aData ); 1839 #if 0 1840 if( pBt->pPage1->aData==0 ){ 1841 MemPage *pPage = pBt->pPage1; 1842 pPage->aData = sqlite3PagerGetData(pPage->pDbPage); 1843 pPage->pBt = pBt; 1844 pPage->pgno = 1; 1845 } 1846 #endif 1847 releasePage(pBt->pPage1); 1848 } 1849 pBt->pPage1 = 0; 1850 pBt->inStmt = 0; 1851 } 1852 } 1853 1854 /* 1855 ** Create a new database by initializing the first page of the 1856 ** file. 
1857 */ 1858 static int newDatabase(BtShared *pBt){ 1859 MemPage *pP1; 1860 unsigned char *data; 1861 int rc; 1862 int nPage; 1863 1864 assert( sqlite3_mutex_held(pBt->mutex) ); 1865 rc = sqlite3PagerPagecount(pBt->pPager, &nPage); 1866 if( rc!=SQLITE_OK || nPage>0 ){ 1867 return rc; 1868 } 1869 pP1 = pBt->pPage1; 1870 assert( pP1!=0 ); 1871 data = pP1->aData; 1872 rc = sqlite3PagerWrite(pP1->pDbPage); 1873 if( rc ) return rc; 1874 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 1875 assert( sizeof(zMagicHeader)==16 ); 1876 put2byte(&data[16], pBt->pageSize); 1877 data[18] = 1; 1878 data[19] = 1; 1879 data[20] = pBt->pageSize - pBt->usableSize; 1880 data[21] = 64; 1881 data[22] = 32; 1882 data[23] = 32; 1883 memset(&data[24], 0, 100-24); 1884 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 1885 pBt->pageSizeFixed = 1; 1886 #ifndef SQLITE_OMIT_AUTOVACUUM 1887 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 1888 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 1889 put4byte(&data[36 + 4*4], pBt->autoVacuum); 1890 put4byte(&data[36 + 7*4], pBt->incrVacuum); 1891 #endif 1892 return SQLITE_OK; 1893 } 1894 1895 /* 1896 ** Attempt to start a new transaction. A write-transaction 1897 ** is started if the second argument is nonzero, otherwise a read- 1898 ** transaction. If the second argument is 2 or more and exclusive 1899 ** transaction is started, meaning that no other process is allowed 1900 ** to access the database. A preexisting transaction may not be 1901 ** upgraded to exclusive by calling this routine a second time - the 1902 ** exclusivity flag only works for a new transaction. 1903 ** 1904 ** A write-transaction must be started before attempting any 1905 ** changes to the database. 
None of the following routines 1906 ** will work unless a transaction is started first: 1907 ** 1908 ** sqlite3BtreeCreateTable() 1909 ** sqlite3BtreeCreateIndex() 1910 ** sqlite3BtreeClearTable() 1911 ** sqlite3BtreeDropTable() 1912 ** sqlite3BtreeInsert() 1913 ** sqlite3BtreeDelete() 1914 ** sqlite3BtreeUpdateMeta() 1915 ** 1916 ** If an initial attempt to acquire the lock fails because of lock contention 1917 ** and the database was previously unlocked, then invoke the busy handler 1918 ** if there is one. But if there was previously a read-lock, do not 1919 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is 1920 ** returned when there is already a read-lock in order to avoid a deadlock. 1921 ** 1922 ** Suppose there are two processes A and B. A has a read lock and B has 1923 ** a reserved lock. B tries to promote to exclusive but is blocked because 1924 ** of A's read lock. A tries to promote to reserved but is blocked by B. 1925 ** One or the other of the two processes must give way or there can be 1926 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback 1927 ** when A already has a read lock, we encourage A to give up and let B 1928 ** proceed. 1929 */ 1930 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ 1931 BtShared *pBt = p->pBt; 1932 int rc = SQLITE_OK; 1933 1934 sqlite3BtreeEnter(p); 1935 pBt->db = p->db; 1936 btreeIntegrity(p); 1937 1938 /* If the btree is already in a write-transaction, or it 1939 ** is already in a read-transaction and a read-transaction 1940 ** is requested, this is a no-op. 
1941 */ 1942 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 1943 goto trans_begun; 1944 } 1945 1946 /* Write transactions are not possible on a read-only database */ 1947 if( pBt->readOnly && wrflag ){ 1948 rc = SQLITE_READONLY; 1949 goto trans_begun; 1950 } 1951 1952 /* If another database handle has already opened a write transaction 1953 ** on this shared-btree structure and a second write transaction is 1954 ** requested, return SQLITE_BUSY. 1955 */ 1956 if( pBt->inTransaction==TRANS_WRITE && wrflag ){ 1957 rc = SQLITE_BUSY; 1958 goto trans_begun; 1959 } 1960 1961 #ifndef SQLITE_OMIT_SHARED_CACHE 1962 if( wrflag>1 ){ 1963 BtLock *pIter; 1964 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 1965 if( pIter->pBtree!=p ){ 1966 rc = SQLITE_BUSY; 1967 goto trans_begun; 1968 } 1969 } 1970 } 1971 #endif 1972 1973 do { 1974 if( pBt->pPage1==0 ){ 1975 do{ 1976 rc = lockBtree(pBt); 1977 }while( pBt->pPage1==0 && rc==SQLITE_OK ); 1978 } 1979 1980 if( rc==SQLITE_OK && wrflag ){ 1981 if( pBt->readOnly ){ 1982 rc = SQLITE_READONLY; 1983 }else{ 1984 rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1); 1985 if( rc==SQLITE_OK ){ 1986 rc = newDatabase(pBt); 1987 } 1988 } 1989 } 1990 1991 if( rc==SQLITE_OK ){ 1992 if( wrflag ) pBt->inStmt = 0; 1993 }else{ 1994 unlockBtreeIfUnused(pBt); 1995 } 1996 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 1997 sqlite3BtreeInvokeBusyHandler(pBt, 0) ); 1998 1999 if( rc==SQLITE_OK ){ 2000 if( p->inTrans==TRANS_NONE ){ 2001 pBt->nTransaction++; 2002 } 2003 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 2004 if( p->inTrans>pBt->inTransaction ){ 2005 pBt->inTransaction = p->inTrans; 2006 } 2007 #ifndef SQLITE_OMIT_SHARED_CACHE 2008 if( wrflag>1 ){ 2009 assert( !pBt->pExclusive ); 2010 pBt->pExclusive = p; 2011 } 2012 #endif 2013 } 2014 2015 2016 trans_begun: 2017 btreeIntegrity(p); 2018 sqlite3BtreeLeave(p); 2019 return rc; 2020 } 2021 2022 /* 2023 ** Return the size of the database file in pages. 
Or return -1 if 2024 ** there is any kind of error. 2025 */ 2026 static int pagerPagecount(Pager *pPager){ 2027 int rc; 2028 int nPage; 2029 rc = sqlite3PagerPagecount(pPager, &nPage); 2030 return (rc==SQLITE_OK?nPage:-1); 2031 } 2032 2033 2034 #ifndef SQLITE_OMIT_AUTOVACUUM 2035 2036 /* 2037 ** Set the pointer-map entries for all children of page pPage. Also, if 2038 ** pPage contains cells that point to overflow pages, set the pointer 2039 ** map entries for the overflow pages as well. 2040 */ 2041 static int setChildPtrmaps(MemPage *pPage){ 2042 int i; /* Counter variable */ 2043 int nCell; /* Number of cells in page pPage */ 2044 int rc; /* Return code */ 2045 BtShared *pBt = pPage->pBt; 2046 int isInitOrig = pPage->isInit; 2047 Pgno pgno = pPage->pgno; 2048 2049 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 2050 rc = sqlite3BtreeInitPage(pPage, pPage->pParent); 2051 if( rc!=SQLITE_OK ){ 2052 goto set_child_ptrmaps_out; 2053 } 2054 nCell = pPage->nCell; 2055 2056 for(i=0; i<nCell; i++){ 2057 u8 *pCell = findCell(pPage, i); 2058 2059 rc = ptrmapPutOvflPtr(pPage, pCell); 2060 if( rc!=SQLITE_OK ){ 2061 goto set_child_ptrmaps_out; 2062 } 2063 2064 if( !pPage->leaf ){ 2065 Pgno childPgno = get4byte(pCell); 2066 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno); 2067 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out; 2068 } 2069 } 2070 2071 if( !pPage->leaf ){ 2072 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 2073 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno); 2074 } 2075 2076 set_child_ptrmaps_out: 2077 pPage->isInit = isInitOrig; 2078 return rc; 2079 } 2080 2081 /* 2082 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow 2083 ** page, is a pointer to page iFrom. Modify this pointer so that it points to 2084 ** iTo. Parameter eType describes the type of pointer to be modified, as 2085 ** follows: 2086 ** 2087 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 2088 ** page of pPage. 
2089 ** 2090 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 2091 ** page pointed to by one of the cells on pPage. 2092 ** 2093 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 2094 ** overflow page in the list. 2095 */ 2096 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 2097 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 2098 if( eType==PTRMAP_OVERFLOW2 ){ 2099 /* The pointer is always the first 4 bytes of the page in this case. */ 2100 if( get4byte(pPage->aData)!=iFrom ){ 2101 return SQLITE_CORRUPT_BKPT; 2102 } 2103 put4byte(pPage->aData, iTo); 2104 }else{ 2105 int isInitOrig = pPage->isInit; 2106 int i; 2107 int nCell; 2108 2109 sqlite3BtreeInitPage(pPage, 0); 2110 nCell = pPage->nCell; 2111 2112 for(i=0; i<nCell; i++){ 2113 u8 *pCell = findCell(pPage, i); 2114 if( eType==PTRMAP_OVERFLOW1 ){ 2115 CellInfo info; 2116 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 2117 if( info.iOverflow ){ 2118 if( iFrom==get4byte(&pCell[info.iOverflow]) ){ 2119 put4byte(&pCell[info.iOverflow], iTo); 2120 break; 2121 } 2122 } 2123 }else{ 2124 if( get4byte(pCell)==iFrom ){ 2125 put4byte(pCell, iTo); 2126 break; 2127 } 2128 } 2129 } 2130 2131 if( i==nCell ){ 2132 if( eType!=PTRMAP_BTREE || 2133 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 2134 return SQLITE_CORRUPT_BKPT; 2135 } 2136 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 2137 } 2138 2139 pPage->isInit = isInitOrig; 2140 } 2141 return SQLITE_OK; 2142 } 2143 2144 2145 /* 2146 ** Move the open database page pDbPage to location iFreePage in the 2147 ** database. The pDbPage reference remains valid. 
2148 */ 2149 static int relocatePage( 2150 BtShared *pBt, /* Btree */ 2151 MemPage *pDbPage, /* Open page to move */ 2152 u8 eType, /* Pointer map 'type' entry for pDbPage */ 2153 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 2154 Pgno iFreePage, /* The location to move pDbPage to */ 2155 int isCommit 2156 ){ 2157 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 2158 Pgno iDbPage = pDbPage->pgno; 2159 Pager *pPager = pBt->pPager; 2160 int rc; 2161 2162 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 2163 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 2164 assert( sqlite3_mutex_held(pBt->mutex) ); 2165 assert( pDbPage->pBt==pBt ); 2166 2167 /* Move page iDbPage from its current location to page number iFreePage */ 2168 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 2169 iDbPage, iFreePage, iPtrPage, eType)); 2170 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); 2171 if( rc!=SQLITE_OK ){ 2172 return rc; 2173 } 2174 pDbPage->pgno = iFreePage; 2175 2176 /* If pDbPage was a btree-page, then it may have child pages and/or cells 2177 ** that point to overflow pages. The pointer map entries for all these 2178 ** pages need to be changed. 2179 ** 2180 ** If pDbPage is an overflow page, then the first 4 bytes may store a 2181 ** pointer to a subsequent overflow page. If this is the case, then 2182 ** the pointer map needs to be updated for the subsequent overflow page. 2183 */ 2184 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 2185 rc = setChildPtrmaps(pDbPage); 2186 if( rc!=SQLITE_OK ){ 2187 return rc; 2188 } 2189 }else{ 2190 Pgno nextOvfl = get4byte(pDbPage->aData); 2191 if( nextOvfl!=0 ){ 2192 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage); 2193 if( rc!=SQLITE_OK ){ 2194 return rc; 2195 } 2196 } 2197 } 2198 2199 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 2200 ** that it points at iFreePage. 
Also fix the pointer map entry for 2201 ** iPtrPage. 2202 */ 2203 if( eType!=PTRMAP_ROOTPAGE ){ 2204 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 2205 if( rc!=SQLITE_OK ){ 2206 return rc; 2207 } 2208 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 2209 if( rc!=SQLITE_OK ){ 2210 releasePage(pPtrPage); 2211 return rc; 2212 } 2213 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 2214 releasePage(pPtrPage); 2215 if( rc==SQLITE_OK ){ 2216 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage); 2217 } 2218 } 2219 return rc; 2220 } 2221 2222 /* Forward declaration required by incrVacuumStep(). */ 2223 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 2224 2225 /* 2226 ** Perform a single step of an incremental-vacuum. If successful, 2227 ** return SQLITE_OK. If there is no work to do (and therefore no 2228 ** point in calling this function again), return SQLITE_DONE. 2229 ** 2230 ** More specificly, this function attempts to re-organize the 2231 ** database so that the last page of the file currently in use 2232 ** is no longer in use. 2233 ** 2234 ** If the nFin parameter is non-zero, the implementation assumes 2235 ** that the caller will keep calling incrVacuumStep() until 2236 ** it returns SQLITE_DONE or an error, and that nFin is the 2237 ** number of pages the database file will contain after this 2238 ** process is complete. 
*/
static int incrVacuumStep(BtShared *pBt, Pgno nFin){
  Pgno iLastPg;             /* Last page in the database */
  Pgno nFreeList;           /* Number of pages still on the free-list */

  assert( sqlite3_mutex_held(pBt->mutex) );

  /* Use the pending truncation point if one has been recorded by an
  ** earlier step, otherwise the current size of the file.
  */
  iLastPg = pBt->nTrunc;
  if( iLastPg==0 ){
    iLastPg = pagerPagecount(pBt->pPager);
  }

  /* If the last page is a pointer-map page or the pending-byte page it
  ** holds no content to move; only the truncation point is lowered.
  */
  if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
    int rc;
    u8 eType;
    Pgno iPtrPage;

    /* Finished when the free-list is empty or the target size is reached */
    nFreeList = get4byte(&pBt->pPage1->aData[36]);
    if( nFreeList==0 || nFin==iLastPg ){
      return SQLITE_DONE;
    }

    rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    /* A root page found here is reported as database corruption. */
    if( eType==PTRMAP_ROOTPAGE ){
      return SQLITE_CORRUPT_BKPT;
    }

    if( eType==PTRMAP_FREEPAGE ){
      if( nFin==0 ){
        /* Remove the page from the files free-list. This is not required
        ** if nFin is non-zero. In that case, the free-list will be
        ** truncated to zero after this function returns, so it doesn't 
        ** matter if it still contains some garbage entries.
        */
        Pgno iFreePg;
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        assert( iFreePg==iLastPg );
        releasePage(pFreePg);
      }
    } else {
      Pgno iFreePg;             /* Index of free page to move pLastPg to */
      MemPage *pLastPg;

      rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      /* If nFin is zero, this loop runs exactly once and page pLastPg
      ** is swapped with the first free page pulled off the free list.
      **
      ** On the other hand, if nFin is greater than zero, then keep
      ** looping until a free-page located within the first nFin pages
      ** of the file is found.
      */
      do {
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
        if( rc!=SQLITE_OK ){
          releasePage(pLastPg);
          return rc;
        }
        releasePage(pFreePg);
      }while( nFin!=0 && iFreePg>nFin );
      assert( iFreePg<iLastPg );
      
      /* Journal the last page, then move it into the free slot. */
      rc = sqlite3PagerWrite(pLastPg->pDbPage);
      if( rc==SQLITE_OK ){
        rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
      }
      releasePage(pLastPg);
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }
  }

  /* Page iLastPg is no longer in use.  Lower the truncation point past it
  ** and past any pointer-map or pending-byte pages directly below it.
  */
  pBt->nTrunc = iLastPg - 1;
  while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
    pBt->nTrunc--;
  }
  return SQLITE_OK;
}

/*
** A write-transaction must be opened before calling this function.
** It performs a single unit of work towards an incremental vacuum.
**
** If the incremental vacuum is finished after this function has run,
** SQLITE_DONE is returned. If it is not finished, but no error occurred,
** SQLITE_OK is returned. Otherwise an SQLite error code. 
**
** SQLITE_DONE is also returned, without doing any work, when the
** database is not in auto-vacuum mode.
*/
int sqlite3BtreeIncrVacuum(Btree *p){
  int rc;
  BtShared *pBt = p->pBt;

  sqlite3BtreeEnter(p);
  pBt->db = p->db;
  assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
  if( !pBt->autoVacuum ){
    rc = SQLITE_DONE;
  }else{
    invalidateAllOverflowCache(pBt);
    rc = incrVacuumStep(pBt, 0);
  }
  sqlite3BtreeLeave(p);
  return rc;
}

/*
** This routine is called prior to sqlite3PagerCommit when a transaction
** is committed for an auto-vacuum database.
**
** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
** the database file should be truncated to during the commit process. 
** i.e. the database has been reorganized so that only the first *pnTrunc
** pages are in use.
2362 */ 2363 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){ 2364 int rc = SQLITE_OK; 2365 Pager *pPager = pBt->pPager; 2366 #ifndef NDEBUG 2367 int nRef = sqlite3PagerRefcount(pPager); 2368 #endif 2369 2370 assert( sqlite3_mutex_held(pBt->mutex) ); 2371 invalidateAllOverflowCache(pBt); 2372 assert(pBt->autoVacuum); 2373 if( !pBt->incrVacuum ){ 2374 Pgno nFin = 0; 2375 2376 if( pBt->nTrunc==0 ){ 2377 Pgno nFree; 2378 Pgno nPtrmap; 2379 const int pgsz = pBt->pageSize; 2380 int nOrig = pagerPagecount(pBt->pPager); 2381 2382 if( PTRMAP_ISPAGE(pBt, nOrig) ){ 2383 return SQLITE_CORRUPT_BKPT; 2384 } 2385 if( nOrig==PENDING_BYTE_PAGE(pBt) ){ 2386 nOrig--; 2387 } 2388 nFree = get4byte(&pBt->pPage1->aData[36]); 2389 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5); 2390 nFin = nOrig - nFree - nPtrmap; 2391 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){ 2392 nFin--; 2393 } 2394 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 2395 nFin--; 2396 } 2397 } 2398 2399 while( rc==SQLITE_OK ){ 2400 rc = incrVacuumStep(pBt, nFin); 2401 } 2402 if( rc==SQLITE_DONE ){ 2403 assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc); 2404 rc = SQLITE_OK; 2405 if( pBt->nTrunc && nFin ){ 2406 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 2407 put4byte(&pBt->pPage1->aData[32], 0); 2408 put4byte(&pBt->pPage1->aData[36], 0); 2409 pBt->nTrunc = nFin; 2410 } 2411 } 2412 if( rc!=SQLITE_OK ){ 2413 sqlite3PagerRollback(pPager); 2414 } 2415 } 2416 2417 if( rc==SQLITE_OK ){ 2418 *pnTrunc = pBt->nTrunc; 2419 pBt->nTrunc = 0; 2420 } 2421 assert( nRef==sqlite3PagerRefcount(pPager) ); 2422 return rc; 2423 } 2424 2425 #endif 2426 2427 /* 2428 ** This routine does the first phase of a two-phase commit. 
** This routine causes a rollback journal to be created (if it does not
** already exist) and populated with enough information so that if a power
** loss occurs the database can be restored to its original state by
** playing back the journal.  Then the contents of the journal are flushed
** out to the disk.  After the journal is safely on oxide, the changes to
** the database are written into the database file and flushed to oxide.
** At the end of this call, the rollback journal still exists on the
** disk and we are still holding all locks, so the transaction has not
** committed.  See sqlite3BtreeCommit() for the second phase of the
** commit process.
**
** This call is a no-op if no write-transaction is currently active on pBt.
**
** Otherwise, sync the database file for the btree pBt. zMaster points to
** the name of a master journal file that should be written into the
** individual journal file, or is NULL, indicating no master journal file 
** (single database transaction).
**
** When this is called, the master journal should already have been
** created, populated with this journal pointer and synced to disk.
**
** Once this routine has returned, the only thing required to commit
** the write-transaction for this database file is to delete the journal.
2452 */ 2453 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){ 2454 int rc = SQLITE_OK; 2455 if( p->inTrans==TRANS_WRITE ){ 2456 BtShared *pBt = p->pBt; 2457 Pgno nTrunc = 0; 2458 sqlite3BtreeEnter(p); 2459 pBt->db = p->db; 2460 #ifndef SQLITE_OMIT_AUTOVACUUM 2461 if( pBt->autoVacuum ){ 2462 rc = autoVacuumCommit(pBt, &nTrunc); 2463 if( rc!=SQLITE_OK ){ 2464 sqlite3BtreeLeave(p); 2465 return rc; 2466 } 2467 } 2468 #endif 2469 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0); 2470 sqlite3BtreeLeave(p); 2471 } 2472 return rc; 2473 } 2474 2475 /* 2476 ** Commit the transaction currently in progress. 2477 ** 2478 ** This routine implements the second phase of a 2-phase commit. The 2479 ** sqlite3BtreeSync() routine does the first phase and should be invoked 2480 ** prior to calling this routine. The sqlite3BtreeSync() routine did 2481 ** all the work of writing information out to disk and flushing the 2482 ** contents so that they are written onto the disk platter. All this 2483 ** routine has to do is delete or truncate the rollback journal 2484 ** (which causes the transaction to commit) and drop locks. 2485 ** 2486 ** This will release the write lock on the database file. If there 2487 ** are no active cursors, it also releases the read lock. 2488 */ 2489 int sqlite3BtreeCommitPhaseTwo(Btree *p){ 2490 BtShared *pBt = p->pBt; 2491 2492 sqlite3BtreeEnter(p); 2493 pBt->db = p->db; 2494 btreeIntegrity(p); 2495 2496 /* If the handle has a write-transaction open, commit the shared-btrees 2497 ** transaction and set the shared state to TRANS_READ. 
2498 */ 2499 if( p->inTrans==TRANS_WRITE ){ 2500 int rc; 2501 assert( pBt->inTransaction==TRANS_WRITE ); 2502 assert( pBt->nTransaction>0 ); 2503 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 2504 if( rc!=SQLITE_OK ){ 2505 sqlite3BtreeLeave(p); 2506 return rc; 2507 } 2508 pBt->inTransaction = TRANS_READ; 2509 pBt->inStmt = 0; 2510 } 2511 unlockAllTables(p); 2512 2513 /* If the handle has any kind of transaction open, decrement the transaction 2514 ** count of the shared btree. If the transaction count reaches 0, set 2515 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below 2516 ** will unlock the pager. 2517 */ 2518 if( p->inTrans!=TRANS_NONE ){ 2519 pBt->nTransaction--; 2520 if( 0==pBt->nTransaction ){ 2521 pBt->inTransaction = TRANS_NONE; 2522 } 2523 } 2524 2525 /* Set the handles current transaction state to TRANS_NONE and unlock 2526 ** the pager if this call closed the only read or write transaction. 2527 */ 2528 p->inTrans = TRANS_NONE; 2529 unlockBtreeIfUnused(pBt); 2530 2531 btreeIntegrity(p); 2532 sqlite3BtreeLeave(p); 2533 return SQLITE_OK; 2534 } 2535 2536 /* 2537 ** Do both phases of a commit. 2538 */ 2539 int sqlite3BtreeCommit(Btree *p){ 2540 int rc; 2541 sqlite3BtreeEnter(p); 2542 rc = sqlite3BtreeCommitPhaseOne(p, 0); 2543 if( rc==SQLITE_OK ){ 2544 rc = sqlite3BtreeCommitPhaseTwo(p); 2545 } 2546 sqlite3BtreeLeave(p); 2547 return rc; 2548 } 2549 2550 #ifndef NDEBUG 2551 /* 2552 ** Return the number of write-cursors open on this handle. This is for use 2553 ** in assert() expressions, so it is only compiled if NDEBUG is not 2554 ** defined. 2555 ** 2556 ** For the purposes of this routine, a write-cursor is any cursor that 2557 ** is capable of writing to the databse. That means the cursor was 2558 ** originally opened for writing and the cursor has not be disabled 2559 ** by having its state changed to CURSOR_FAULT. 
2560 */ 2561 static int countWriteCursors(BtShared *pBt){ 2562 BtCursor *pCur; 2563 int r = 0; 2564 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 2565 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; 2566 } 2567 return r; 2568 } 2569 #endif 2570 2571 /* 2572 ** This routine sets the state to CURSOR_FAULT and the error 2573 ** code to errCode for every cursor on BtShared that pBtree 2574 ** references. 2575 ** 2576 ** Every cursor is tripped, including cursors that belong 2577 ** to other database connections that happen to be sharing 2578 ** the cache with pBtree. 2579 ** 2580 ** This routine gets called when a rollback occurs. 2581 ** All cursors using the same cache must be tripped 2582 ** to prevent them from trying to use the btree after 2583 ** the rollback. The rollback may have deleted tables 2584 ** or moved root pages, so it is not sufficient to 2585 ** save the state of the cursor. The cursor must be 2586 ** invalidated. 2587 */ 2588 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){ 2589 BtCursor *p; 2590 sqlite3BtreeEnter(pBtree); 2591 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 2592 clearCursorPosition(p); 2593 p->eState = CURSOR_FAULT; 2594 p->skip = errCode; 2595 } 2596 sqlite3BtreeLeave(pBtree); 2597 } 2598 2599 /* 2600 ** Rollback the transaction in progress. All cursors will be 2601 ** invalided by this operation. Any attempt to use a cursor 2602 ** that was open at the beginning of this operation will result 2603 ** in an error. 2604 ** 2605 ** This will release the write lock on the database file. If there 2606 ** are no active cursors, it also releases the read lock. 2607 */ 2608 int sqlite3BtreeRollback(Btree *p){ 2609 int rc; 2610 BtShared *pBt = p->pBt; 2611 MemPage *pPage1; 2612 2613 sqlite3BtreeEnter(p); 2614 pBt->db = p->db; 2615 rc = saveAllCursors(pBt, 0, 0); 2616 #ifndef SQLITE_OMIT_SHARED_CACHE 2617 if( rc!=SQLITE_OK ){ 2618 /* This is a horrible situation. 
An IO or malloc() error occured whilst 2619 ** trying to save cursor positions. If this is an automatic rollback (as 2620 ** the result of a constraint, malloc() failure or IO error) then 2621 ** the cache may be internally inconsistent (not contain valid trees) so 2622 ** we cannot simply return the error to the caller. Instead, abort 2623 ** all queries that may be using any of the cursors that failed to save. 2624 */ 2625 sqlite3BtreeTripAllCursors(p, rc); 2626 } 2627 #endif 2628 btreeIntegrity(p); 2629 unlockAllTables(p); 2630 2631 if( p->inTrans==TRANS_WRITE ){ 2632 int rc2; 2633 2634 #ifndef SQLITE_OMIT_AUTOVACUUM 2635 pBt->nTrunc = 0; 2636 #endif 2637 2638 assert( TRANS_WRITE==pBt->inTransaction ); 2639 rc2 = sqlite3PagerRollback(pBt->pPager); 2640 if( rc2!=SQLITE_OK ){ 2641 rc = rc2; 2642 } 2643 2644 /* The rollback may have destroyed the pPage1->aData value. So 2645 ** call sqlite3BtreeGetPage() on page 1 again to make 2646 ** sure pPage1->aData is set correctly. */ 2647 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 2648 releasePage(pPage1); 2649 } 2650 assert( countWriteCursors(pBt)==0 ); 2651 pBt->inTransaction = TRANS_READ; 2652 } 2653 2654 if( p->inTrans!=TRANS_NONE ){ 2655 assert( pBt->nTransaction>0 ); 2656 pBt->nTransaction--; 2657 if( 0==pBt->nTransaction ){ 2658 pBt->inTransaction = TRANS_NONE; 2659 } 2660 } 2661 2662 p->inTrans = TRANS_NONE; 2663 pBt->inStmt = 0; 2664 unlockBtreeIfUnused(pBt); 2665 2666 btreeIntegrity(p); 2667 sqlite3BtreeLeave(p); 2668 return rc; 2669 } 2670 2671 /* 2672 ** Start a statement subtransaction. The subtransaction can 2673 ** can be rolled back independently of the main transaction. 2674 ** You must start a transaction before starting a subtransaction. 2675 ** The subtransaction is ended automatically if the main transaction 2676 ** commits or rolls back. 2677 ** 2678 ** Only one subtransaction may be active at a time. 
It is an error to try 2679 ** to start a new subtransaction if another subtransaction is already active. 2680 ** 2681 ** Statement subtransactions are used around individual SQL statements 2682 ** that are contained within a BEGIN...COMMIT block. If a constraint 2683 ** error occurs within the statement, the effect of that one statement 2684 ** can be rolled back without having to rollback the entire transaction. 2685 */ 2686 int sqlite3BtreeBeginStmt(Btree *p){ 2687 int rc; 2688 BtShared *pBt = p->pBt; 2689 sqlite3BtreeEnter(p); 2690 pBt->db = p->db; 2691 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){ 2692 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2693 }else{ 2694 assert( pBt->inTransaction==TRANS_WRITE ); 2695 rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager); 2696 pBt->inStmt = 1; 2697 } 2698 sqlite3BtreeLeave(p); 2699 return rc; 2700 } 2701 2702 2703 /* 2704 ** Commit the statment subtransaction currently in progress. If no 2705 ** subtransaction is active, this is a no-op. 2706 */ 2707 int sqlite3BtreeCommitStmt(Btree *p){ 2708 int rc; 2709 BtShared *pBt = p->pBt; 2710 sqlite3BtreeEnter(p); 2711 pBt->db = p->db; 2712 if( pBt->inStmt && !pBt->readOnly ){ 2713 rc = sqlite3PagerStmtCommit(pBt->pPager); 2714 }else{ 2715 rc = SQLITE_OK; 2716 } 2717 pBt->inStmt = 0; 2718 sqlite3BtreeLeave(p); 2719 return rc; 2720 } 2721 2722 /* 2723 ** Rollback the active statement subtransaction. If no subtransaction 2724 ** is active this routine is a no-op. 2725 ** 2726 ** All cursors will be invalidated by this operation. Any attempt 2727 ** to use a cursor that was open at the beginning of this operation 2728 ** will result in an error. 
2729 */ 2730 int sqlite3BtreeRollbackStmt(Btree *p){ 2731 int rc = SQLITE_OK; 2732 BtShared *pBt = p->pBt; 2733 sqlite3BtreeEnter(p); 2734 pBt->db = p->db; 2735 if( pBt->inStmt && !pBt->readOnly ){ 2736 rc = sqlite3PagerStmtRollback(pBt->pPager); 2737 pBt->inStmt = 0; 2738 } 2739 sqlite3BtreeLeave(p); 2740 return rc; 2741 } 2742 2743 /* 2744 ** Create a new cursor for the BTree whose root is on the page 2745 ** iTable. The act of acquiring a cursor gets a read lock on 2746 ** the database file. 2747 ** 2748 ** If wrFlag==0, then the cursor can only be used for reading. 2749 ** If wrFlag==1, then the cursor can be used for reading or for 2750 ** writing if other conditions for writing are also met. These 2751 ** are the conditions that must be met in order for writing to 2752 ** be allowed: 2753 ** 2754 ** 1: The cursor must have been opened with wrFlag==1 2755 ** 2756 ** 2: Other database connections that share the same pager cache 2757 ** but which are not in the READ_UNCOMMITTED state may not have 2758 ** cursors open with wrFlag==0 on the same table. Otherwise 2759 ** the changes made by this write cursor would be visible to 2760 ** the read cursors in the other database connection. 2761 ** 2762 ** 3: The database must be writable (not on read-only media) 2763 ** 2764 ** 4: There must be an active transaction. 2765 ** 2766 ** No checking is done to make sure that page iTable really is the 2767 ** root page of a b-tree. If it is not, then the cursor acquired 2768 ** will not work correctly. 2769 */ 2770 static int btreeCursor( 2771 Btree *p, /* The btree */ 2772 int iTable, /* Root page of table to open */ 2773 int wrFlag, /* 1 to write. 
0 read-only */ 2774 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 2775 BtCursor *pCur /* Space for new cursor */ 2776 ){ 2777 int rc; 2778 BtShared *pBt = p->pBt; 2779 2780 assert( sqlite3BtreeHoldsMutex(p) ); 2781 if( wrFlag ){ 2782 if( pBt->readOnly ){ 2783 return SQLITE_READONLY; 2784 } 2785 if( checkReadLocks(p, iTable, 0, 0) ){ 2786 return SQLITE_LOCKED; 2787 } 2788 } 2789 2790 if( pBt->pPage1==0 ){ 2791 rc = lockBtreeWithRetry(p); 2792 if( rc!=SQLITE_OK ){ 2793 return rc; 2794 } 2795 if( pBt->readOnly && wrFlag ){ 2796 return SQLITE_READONLY; 2797 } 2798 } 2799 pCur->pgnoRoot = (Pgno)iTable; 2800 if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){ 2801 rc = SQLITE_EMPTY; 2802 goto create_cursor_exception; 2803 } 2804 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0); 2805 if( rc!=SQLITE_OK ){ 2806 goto create_cursor_exception; 2807 } 2808 2809 /* Now that no other errors can occur, finish filling in the BtCursor 2810 ** variables, link the cursor into the BtShared list and set *ppCur (the 2811 ** output argument to this function). 2812 */ 2813 pCur->pKeyInfo = pKeyInfo; 2814 pCur->pBtree = p; 2815 pCur->pBt = pBt; 2816 pCur->wrFlag = wrFlag; 2817 pCur->pNext = pBt->pCursor; 2818 if( pCur->pNext ){ 2819 pCur->pNext->pPrev = pCur; 2820 } 2821 pBt->pCursor = pCur; 2822 pCur->eState = CURSOR_INVALID; 2823 2824 return SQLITE_OK; 2825 2826 create_cursor_exception: 2827 releasePage(pCur->pPage); 2828 unlockBtreeIfUnused(pBt); 2829 return rc; 2830 } 2831 int sqlite3BtreeCursor( 2832 Btree *p, /* The btree */ 2833 int iTable, /* Root page of table to open */ 2834 int wrFlag, /* 1 to write. 
0 read-only */ 2835 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */ 2836 BtCursor *pCur /* Write new cursor here */ 2837 ){ 2838 int rc; 2839 sqlite3BtreeEnter(p); 2840 p->pBt->db = p->db; 2841 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 2842 sqlite3BtreeLeave(p); 2843 return rc; 2844 } 2845 int sqlite3BtreeCursorSize(){ 2846 return sizeof(BtCursor); 2847 } 2848 2849 2850 2851 /* 2852 ** Close a cursor. The read lock on the database file is released 2853 ** when the last cursor is closed. 2854 */ 2855 int sqlite3BtreeCloseCursor(BtCursor *pCur){ 2856 Btree *pBtree = pCur->pBtree; 2857 if( pBtree ){ 2858 BtShared *pBt = pCur->pBt; 2859 sqlite3BtreeEnter(pBtree); 2860 pBt->db = pBtree->db; 2861 clearCursorPosition(pCur); 2862 if( pCur->pPrev ){ 2863 pCur->pPrev->pNext = pCur->pNext; 2864 }else{ 2865 pBt->pCursor = pCur->pNext; 2866 } 2867 if( pCur->pNext ){ 2868 pCur->pNext->pPrev = pCur->pPrev; 2869 } 2870 releasePage(pCur->pPage); 2871 unlockBtreeIfUnused(pBt); 2872 invalidateOverflowCache(pCur); 2873 /* sqlite3_free(pCur); */ 2874 sqlite3BtreeLeave(pBtree); 2875 } 2876 return SQLITE_OK; 2877 } 2878 2879 /* 2880 ** Make a temporary cursor by filling in the fields of pTempCur. 2881 ** The temporary cursor is not on the cursor list for the Btree. 2882 */ 2883 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){ 2884 assert( cursorHoldsMutex(pCur) ); 2885 memcpy(pTempCur, pCur, sizeof(*pCur)); 2886 pTempCur->pNext = 0; 2887 pTempCur->pPrev = 0; 2888 if( pTempCur->pPage ){ 2889 sqlite3PagerRef(pTempCur->pPage->pDbPage); 2890 } 2891 } 2892 2893 /* 2894 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor() 2895 ** function above. 
2896 */ 2897 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){ 2898 assert( cursorHoldsMutex(pCur) ); 2899 if( pCur->pPage ){ 2900 sqlite3PagerUnref(pCur->pPage->pDbPage); 2901 } 2902 } 2903 2904 /* 2905 ** Make sure the BtCursor* given in the argument has a valid 2906 ** BtCursor.info structure. If it is not already valid, call 2907 ** sqlite3BtreeParseCell() to fill it in. 2908 ** 2909 ** BtCursor.info is a cache of the information in the current cell. 2910 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell(). 2911 ** 2912 ** 2007-06-25: There is a bug in some versions of MSVC that cause the 2913 ** compiler to crash when getCellInfo() is implemented as a macro. 2914 ** But there is a measureable speed advantage to using the macro on gcc 2915 ** (when less compiler optimizations like -Os or -O0 are used and the 2916 ** compiler is not doing agressive inlining.) So we use a real function 2917 ** for MSVC and a macro for everything else. Ticket #2457. 2918 */ 2919 #ifndef NDEBUG 2920 static void assertCellInfo(BtCursor *pCur){ 2921 CellInfo info; 2922 memset(&info, 0, sizeof(info)); 2923 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &info); 2924 assert( memcmp(&info, &pCur->info, sizeof(info))==0 ); 2925 } 2926 #else 2927 #define assertCellInfo(x) 2928 #endif 2929 #ifdef _MSC_VER 2930 /* Use a real function in MSVC to work around bugs in that compiler. 
*/ 2931 static void getCellInfo(BtCursor *pCur){ 2932 if( pCur->info.nSize==0 ){ 2933 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); 2934 pCur->validNKey = 1; 2935 }else{ 2936 assertCellInfo(pCur); 2937 } 2938 } 2939 #else /* if not _MSC_VER */ 2940 /* Use a macro in all other compilers so that the function is inlined */ 2941 #define getCellInfo(pCur) \ 2942 if( pCur->info.nSize==0 ){ \ 2943 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); \ 2944 pCur->validNKey = 1; \ 2945 }else{ \ 2946 assertCellInfo(pCur); \ 2947 } 2948 #endif /* _MSC_VER */ 2949 2950 /* 2951 ** Set *pSize to the size of the buffer needed to hold the value of 2952 ** the key for the current entry. If the cursor is not pointing 2953 ** to a valid entry, *pSize is set to 0. 2954 ** 2955 ** For a table with the INTKEY flag set, this routine returns the key 2956 ** itself, not the number of bytes in the key. 2957 */ 2958 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ 2959 int rc; 2960 2961 assert( cursorHoldsMutex(pCur) ); 2962 rc = restoreCursorPosition(pCur); 2963 if( rc==SQLITE_OK ){ 2964 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); 2965 if( pCur->eState==CURSOR_INVALID ){ 2966 *pSize = 0; 2967 }else{ 2968 getCellInfo(pCur); 2969 *pSize = pCur->info.nKey; 2970 } 2971 } 2972 return rc; 2973 } 2974 2975 /* 2976 ** Set *pSize to the number of bytes of data in the entry the 2977 ** cursor currently points to. Always return SQLITE_OK. 2978 ** Failure is not possible. If the cursor is not currently 2979 ** pointing to an entry (which can happen, for example, if 2980 ** the database is empty) then *pSize is set to 0. 
2981 */ 2982 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ 2983 int rc; 2984 2985 assert( cursorHoldsMutex(pCur) ); 2986 rc = restoreCursorPosition(pCur); 2987 if( rc==SQLITE_OK ){ 2988 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); 2989 if( pCur->eState==CURSOR_INVALID ){ 2990 /* Not pointing at a valid entry - set *pSize to 0. */ 2991 *pSize = 0; 2992 }else{ 2993 getCellInfo(pCur); 2994 *pSize = pCur->info.nData; 2995 } 2996 } 2997 return rc; 2998 } 2999 3000 /* 3001 ** Given the page number of an overflow page in the database (parameter 3002 ** ovfl), this function finds the page number of the next page in the 3003 ** linked list of overflow pages. If possible, it uses the auto-vacuum 3004 ** pointer-map data instead of reading the content of page ovfl to do so. 3005 ** 3006 ** If an error occurs an SQLite error code is returned. Otherwise: 3007 ** 3008 ** Unless pPgnoNext is NULL, the page number of the next overflow 3009 ** page in the linked list is written to *pPgnoNext. If page ovfl 3010 ** is the last page in its linked list, *pPgnoNext is set to zero. 3011 ** 3012 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle 3013 ** for page ovfl. The underlying pager page may have been requested 3014 ** with the noContent flag set, so the page data accessable via 3015 ** this handle may not be trusted. 3016 */ 3017 static int getOverflowPage( 3018 BtShared *pBt, 3019 Pgno ovfl, /* Overflow page */ 3020 MemPage **ppPage, /* OUT: MemPage handle */ 3021 Pgno *pPgnoNext /* OUT: Next overflow page number */ 3022 ){ 3023 Pgno next = 0; 3024 int rc; 3025 3026 assert( sqlite3_mutex_held(pBt->mutex) ); 3027 /* One of these must not be NULL. Otherwise, why call this function? */ 3028 assert(ppPage || pPgnoNext); 3029 3030 /* If pPgnoNext is NULL, then this function is being called to obtain 3031 ** a MemPage* reference only. No page-data is required in this case. 
3032 */ 3033 if( !pPgnoNext ){ 3034 return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1); 3035 } 3036 3037 #ifndef SQLITE_OMIT_AUTOVACUUM 3038 /* Try to find the next page in the overflow list using the 3039 ** autovacuum pointer-map pages. Guess that the next page in 3040 ** the overflow list is page number (ovfl+1). If that guess turns 3041 ** out to be wrong, fall back to loading the data of page 3042 ** number ovfl to determine the next page number. 3043 */ 3044 if( pBt->autoVacuum ){ 3045 Pgno pgno; 3046 Pgno iGuess = ovfl+1; 3047 u8 eType; 3048 3049 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 3050 iGuess++; 3051 } 3052 3053 if( iGuess<=pagerPagecount(pBt->pPager) ){ 3054 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 3055 if( rc!=SQLITE_OK ){ 3056 return rc; 3057 } 3058 if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 3059 next = iGuess; 3060 } 3061 } 3062 } 3063 #endif 3064 3065 if( next==0 || ppPage ){ 3066 MemPage *pPage = 0; 3067 3068 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0); 3069 assert(rc==SQLITE_OK || pPage==0); 3070 if( next==0 && rc==SQLITE_OK ){ 3071 next = get4byte(pPage->aData); 3072 } 3073 3074 if( ppPage ){ 3075 *ppPage = pPage; 3076 }else{ 3077 releasePage(pPage); 3078 } 3079 } 3080 *pPgnoNext = next; 3081 3082 return rc; 3083 } 3084 3085 /* 3086 ** Copy data from a buffer to a page, or from a page to a buffer. 3087 ** 3088 ** pPayload is a pointer to data stored on database page pDbPage. 3089 ** If argument eOp is false, then nByte bytes of data are copied 3090 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, 3091 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 3092 ** of data are copied from the buffer pBuf to pPayload. 3093 ** 3094 ** SQLITE_OK is returned on success, otherwise an error code. 
3095 */ 3096 static int copyPayload( 3097 void *pPayload, /* Pointer to page data */ 3098 void *pBuf, /* Pointer to buffer */ 3099 int nByte, /* Number of bytes to copy */ 3100 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 3101 DbPage *pDbPage /* Page containing pPayload */ 3102 ){ 3103 if( eOp ){ 3104 /* Copy data from buffer to page (a write operation) */ 3105 int rc = sqlite3PagerWrite(pDbPage); 3106 if( rc!=SQLITE_OK ){ 3107 return rc; 3108 } 3109 memcpy(pPayload, pBuf, nByte); 3110 }else{ 3111 /* Copy data from page to buffer (a read operation) */ 3112 memcpy(pBuf, pPayload, nByte); 3113 } 3114 return SQLITE_OK; 3115 } 3116 3117 /* 3118 ** This function is used to read or overwrite payload information 3119 ** for the entry that the pCur cursor is pointing to. If the eOp 3120 ** parameter is 0, this is a read operation (data copied into 3121 ** buffer pBuf). If it is non-zero, a write (data copied from 3122 ** buffer pBuf). 3123 ** 3124 ** A total of "amt" bytes are read or written beginning at "offset". 3125 ** Data is read to or from the buffer pBuf. 3126 ** 3127 ** This routine does not make a distinction between key and data. 3128 ** It just reads or writes bytes from the payload area. Data might 3129 ** appear on the main page or be scattered out on multiple overflow 3130 ** pages. 3131 ** 3132 ** If the BtCursor.isIncrblobHandle flag is set, and the current 3133 ** cursor entry uses one or more overflow pages, this function 3134 ** allocates space for and lazily popluates the overflow page-list 3135 ** cache array (BtCursor.aOverflow). Subsequent calls use this 3136 ** cache to make seeking to the supplied offset more efficient. 3137 ** 3138 ** Once an overflow page-list cache has been allocated, it may be 3139 ** invalidated if some other cursor writes to the same table, or if 3140 ** the cursor is moved to a different row. Additionally, in auto-vacuum 3141 ** mode, the following events may invalidate an overflow page-list cache. 
3142 ** 3143 ** * An incremental vacuum, 3144 ** * A commit in auto_vacuum="full" mode, 3145 ** * Creating a table (may require moving an overflow page). 3146 */ 3147 static int accessPayload( 3148 BtCursor *pCur, /* Cursor pointing to entry to read from */ 3149 int offset, /* Begin reading this far into payload */ 3150 int amt, /* Read this many bytes */ 3151 unsigned char *pBuf, /* Write the bytes into this buffer */ 3152 int skipKey, /* offset begins at data if this is true */ 3153 int eOp /* zero to read. non-zero to write. */ 3154 ){ 3155 unsigned char *aPayload; 3156 int rc = SQLITE_OK; 3157 u32 nKey; 3158 int iIdx = 0; 3159 MemPage *pPage = pCur->pPage; /* Btree page of current cursor entry */ 3160 BtShared *pBt; /* Btree this cursor belongs to */ 3161 3162 assert( pPage ); 3163 assert( pCur->eState==CURSOR_VALID ); 3164 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 3165 assert( offset>=0 ); 3166 assert( cursorHoldsMutex(pCur) ); 3167 3168 getCellInfo(pCur); 3169 aPayload = pCur->info.pCell + pCur->info.nHeader; 3170 nKey = (pPage->intKey ? 0 : pCur->info.nKey); 3171 3172 if( skipKey ){ 3173 offset += nKey; 3174 } 3175 if( offset+amt > nKey+pCur->info.nData ){ 3176 /* Trying to read or write past the end of the data is an error */ 3177 return SQLITE_ERROR; 3178 } 3179 3180 /* Check if data must be read/written to/from the btree page itself. 
*/ 3181 if( offset<pCur->info.nLocal ){ 3182 int a = amt; 3183 if( a+offset>pCur->info.nLocal ){ 3184 a = pCur->info.nLocal - offset; 3185 } 3186 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); 3187 offset = 0; 3188 pBuf += a; 3189 amt -= a; 3190 }else{ 3191 offset -= pCur->info.nLocal; 3192 } 3193 3194 pBt = pCur->pBt; 3195 if( rc==SQLITE_OK && amt>0 ){ 3196 const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ 3197 Pgno nextPage; 3198 3199 nextPage = get4byte(&aPayload[pCur->info.nLocal]); 3200 3201 #ifndef SQLITE_OMIT_INCRBLOB 3202 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[] 3203 ** has not been allocated, allocate it now. The array is sized at 3204 ** one entry for each overflow page in the overflow chain. The 3205 ** page number of the first overflow page is stored in aOverflow[0], 3206 ** etc. A value of 0 in the aOverflow[] array means "not yet known" 3207 ** (the cache is lazily populated). 3208 */ 3209 if( pCur->isIncrblobHandle && !pCur->aOverflow ){ 3210 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; 3211 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl); 3212 if( nOvfl && !pCur->aOverflow ){ 3213 rc = SQLITE_NOMEM; 3214 } 3215 } 3216 3217 /* If the overflow page-list cache has been allocated and the 3218 ** entry for the first required overflow page is valid, skip 3219 ** directly to it. 3220 */ 3221 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){ 3222 iIdx = (offset/ovflSize); 3223 nextPage = pCur->aOverflow[iIdx]; 3224 offset = (offset%ovflSize); 3225 } 3226 #endif 3227 3228 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){ 3229 3230 #ifndef SQLITE_OMIT_INCRBLOB 3231 /* If required, populate the overflow page-list cache. 
*/ 3232 if( pCur->aOverflow ){ 3233 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage); 3234 pCur->aOverflow[iIdx] = nextPage; 3235 } 3236 #endif 3237 3238 if( offset>=ovflSize ){ 3239 /* The only reason to read this page is to obtain the page 3240 ** number for the next page in the overflow chain. The page 3241 ** data is not required. So first try to lookup the overflow 3242 ** page-list cache, if any, then fall back to the getOverflowPage() 3243 ** function. 3244 */ 3245 #ifndef SQLITE_OMIT_INCRBLOB 3246 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){ 3247 nextPage = pCur->aOverflow[iIdx+1]; 3248 } else 3249 #endif 3250 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); 3251 offset -= ovflSize; 3252 }else{ 3253 /* Need to read this page properly. It contains some of the 3254 ** range of data that is being read (eOp==0) or written (eOp!=0). 3255 */ 3256 DbPage *pDbPage; 3257 int a = amt; 3258 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage); 3259 if( rc==SQLITE_OK ){ 3260 aPayload = sqlite3PagerGetData(pDbPage); 3261 nextPage = get4byte(aPayload); 3262 if( a + offset > ovflSize ){ 3263 a = ovflSize - offset; 3264 } 3265 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); 3266 sqlite3PagerUnref(pDbPage); 3267 offset = 0; 3268 amt -= a; 3269 pBuf += a; 3270 } 3271 } 3272 } 3273 } 3274 3275 if( rc==SQLITE_OK && amt>0 ){ 3276 return SQLITE_CORRUPT_BKPT; 3277 } 3278 return rc; 3279 } 3280 3281 /* 3282 ** Read part of the key associated with cursor pCur. Exactly 3283 ** "amt" bytes will be transfered into pBuf[]. The transfer 3284 ** begins at "offset". 3285 ** 3286 ** Return SQLITE_OK on success or an error code if anything goes 3287 ** wrong. An error is returned if "offset+amt" is larger than 3288 ** the available payload. 
3289 */ 3290 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3291 int rc; 3292 3293 assert( cursorHoldsMutex(pCur) ); 3294 rc = restoreCursorPosition(pCur); 3295 if( rc==SQLITE_OK ){ 3296 assert( pCur->eState==CURSOR_VALID ); 3297 assert( pCur->pPage!=0 ); 3298 if( pCur->pPage->intKey ){ 3299 return SQLITE_CORRUPT_BKPT; 3300 } 3301 assert( pCur->pPage->intKey==0 ); 3302 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 3303 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0); 3304 } 3305 return rc; 3306 } 3307 3308 /* 3309 ** Read part of the data associated with cursor pCur. Exactly 3310 ** "amt" bytes will be transfered into pBuf[]. The transfer 3311 ** begins at "offset". 3312 ** 3313 ** Return SQLITE_OK on success or an error code if anything goes 3314 ** wrong. An error is returned if "offset+amt" is larger than 3315 ** the available payload. 3316 */ 3317 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3318 int rc; 3319 3320 #ifndef SQLITE_OMIT_INCRBLOB 3321 if ( pCur->eState==CURSOR_INVALID ){ 3322 return SQLITE_ABORT; 3323 } 3324 #endif 3325 3326 assert( cursorHoldsMutex(pCur) ); 3327 rc = restoreCursorPosition(pCur); 3328 if( rc==SQLITE_OK ){ 3329 assert( pCur->eState==CURSOR_VALID ); 3330 assert( pCur->pPage!=0 ); 3331 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 3332 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0); 3333 } 3334 return rc; 3335 } 3336 3337 /* 3338 ** Return a pointer to payload information from the entry that the 3339 ** pCur cursor is pointing to. The pointer is to the beginning of 3340 ** the key if skipKey==0 and it points to the beginning of data if 3341 ** skipKey==1. The number of bytes of available key/data is written 3342 ** into *pAmt. If *pAmt==0, then the value returned will not be 3343 ** a valid pointer. 3344 ** 3345 ** This routine is an optimization. 
It is common for the entire key 3346 ** and data to fit on the local page and for there to be no overflow 3347 ** pages. When that is so, this routine can be used to access the 3348 ** key and data without making a copy. If the key and/or data spills 3349 ** onto overflow pages, then accessPayload() must be used to reassembly 3350 ** the key/data and copy it into a preallocated buffer. 3351 ** 3352 ** The pointer returned by this routine looks directly into the cached 3353 ** page of the database. The data might change or move the next time 3354 ** any btree routine is called. 3355 */ 3356 static const unsigned char *fetchPayload( 3357 BtCursor *pCur, /* Cursor pointing to entry to read from */ 3358 int *pAmt, /* Write the number of available bytes here */ 3359 int skipKey /* read beginning at data if this is true */ 3360 ){ 3361 unsigned char *aPayload; 3362 MemPage *pPage; 3363 u32 nKey; 3364 int nLocal; 3365 3366 assert( pCur!=0 && pCur->pPage!=0 ); 3367 assert( pCur->eState==CURSOR_VALID ); 3368 assert( cursorHoldsMutex(pCur) ); 3369 pPage = pCur->pPage; 3370 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 3371 getCellInfo(pCur); 3372 aPayload = pCur->info.pCell; 3373 aPayload += pCur->info.nHeader; 3374 if( pPage->intKey ){ 3375 nKey = 0; 3376 }else{ 3377 nKey = pCur->info.nKey; 3378 } 3379 if( skipKey ){ 3380 aPayload += nKey; 3381 nLocal = pCur->info.nLocal - nKey; 3382 }else{ 3383 nLocal = pCur->info.nLocal; 3384 if( nLocal>nKey ){ 3385 nLocal = nKey; 3386 } 3387 } 3388 *pAmt = nLocal; 3389 return aPayload; 3390 } 3391 3392 3393 /* 3394 ** For the entry that cursor pCur is point to, return as 3395 ** many bytes of the key or data as are available on the local 3396 ** b-tree page. Write the number of available bytes into *pAmt. 3397 ** 3398 ** The pointer returned is ephemeral. The key/data may move 3399 ** or be destroyed on the next call to any Btree routine, 3400 ** including calls from other threads against the same cache. 
3401 ** Hence, a mutex on the BtShared should be held prior to calling 3402 ** this routine. 3403 ** 3404 ** These routines is used to get quick access to key and data 3405 ** in the common case where no overflow pages are used. 3406 */ 3407 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){ 3408 assert( cursorHoldsMutex(pCur) ); 3409 if( pCur->eState==CURSOR_VALID ){ 3410 return (const void*)fetchPayload(pCur, pAmt, 0); 3411 } 3412 return 0; 3413 } 3414 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){ 3415 assert( cursorHoldsMutex(pCur) ); 3416 if( pCur->eState==CURSOR_VALID ){ 3417 return (const void*)fetchPayload(pCur, pAmt, 1); 3418 } 3419 return 0; 3420 } 3421 3422 3423 /* 3424 ** Move the cursor down to a new child page. The newPgno argument is the 3425 ** page number of the child page to move to. 3426 */ 3427 static int moveToChild(BtCursor *pCur, u32 newPgno){ 3428 int rc; 3429 MemPage *pNewPage; 3430 MemPage *pOldPage; 3431 BtShared *pBt = pCur->pBt; 3432 3433 assert( cursorHoldsMutex(pCur) ); 3434 assert( pCur->eState==CURSOR_VALID ); 3435 rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage); 3436 if( rc ) return rc; 3437 pNewPage->idxParent = pCur->idx; 3438 pOldPage = pCur->pPage; 3439 pOldPage->idxShift = 0; 3440 releasePage(pOldPage); 3441 pCur->pPage = pNewPage; 3442 pCur->idx = 0; 3443 pCur->info.nSize = 0; 3444 pCur->validNKey = 0; 3445 if( pNewPage->nCell<1 ){ 3446 return SQLITE_CORRUPT_BKPT; 3447 } 3448 return SQLITE_OK; 3449 } 3450 3451 /* 3452 ** Return true if the page is the virtual root of its table. 3453 ** 3454 ** The virtual root page is the root page for most tables. But 3455 ** for the table rooted on page 1, sometime the real root page 3456 ** is empty except for the right-pointer. In such cases the 3457 ** virtual root page is the page that the right-pointer of page 3458 ** 1 is pointing to. 
3459 */ 3460 int sqlite3BtreeIsRootPage(MemPage *pPage){ 3461 MemPage *pParent; 3462 3463 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 3464 pParent = pPage->pParent; 3465 if( pParent==0 ) return 1; 3466 if( pParent->pgno>1 ) return 0; 3467 if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1; 3468 return 0; 3469 } 3470 3471 /* 3472 ** Move the cursor up to the parent page. 3473 ** 3474 ** pCur->idx is set to the cell index that contains the pointer 3475 ** to the page we are coming from. If we are coming from the 3476 ** right-most child page then pCur->idx is set to one more than 3477 ** the largest cell index. 3478 */ 3479 void sqlite3BtreeMoveToParent(BtCursor *pCur){ 3480 MemPage *pParent; 3481 MemPage *pPage; 3482 int idxParent; 3483 3484 assert( cursorHoldsMutex(pCur) ); 3485 assert( pCur->eState==CURSOR_VALID ); 3486 pPage = pCur->pPage; 3487 assert( pPage!=0 ); 3488 assert( !sqlite3BtreeIsRootPage(pPage) ); 3489 pParent = pPage->pParent; 3490 assert( pParent!=0 ); 3491 idxParent = pPage->idxParent; 3492 sqlite3PagerRef(pParent->pDbPage); 3493 releasePage(pPage); 3494 pCur->pPage = pParent; 3495 pCur->info.nSize = 0; 3496 pCur->validNKey = 0; 3497 assert( pParent->idxShift==0 ); 3498 pCur->idx = idxParent; 3499 } 3500 3501 /* 3502 ** Move the cursor to the root page 3503 */ 3504 static int moveToRoot(BtCursor *pCur){ 3505 MemPage *pRoot; 3506 int rc = SQLITE_OK; 3507 Btree *p = pCur->pBtree; 3508 BtShared *pBt = p->pBt; 3509 3510 assert( cursorHoldsMutex(pCur) ); 3511 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 3512 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 3513 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 3514 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 3515 if( pCur->eState==CURSOR_FAULT ){ 3516 return pCur->skip; 3517 } 3518 clearCursorPosition(pCur); 3519 } 3520 pRoot = pCur->pPage; 3521 if( pRoot && pRoot->pgno==pCur->pgnoRoot ){ 3522 assert( pRoot->isInit ); 3523 }else{ 3524 if( 3525 SQLITE_OK!=(rc = getAndInitPage(pBt, 
pCur->pgnoRoot, &pRoot, 0)) 3526 ){ 3527 pCur->eState = CURSOR_INVALID; 3528 return rc; 3529 } 3530 releasePage(pCur->pPage); 3531 pCur->pPage = pRoot; 3532 } 3533 pCur->idx = 0; 3534 pCur->info.nSize = 0; 3535 pCur->atLast = 0; 3536 pCur->validNKey = 0; 3537 if( pRoot->nCell==0 && !pRoot->leaf ){ 3538 Pgno subpage; 3539 assert( pRoot->pgno==1 ); 3540 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 3541 assert( subpage>0 ); 3542 pCur->eState = CURSOR_VALID; 3543 rc = moveToChild(pCur, subpage); 3544 } 3545 pCur->eState = ((pCur->pPage->nCell>0)?CURSOR_VALID:CURSOR_INVALID); 3546 return rc; 3547 } 3548 3549 /* 3550 ** Move the cursor down to the left-most leaf entry beneath the 3551 ** entry to which it is currently pointing. 3552 ** 3553 ** The left-most leaf is the one with the smallest key - the first 3554 ** in ascending order. 3555 */ 3556 static int moveToLeftmost(BtCursor *pCur){ 3557 Pgno pgno; 3558 int rc = SQLITE_OK; 3559 MemPage *pPage; 3560 3561 assert( cursorHoldsMutex(pCur) ); 3562 assert( pCur->eState==CURSOR_VALID ); 3563 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ 3564 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 3565 pgno = get4byte(findCell(pPage, pCur->idx)); 3566 rc = moveToChild(pCur, pgno); 3567 } 3568 return rc; 3569 } 3570 3571 /* 3572 ** Move the cursor down to the right-most leaf entry beneath the 3573 ** page to which it is currently pointing. Notice the difference 3574 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 3575 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 3576 ** finds the right-most entry beneath the *page*. 3577 ** 3578 ** The right-most entry is the one with the largest key - the last 3579 ** key in ascending order. 
3580 */ 3581 static int moveToRightmost(BtCursor *pCur){ 3582 Pgno pgno; 3583 int rc = SQLITE_OK; 3584 MemPage *pPage; 3585 3586 assert( cursorHoldsMutex(pCur) ); 3587 assert( pCur->eState==CURSOR_VALID ); 3588 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ 3589 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 3590 pCur->idx = pPage->nCell; 3591 rc = moveToChild(pCur, pgno); 3592 } 3593 if( rc==SQLITE_OK ){ 3594 pCur->idx = pPage->nCell - 1; 3595 pCur->info.nSize = 0; 3596 pCur->validNKey = 0; 3597 } 3598 return SQLITE_OK; 3599 } 3600 3601 /* Move the cursor to the first entry in the table. Return SQLITE_OK 3602 ** on success. Set *pRes to 0 if the cursor actually points to something 3603 ** or set *pRes to 1 if the table is empty. 3604 */ 3605 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 3606 int rc; 3607 3608 assert( cursorHoldsMutex(pCur) ); 3609 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 3610 rc = moveToRoot(pCur); 3611 if( rc==SQLITE_OK ){ 3612 if( pCur->eState==CURSOR_INVALID ){ 3613 assert( pCur->pPage->nCell==0 ); 3614 *pRes = 1; 3615 rc = SQLITE_OK; 3616 }else{ 3617 assert( pCur->pPage->nCell>0 ); 3618 *pRes = 0; 3619 rc = moveToLeftmost(pCur); 3620 } 3621 } 3622 return rc; 3623 } 3624 3625 /* Move the cursor to the last entry in the table. Return SQLITE_OK 3626 ** on success. Set *pRes to 0 if the cursor actually points to something 3627 ** or set *pRes to 1 if the table is empty. 
3628 */ 3629 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 3630 int rc; 3631 3632 assert( cursorHoldsMutex(pCur) ); 3633 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 3634 rc = moveToRoot(pCur); 3635 if( rc==SQLITE_OK ){ 3636 if( CURSOR_INVALID==pCur->eState ){ 3637 assert( pCur->pPage->nCell==0 ); 3638 *pRes = 1; 3639 }else{ 3640 assert( pCur->eState==CURSOR_VALID ); 3641 *pRes = 0; 3642 rc = moveToRightmost(pCur); 3643 getCellInfo(pCur); 3644 pCur->atLast = rc==SQLITE_OK; 3645 } 3646 } 3647 return rc; 3648 } 3649 3650 /* Move the cursor so that it points to an entry near the key 3651 ** specified by pKey/nKey/pUnKey. Return a success code. 3652 ** 3653 ** For INTKEY tables, only the nKey parameter is used. pKey 3654 ** and pUnKey must be NULL. For index tables, either pUnKey 3655 ** must point to a key that has already been unpacked, or else 3656 ** pKey/nKey describes a blob containing the key. 3657 ** 3658 ** If an exact match is not found, then the cursor is always 3659 ** left pointing at a leaf page which would hold the entry if it 3660 ** were present. The cursor might point to an entry that comes 3661 ** before or after the key. 3662 ** 3663 ** The result of comparing the key with the entry to which the 3664 ** cursor is written to *pRes if pRes!=NULL. The meaning of 3665 ** this value is as follows: 3666 ** 3667 ** *pRes<0 The cursor is left pointing at an entry that 3668 ** is smaller than pKey or if the table is empty 3669 ** and the cursor is therefore left point to nothing. 3670 ** 3671 ** *pRes==0 The cursor is left pointing at an entry that 3672 ** exactly matches pKey. 3673 ** 3674 ** *pRes>0 The cursor is left pointing at an entry that 3675 ** is larger than pKey. 3676 ** 3677 */ 3678 int sqlite3BtreeMoveto( 3679 BtCursor *pCur, /* The cursor to be moved */ 3680 const void *pKey, /* The key content for indices. Not used by tables */ 3681 UnpackedRecord *pUnKey,/* Unpacked version of pKey */ 3682 i64 nKey, /* Size of pKey. 
Or the key for tables */ 3683 int biasRight, /* If true, bias the search to the high end */ 3684 int *pRes /* Search result flag */ 3685 ){ 3686 int rc; 3687 char aSpace[200]; 3688 3689 assert( cursorHoldsMutex(pCur) ); 3690 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 3691 3692 /* If the cursor is already positioned at the point we are trying 3693 ** to move to, then just return without doing any work */ 3694 if( pCur->eState==CURSOR_VALID && pCur->validNKey && pCur->pPage->intKey ){ 3695 if( pCur->info.nKey==nKey ){ 3696 *pRes = 0; 3697 return SQLITE_OK; 3698 } 3699 if( pCur->atLast && pCur->info.nKey<nKey ){ 3700 *pRes = -1; 3701 return SQLITE_OK; 3702 } 3703 } 3704 3705 3706 rc = moveToRoot(pCur); 3707 if( rc ){ 3708 return rc; 3709 } 3710 assert( pCur->pPage ); 3711 assert( pCur->pPage->isInit ); 3712 if( pCur->eState==CURSOR_INVALID ){ 3713 *pRes = -1; 3714 assert( pCur->pPage->nCell==0 ); 3715 return SQLITE_OK; 3716 } 3717 if( pCur->pPage->intKey ){ 3718 /* We are given an SQL table to search. The key is the integer 3719 ** rowid contained in nKey. pKey and pUnKey should both be NULL */ 3720 assert( pUnKey==0 ); 3721 assert( pKey==0 ); 3722 }else if( pUnKey==0 ){ 3723 /* We are to search an SQL index using a key encoded as a blob. 3724 ** The blob is found at pKey and is nKey bytes in length. Unpack 3725 ** this key so that we can use it. */ 3726 assert( pKey!=0 ); 3727 pUnKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey, 3728 aSpace, sizeof(aSpace)); 3729 if( pUnKey==0 ) return SQLITE_NOMEM; 3730 }else{ 3731 /* We are to search an SQL index using a key that is already unpacked 3732 ** and handed to us in pUnKey. 
*/ 3733 assert( pKey==0 ); 3734 } 3735 for(;;){ 3736 int lwr, upr; 3737 Pgno chldPg; 3738 MemPage *pPage = pCur->pPage; 3739 int c = -1; /* pRes return if table is empty must be -1 */ 3740 lwr = 0; 3741 upr = pPage->nCell-1; 3742 if( !pPage->intKey && pUnKey==0 ){ 3743 rc = SQLITE_CORRUPT_BKPT; 3744 goto moveto_finish; 3745 } 3746 if( biasRight ){ 3747 pCur->idx = upr; 3748 }else{ 3749 pCur->idx = (upr+lwr)/2; 3750 } 3751 if( lwr<=upr ) for(;;){ 3752 void *pCellKey; 3753 i64 nCellKey; 3754 pCur->info.nSize = 0; 3755 pCur->validNKey = 1; 3756 if( pPage->intKey ){ 3757 u8 *pCell; 3758 pCell = findCell(pPage, pCur->idx) + pPage->childPtrSize; 3759 if( pPage->hasData ){ 3760 u32 dummy; 3761 pCell += getVarint32(pCell, dummy); 3762 } 3763 getVarint(pCell, (u64*)&nCellKey); 3764 if( nCellKey==nKey ){ 3765 c = 0; 3766 }else if( nCellKey<nKey ){ 3767 c = -1; 3768 }else{ 3769 assert( nCellKey>nKey ); 3770 c = +1; 3771 } 3772 }else{ 3773 int available; 3774 pCellKey = (void *)fetchPayload(pCur, &available, 0); 3775 nCellKey = pCur->info.nKey; 3776 if( available>=nCellKey ){ 3777 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pUnKey); 3778 }else{ 3779 pCellKey = sqlite3Malloc( nCellKey ); 3780 if( pCellKey==0 ){ 3781 rc = SQLITE_NOMEM; 3782 goto moveto_finish; 3783 } 3784 rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey); 3785 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pUnKey); 3786 sqlite3_free(pCellKey); 3787 if( rc ) goto moveto_finish; 3788 } 3789 } 3790 if( c==0 ){ 3791 pCur->info.nKey = nCellKey; 3792 if( pPage->intKey && !pPage->leaf ){ 3793 lwr = pCur->idx; 3794 upr = lwr - 1; 3795 break; 3796 }else{ 3797 if( pRes ) *pRes = 0; 3798 rc = SQLITE_OK; 3799 goto moveto_finish; 3800 } 3801 } 3802 if( c<0 ){ 3803 lwr = pCur->idx+1; 3804 }else{ 3805 upr = pCur->idx-1; 3806 } 3807 if( lwr>upr ){ 3808 pCur->info.nKey = nCellKey; 3809 break; 3810 } 3811 pCur->idx = (lwr+upr)/2; 3812 } 3813 assert( lwr==upr+1 ); 3814 assert( pPage->isInit ); 3815 if( pPage->leaf 
){ 3816 chldPg = 0; 3817 }else if( lwr>=pPage->nCell ){ 3818 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 3819 }else{ 3820 chldPg = get4byte(findCell(pPage, lwr)); 3821 } 3822 if( chldPg==0 ){ 3823 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 3824 if( pRes ) *pRes = c; 3825 rc = SQLITE_OK; 3826 goto moveto_finish; 3827 } 3828 pCur->idx = lwr; 3829 pCur->info.nSize = 0; 3830 pCur->validNKey = 0; 3831 rc = moveToChild(pCur, chldPg); 3832 if( rc ) goto moveto_finish; 3833 } 3834 moveto_finish: 3835 if( pKey ){ 3836 /* If we created our own unpacked key at the top of this 3837 ** procedure, then destroy that key before returning. */ 3838 sqlite3VdbeDeleteUnpackedRecord(pUnKey); 3839 } 3840 return rc; 3841 } 3842 3843 3844 /* 3845 ** Return TRUE if the cursor is not pointing at an entry of the table. 3846 ** 3847 ** TRUE will be returned after a call to sqlite3BtreeNext() moves 3848 ** past the last entry in the table or sqlite3BtreePrev() moves past 3849 ** the first entry. TRUE is also returned if the table is empty. 3850 */ 3851 int sqlite3BtreeEof(BtCursor *pCur){ 3852 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 3853 ** have been deleted? This API will need to change to return an error code 3854 ** as well as the boolean result value. 3855 */ 3856 return (CURSOR_VALID!=pCur->eState); 3857 } 3858 3859 /* 3860 ** Return the database connection handle for a cursor. 3861 */ 3862 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){ 3863 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 3864 return pCur->pBtree->db; 3865 } 3866 3867 /* 3868 ** Advance the cursor to the next entry in the database. If 3869 ** successful then set *pRes=0. If the cursor 3870 ** was already pointing to the last entry in the database before 3871 ** this routine was called, then set *pRes=1. 
3872 */ 3873 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ 3874 int rc; 3875 MemPage *pPage; 3876 3877 assert( cursorHoldsMutex(pCur) ); 3878 rc = restoreCursorPosition(pCur); 3879 if( rc!=SQLITE_OK ){ 3880 return rc; 3881 } 3882 assert( pRes!=0 ); 3883 pPage = pCur->pPage; 3884 if( CURSOR_INVALID==pCur->eState ){ 3885 *pRes = 1; 3886 return SQLITE_OK; 3887 } 3888 if( pCur->skip>0 ){ 3889 pCur->skip = 0; 3890 *pRes = 0; 3891 return SQLITE_OK; 3892 } 3893 pCur->skip = 0; 3894 3895 assert( pPage->isInit ); 3896 assert( pCur->idx<pPage->nCell ); 3897 3898 pCur->idx++; 3899 pCur->info.nSize = 0; 3900 pCur->validNKey = 0; 3901 if( pCur->idx>=pPage->nCell ){ 3902 if( !pPage->leaf ){ 3903 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 3904 if( rc ) return rc; 3905 rc = moveToLeftmost(pCur); 3906 *pRes = 0; 3907 return rc; 3908 } 3909 do{ 3910 if( sqlite3BtreeIsRootPage(pPage) ){ 3911 *pRes = 1; 3912 pCur->eState = CURSOR_INVALID; 3913 return SQLITE_OK; 3914 } 3915 sqlite3BtreeMoveToParent(pCur); 3916 pPage = pCur->pPage; 3917 }while( pCur->idx>=pPage->nCell ); 3918 *pRes = 0; 3919 if( pPage->intKey ){ 3920 rc = sqlite3BtreeNext(pCur, pRes); 3921 }else{ 3922 rc = SQLITE_OK; 3923 } 3924 return rc; 3925 } 3926 *pRes = 0; 3927 if( pPage->leaf ){ 3928 return SQLITE_OK; 3929 } 3930 rc = moveToLeftmost(pCur); 3931 return rc; 3932 } 3933 3934 3935 /* 3936 ** Step the cursor to the back to the previous entry in the database. If 3937 ** successful then set *pRes=0. If the cursor 3938 ** was already pointing to the first entry in the database before 3939 ** this routine was called, then set *pRes=1. 
3940 */ 3941 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ 3942 int rc; 3943 Pgno pgno; 3944 MemPage *pPage; 3945 3946 assert( cursorHoldsMutex(pCur) ); 3947 rc = restoreCursorPosition(pCur); 3948 if( rc!=SQLITE_OK ){ 3949 return rc; 3950 } 3951 pCur->atLast = 0; 3952 if( CURSOR_INVALID==pCur->eState ){ 3953 *pRes = 1; 3954 return SQLITE_OK; 3955 } 3956 if( pCur->skip<0 ){ 3957 pCur->skip = 0; 3958 *pRes = 0; 3959 return SQLITE_OK; 3960 } 3961 pCur->skip = 0; 3962 3963 pPage = pCur->pPage; 3964 assert( pPage->isInit ); 3965 assert( pCur->idx>=0 ); 3966 if( !pPage->leaf ){ 3967 pgno = get4byte( findCell(pPage, pCur->idx) ); 3968 rc = moveToChild(pCur, pgno); 3969 if( rc ){ 3970 return rc; 3971 } 3972 rc = moveToRightmost(pCur); 3973 }else{ 3974 while( pCur->idx==0 ){ 3975 if( sqlite3BtreeIsRootPage(pPage) ){ 3976 pCur->eState = CURSOR_INVALID; 3977 *pRes = 1; 3978 return SQLITE_OK; 3979 } 3980 sqlite3BtreeMoveToParent(pCur); 3981 pPage = pCur->pPage; 3982 } 3983 pCur->idx--; 3984 pCur->info.nSize = 0; 3985 pCur->validNKey = 0; 3986 if( pPage->intKey && !pPage->leaf ){ 3987 rc = sqlite3BtreePrevious(pCur, pRes); 3988 }else{ 3989 rc = SQLITE_OK; 3990 } 3991 } 3992 *pRes = 0; 3993 return rc; 3994 } 3995 3996 /* 3997 ** Allocate a new page from the database file. 3998 ** 3999 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 4000 ** has already been called on the new page.) The new page has also 4001 ** been referenced and the calling routine is responsible for calling 4002 ** sqlite3PagerUnref() on the new page when it is done. 4003 ** 4004 ** SQLITE_OK is returned on success. Any other return value indicates 4005 ** an error. *ppPage and *pPgno are undefined in the event of an error. 4006 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned. 4007 ** 4008 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 4009 ** locate a page close to the page number "nearby". 
This can be used in an 4010 ** attempt to keep related pages close to each other in the database file, 4011 ** which in turn can make database access faster. 4012 ** 4013 ** If the "exact" parameter is not 0, and the page-number nearby exists 4014 ** anywhere on the free-list, then it is guarenteed to be returned. This 4015 ** is only used by auto-vacuum databases when allocating a new table. 4016 */ 4017 static int allocateBtreePage( 4018 BtShared *pBt, 4019 MemPage **ppPage, 4020 Pgno *pPgno, 4021 Pgno nearby, 4022 u8 exact 4023 ){ 4024 MemPage *pPage1; 4025 int rc; 4026 int n; /* Number of pages on the freelist */ 4027 int k; /* Number of leaves on the trunk of the freelist */ 4028 MemPage *pTrunk = 0; 4029 MemPage *pPrevTrunk = 0; 4030 4031 assert( sqlite3_mutex_held(pBt->mutex) ); 4032 pPage1 = pBt->pPage1; 4033 n = get4byte(&pPage1->aData[36]); 4034 if( n>0 ){ 4035 /* There are pages on the freelist. Reuse one of those pages. */ 4036 Pgno iTrunk; 4037 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 4038 4039 /* If the 'exact' parameter was true and a query of the pointer-map 4040 ** shows that the page 'nearby' is somewhere on the free-list, then 4041 ** the entire-list will be searched for that page. 4042 */ 4043 #ifndef SQLITE_OMIT_AUTOVACUUM 4044 if( exact && nearby<=pagerPagecount(pBt->pPager) ){ 4045 u8 eType; 4046 assert( nearby>0 ); 4047 assert( pBt->autoVacuum ); 4048 rc = ptrmapGet(pBt, nearby, &eType, 0); 4049 if( rc ) return rc; 4050 if( eType==PTRMAP_FREEPAGE ){ 4051 searchList = 1; 4052 } 4053 *pPgno = nearby; 4054 } 4055 #endif 4056 4057 /* Decrement the free-list count by 1. Set iTrunk to the index of the 4058 ** first free-list trunk page. iPrevTrunk is initially 1. 4059 */ 4060 rc = sqlite3PagerWrite(pPage1->pDbPage); 4061 if( rc ) return rc; 4062 put4byte(&pPage1->aData[36], n-1); 4063 4064 /* The code within this loop is run only once if the 'searchList' variable 4065 ** is not true. 
Otherwise, it runs once for each trunk-page on the 4066 ** free-list until the page 'nearby' is located. 4067 */ 4068 do { 4069 pPrevTrunk = pTrunk; 4070 if( pPrevTrunk ){ 4071 iTrunk = get4byte(&pPrevTrunk->aData[0]); 4072 }else{ 4073 iTrunk = get4byte(&pPage1->aData[32]); 4074 } 4075 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0); 4076 if( rc ){ 4077 pTrunk = 0; 4078 goto end_allocate_page; 4079 } 4080 4081 k = get4byte(&pTrunk->aData[4]); 4082 if( k==0 && !searchList ){ 4083 /* The trunk has no leaves and the list is not being searched. 4084 ** So extract the trunk page itself and use it as the newly 4085 ** allocated page */ 4086 assert( pPrevTrunk==0 ); 4087 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4088 if( rc ){ 4089 goto end_allocate_page; 4090 } 4091 *pPgno = iTrunk; 4092 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 4093 *ppPage = pTrunk; 4094 pTrunk = 0; 4095 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 4096 }else if( k>pBt->usableSize/4 - 2 ){ 4097 /* Value of k is out of range. Database corruption */ 4098 rc = SQLITE_CORRUPT_BKPT; 4099 goto end_allocate_page; 4100 #ifndef SQLITE_OMIT_AUTOVACUUM 4101 }else if( searchList && nearby==iTrunk ){ 4102 /* The list is being searched and this trunk page is the page 4103 ** to allocate, regardless of whether it has leaves. 4104 */ 4105 assert( *pPgno==iTrunk ); 4106 *ppPage = pTrunk; 4107 searchList = 0; 4108 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4109 if( rc ){ 4110 goto end_allocate_page; 4111 } 4112 if( k==0 ){ 4113 if( !pPrevTrunk ){ 4114 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 4115 }else{ 4116 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 4117 } 4118 }else{ 4119 /* The trunk page is required by the caller but it contains 4120 ** pointers to free-list leaves. The first leaf becomes a trunk 4121 ** page in this case. 
4122 */ 4123 MemPage *pNewTrunk; 4124 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 4125 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); 4126 if( rc!=SQLITE_OK ){ 4127 goto end_allocate_page; 4128 } 4129 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 4130 if( rc!=SQLITE_OK ){ 4131 releasePage(pNewTrunk); 4132 goto end_allocate_page; 4133 } 4134 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 4135 put4byte(&pNewTrunk->aData[4], k-1); 4136 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 4137 releasePage(pNewTrunk); 4138 if( !pPrevTrunk ){ 4139 put4byte(&pPage1->aData[32], iNewTrunk); 4140 }else{ 4141 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 4142 if( rc ){ 4143 goto end_allocate_page; 4144 } 4145 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 4146 } 4147 } 4148 pTrunk = 0; 4149 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 4150 #endif 4151 }else{ 4152 /* Extract a leaf from the trunk */ 4153 int closest; 4154 Pgno iPage; 4155 unsigned char *aData = pTrunk->aData; 4156 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4157 if( rc ){ 4158 goto end_allocate_page; 4159 } 4160 if( nearby>0 ){ 4161 int i, dist; 4162 closest = 0; 4163 dist = get4byte(&aData[8]) - nearby; 4164 if( dist<0 ) dist = -dist; 4165 for(i=1; i<k; i++){ 4166 int d2 = get4byte(&aData[8+i*4]) - nearby; 4167 if( d2<0 ) d2 = -d2; 4168 if( d2<dist ){ 4169 closest = i; 4170 dist = d2; 4171 } 4172 } 4173 }else{ 4174 closest = 0; 4175 } 4176 4177 iPage = get4byte(&aData[8+closest*4]); 4178 if( !searchList || iPage==nearby ){ 4179 int nPage; 4180 *pPgno = iPage; 4181 nPage = pagerPagecount(pBt->pPager); 4182 if( *pPgno>nPage ){ 4183 /* Free page off the end of the file */ 4184 rc = SQLITE_CORRUPT_BKPT; 4185 goto end_allocate_page; 4186 } 4187 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" 4188 ": %d more free pages\n", 4189 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 4190 if( closest<k-1 ){ 4191 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 4192 } 4193 put4byte(&aData[4], 
k-1); 4194 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1); 4195 if( rc==SQLITE_OK ){ 4196 sqlite3PagerDontRollback((*ppPage)->pDbPage); 4197 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4198 if( rc!=SQLITE_OK ){ 4199 releasePage(*ppPage); 4200 } 4201 } 4202 searchList = 0; 4203 } 4204 } 4205 releasePage(pPrevTrunk); 4206 pPrevTrunk = 0; 4207 }while( searchList ); 4208 }else{ 4209 /* There are no pages on the freelist, so create a new page at the 4210 ** end of the file */ 4211 int nPage = pagerPagecount(pBt->pPager); 4212 *pPgno = nPage + 1; 4213 4214 #ifndef SQLITE_OMIT_AUTOVACUUM 4215 if( pBt->nTrunc ){ 4216 /* An incr-vacuum has already run within this transaction. So the 4217 ** page to allocate is not from the physical end of the file, but 4218 ** at pBt->nTrunc. 4219 */ 4220 *pPgno = pBt->nTrunc+1; 4221 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ 4222 (*pPgno)++; 4223 } 4224 } 4225 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){ 4226 /* If *pPgno refers to a pointer-map page, allocate two new pages 4227 ** at the end of the file instead of one. The first allocated page 4228 ** becomes a new pointer-map page, the second is used by the caller. 4229 */ 4230 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno)); 4231 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4232 (*pPgno)++; 4233 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; } 4234 } 4235 if( pBt->nTrunc ){ 4236 pBt->nTrunc = *pPgno; 4237 } 4238 #endif 4239 4240 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4241 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0); 4242 if( rc ) return rc; 4243 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4244 if( rc!=SQLITE_OK ){ 4245 releasePage(*ppPage); 4246 } 4247 TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); 4248 } 4249 4250 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4251 4252 end_allocate_page: 4253 releasePage(pTrunk); 4254 releasePage(pPrevTrunk); 4255 return rc; 4256 } 4257 4258 /* 4259 ** Add a page of the database file to the freelist. 
4260 ** 4261 ** sqlite3PagerUnref() is NOT called for pPage. 4262 */ 4263 static int freePage(MemPage *pPage){ 4264 BtShared *pBt = pPage->pBt; 4265 MemPage *pPage1 = pBt->pPage1; 4266 int rc, n, k; 4267 4268 /* Prepare the page for freeing */ 4269 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4270 assert( pPage->pgno>1 ); 4271 pPage->isInit = 0; 4272 releasePage(pPage->pParent); 4273 pPage->pParent = 0; 4274 4275 /* Increment the free page count on pPage1 */ 4276 rc = sqlite3PagerWrite(pPage1->pDbPage); 4277 if( rc ) return rc; 4278 n = get4byte(&pPage1->aData[36]); 4279 put4byte(&pPage1->aData[36], n+1); 4280 4281 #ifdef SQLITE_SECURE_DELETE 4282 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then 4283 ** always fully overwrite deleted information with zeros. 4284 */ 4285 rc = sqlite3PagerWrite(pPage->pDbPage); 4286 if( rc ) return rc; 4287 memset(pPage->aData, 0, pPage->pBt->pageSize); 4288 #endif 4289 4290 /* If the database supports auto-vacuum, write an entry in the pointer-map 4291 ** to indicate that the page is free. 4292 */ 4293 if( ISAUTOVACUUM ){ 4294 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0); 4295 if( rc ) return rc; 4296 } 4297 4298 if( n==0 ){ 4299 /* This is the first free page */ 4300 rc = sqlite3PagerWrite(pPage->pDbPage); 4301 if( rc ) return rc; 4302 memset(pPage->aData, 0, 8); 4303 put4byte(&pPage1->aData[32], pPage->pgno); 4304 TRACE(("FREE-PAGE: %d first\n", pPage->pgno)); 4305 }else{ 4306 /* Other free pages already exist. Retrive the first trunk page 4307 ** of the freelist and find out how many leaves it has. */ 4308 MemPage *pTrunk; 4309 rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0); 4310 if( rc ) return rc; 4311 k = get4byte(&pTrunk->aData[4]); 4312 if( k>=pBt->usableSize/4 - 8 ){ 4313 /* The trunk is full. Turn the page being freed into a new 4314 ** trunk page with no leaves. 
4315 ** 4316 ** Note that the trunk page is not really full until it contains 4317 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have 4318 ** coded. But due to a coding error in versions of SQLite prior to 4319 ** 3.6.0, databases with freelist trunk pages holding more than 4320 ** usableSize/4 - 8 entries will be reported as corrupt. In order 4321 ** to maintain backwards compatibility with older versions of SQLite, 4322 ** we will contain to restrict the number of entries to usableSize/4 - 8 4323 ** for now. At some point in the future (once everyone has upgraded 4324 ** to 3.6.0 or later) we should consider fixing the conditional above 4325 ** to read "usableSize/4-2" instead of "usableSize/4-8". 4326 */ 4327 rc = sqlite3PagerWrite(pPage->pDbPage); 4328 if( rc==SQLITE_OK ){ 4329 put4byte(pPage->aData, pTrunk->pgno); 4330 put4byte(&pPage->aData[4], 0); 4331 put4byte(&pPage1->aData[32], pPage->pgno); 4332 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", 4333 pPage->pgno, pTrunk->pgno)); 4334 } 4335 }else if( k<0 ){ 4336 rc = SQLITE_CORRUPT; 4337 }else{ 4338 /* Add the newly freed page as a leaf on the current trunk */ 4339 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4340 if( rc==SQLITE_OK ){ 4341 put4byte(&pTrunk->aData[4], k+1); 4342 put4byte(&pTrunk->aData[8+k*4], pPage->pgno); 4343 #ifndef SQLITE_SECURE_DELETE 4344 sqlite3PagerDontWrite(pPage->pDbPage); 4345 #endif 4346 } 4347 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); 4348 } 4349 releasePage(pTrunk); 4350 } 4351 return rc; 4352 } 4353 4354 /* 4355 ** Free any overflow pages associated with the given Cell. 
4356 */ 4357 static int clearCell(MemPage *pPage, unsigned char *pCell){ 4358 BtShared *pBt = pPage->pBt; 4359 CellInfo info; 4360 Pgno ovflPgno; 4361 int rc; 4362 int nOvfl; 4363 int ovflPageSize; 4364 4365 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4366 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4367 if( info.iOverflow==0 ){ 4368 return SQLITE_OK; /* No overflow pages. Return without doing anything */ 4369 } 4370 ovflPgno = get4byte(&pCell[info.iOverflow]); 4371 ovflPageSize = pBt->usableSize - 4; 4372 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; 4373 assert( ovflPgno==0 || nOvfl>0 ); 4374 while( nOvfl-- ){ 4375 MemPage *pOvfl; 4376 if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){ 4377 return SQLITE_CORRUPT_BKPT; 4378 } 4379 4380 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno); 4381 if( rc ) return rc; 4382 rc = freePage(pOvfl); 4383 sqlite3PagerUnref(pOvfl->pDbPage); 4384 if( rc ) return rc; 4385 } 4386 return SQLITE_OK; 4387 } 4388 4389 /* 4390 ** Create the byte sequence used to represent a cell on page pPage 4391 ** and write that byte sequence into pCell[]. Overflow pages are 4392 ** allocated and filled in as necessary. The calling procedure 4393 ** is responsible for making sure sufficient space has been allocated 4394 ** for pCell[]. 4395 ** 4396 ** Note that pCell does not necessary need to point to the pPage->aData 4397 ** area. pCell might point to some temporary storage. The cell will 4398 ** be constructed in this temporary area then copied into pPage->aData 4399 ** later. 
4400 */ 4401 static int fillInCell( 4402 MemPage *pPage, /* The page that contains the cell */ 4403 unsigned char *pCell, /* Complete text of the cell */ 4404 const void *pKey, i64 nKey, /* The key */ 4405 const void *pData,int nData, /* The data */ 4406 int nZero, /* Extra zero bytes to append to pData */ 4407 int *pnSize /* Write cell size here */ 4408 ){ 4409 int nPayload; 4410 const u8 *pSrc; 4411 int nSrc, n, rc; 4412 int spaceLeft; 4413 MemPage *pOvfl = 0; 4414 MemPage *pToRelease = 0; 4415 unsigned char *pPrior; 4416 unsigned char *pPayload; 4417 BtShared *pBt = pPage->pBt; 4418 Pgno pgnoOvfl = 0; 4419 int nHeader; 4420 CellInfo info; 4421 4422 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4423 4424 /* Fill in the header. */ 4425 nHeader = 0; 4426 if( !pPage->leaf ){ 4427 nHeader += 4; 4428 } 4429 if( pPage->hasData ){ 4430 nHeader += putVarint(&pCell[nHeader], nData+nZero); 4431 }else{ 4432 nData = nZero = 0; 4433 } 4434 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey); 4435 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4436 assert( info.nHeader==nHeader ); 4437 assert( info.nKey==nKey ); 4438 assert( info.nData==nData+nZero ); 4439 4440 /* Fill in the payload */ 4441 nPayload = nData + nZero; 4442 if( pPage->intKey ){ 4443 pSrc = pData; 4444 nSrc = nData; 4445 nData = 0; 4446 }else{ 4447 nPayload += nKey; 4448 pSrc = pKey; 4449 nSrc = nKey; 4450 } 4451 *pnSize = info.nSize; 4452 spaceLeft = info.nLocal; 4453 pPayload = &pCell[nHeader]; 4454 pPrior = &pCell[info.iOverflow]; 4455 4456 while( nPayload>0 ){ 4457 if( spaceLeft==0 ){ 4458 int isExact = 0; 4459 #ifndef SQLITE_OMIT_AUTOVACUUM 4460 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ 4461 if( pBt->autoVacuum ){ 4462 do{ 4463 pgnoOvfl++; 4464 } while( 4465 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 4466 ); 4467 if( pgnoOvfl>1 ){ 4468 /* isExact = 1; */ 4469 } 4470 } 4471 #endif 4472 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact); 4473 
#ifndef SQLITE_OMIT_AUTOVACUUM 4474 /* If the database supports auto-vacuum, and the second or subsequent 4475 ** overflow page is being allocated, add an entry to the pointer-map 4476 ** for that page now. 4477 ** 4478 ** If this is the first overflow page, then write a partial entry 4479 ** to the pointer-map. If we write nothing to this pointer-map slot, 4480 ** then the optimistic overflow chain processing in clearCell() 4481 ** may misinterpret the uninitialised values and delete the 4482 ** wrong pages from the database. 4483 */ 4484 if( pBt->autoVacuum && rc==SQLITE_OK ){ 4485 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); 4486 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap); 4487 if( rc ){ 4488 releasePage(pOvfl); 4489 } 4490 } 4491 #endif 4492 if( rc ){ 4493 releasePage(pToRelease); 4494 return rc; 4495 } 4496 put4byte(pPrior, pgnoOvfl); 4497 releasePage(pToRelease); 4498 pToRelease = pOvfl; 4499 pPrior = pOvfl->aData; 4500 put4byte(pPrior, 0); 4501 pPayload = &pOvfl->aData[4]; 4502 spaceLeft = pBt->usableSize - 4; 4503 } 4504 n = nPayload; 4505 if( n>spaceLeft ) n = spaceLeft; 4506 if( nSrc>0 ){ 4507 if( n>nSrc ) n = nSrc; 4508 assert( pSrc ); 4509 memcpy(pPayload, pSrc, n); 4510 }else{ 4511 memset(pPayload, 0, n); 4512 } 4513 nPayload -= n; 4514 pPayload += n; 4515 pSrc += n; 4516 nSrc -= n; 4517 spaceLeft -= n; 4518 if( nSrc==0 ){ 4519 nSrc = nData; 4520 pSrc = pData; 4521 } 4522 } 4523 releasePage(pToRelease); 4524 return SQLITE_OK; 4525 } 4526 4527 /* 4528 ** Change the MemPage.pParent pointer on the page whose number is 4529 ** given in the second argument so that MemPage.pParent holds the 4530 ** pointer in the third argument. 4531 ** 4532 ** If the final argument, updatePtrmap, is non-zero and the database 4533 ** is an auto-vacuum database, then the pointer-map entry for pgno 4534 ** is updated. 
4535 */ 4536 static int reparentPage( 4537 BtShared *pBt, /* B-Tree structure */ 4538 Pgno pgno, /* Page number of child being adopted */ 4539 MemPage *pNewParent, /* New parent of pgno */ 4540 int idx, /* Index of child page pgno in pNewParent */ 4541 int updatePtrmap /* If true, update pointer-map for pgno */ 4542 ){ 4543 MemPage *pThis; 4544 DbPage *pDbPage; 4545 4546 assert( sqlite3_mutex_held(pBt->mutex) ); 4547 assert( pNewParent!=0 ); 4548 if( pgno==0 ) return SQLITE_OK; 4549 assert( pBt->pPager!=0 ); 4550 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 4551 if( pDbPage ){ 4552 pThis = (MemPage *)sqlite3PagerGetExtra(pDbPage); 4553 if( pThis->isInit ){ 4554 assert( pThis->aData==sqlite3PagerGetData(pDbPage) ); 4555 if( pThis->pParent!=pNewParent ){ 4556 if( pThis->pParent ) sqlite3PagerUnref(pThis->pParent->pDbPage); 4557 pThis->pParent = pNewParent; 4558 sqlite3PagerRef(pNewParent->pDbPage); 4559 } 4560 pThis->idxParent = idx; 4561 } 4562 sqlite3PagerUnref(pDbPage); 4563 } 4564 4565 if( ISAUTOVACUUM && updatePtrmap ){ 4566 return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno); 4567 } 4568 4569 #ifndef NDEBUG 4570 /* If the updatePtrmap flag was clear, assert that the entry in the 4571 ** pointer-map is already correct. 4572 */ 4573 if( ISAUTOVACUUM ){ 4574 u8 eType; 4575 Pgno ii; 4576 ptrmapGet(pBt, pgno, &eType, &ii); 4577 assert( ii==pNewParent->pgno && eType==PTRMAP_BTREE ); 4578 } 4579 #endif 4580 4581 return SQLITE_OK; 4582 } 4583 4584 4585 4586 /* 4587 ** Change the pParent pointer of all children of pPage to point back 4588 ** to pPage. 4589 ** 4590 ** In other words, for every child of pPage, invoke reparentPage() 4591 ** to make sure that each child knows that pPage is its parent. 4592 ** 4593 ** This routine gets called after you memcpy() one page into 4594 ** another. 4595 ** 4596 ** If updatePtrmap is true, then the pointer-map entries for all child 4597 ** pages of pPage are updated. 
4598 */ 4599 static int reparentChildPages(MemPage *pPage, int updatePtrmap){ 4600 int rc = SQLITE_OK; 4601 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4602 if( !pPage->leaf ){ 4603 int i; 4604 BtShared *pBt = pPage->pBt; 4605 Pgno iRight = get4byte(&pPage->aData[pPage->hdrOffset+8]); 4606 4607 for(i=0; i<pPage->nCell; i++){ 4608 u8 *pCell = findCell(pPage, i); 4609 rc = reparentPage(pBt, get4byte(pCell), pPage, i, updatePtrmap); 4610 if( rc!=SQLITE_OK ) return rc; 4611 } 4612 rc = reparentPage(pBt, iRight, pPage, i, updatePtrmap); 4613 pPage->idxShift = 0; 4614 } 4615 return rc; 4616 } 4617 4618 /* 4619 ** Remove the i-th cell from pPage. This routine effects pPage only. 4620 ** The cell content is not freed or deallocated. It is assumed that 4621 ** the cell content has been copied someplace else. This routine just 4622 ** removes the reference to the cell from pPage. 4623 ** 4624 ** "sz" must be the number of bytes in the cell. 4625 */ 4626 static void dropCell(MemPage *pPage, int idx, int sz){ 4627 int i; /* Loop counter */ 4628 int pc; /* Offset to cell content of cell being deleted */ 4629 u8 *data; /* pPage->aData */ 4630 u8 *ptr; /* Used to move bytes around within data[] */ 4631 4632 assert( idx>=0 && idx<pPage->nCell ); 4633 assert( sz==cellSize(pPage, idx) ); 4634 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 4635 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4636 data = pPage->aData; 4637 ptr = &data[pPage->cellOffset + 2*idx]; 4638 pc = get2byte(ptr); 4639 assert( pc>10 && pc+sz<=pPage->pBt->usableSize ); 4640 freeSpace(pPage, pc, sz); 4641 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){ 4642 ptr[0] = ptr[2]; 4643 ptr[1] = ptr[3]; 4644 } 4645 pPage->nCell--; 4646 put2byte(&data[pPage->hdrOffset+3], pPage->nCell); 4647 pPage->nFree += 2; 4648 pPage->idxShift = 1; 4649 } 4650 4651 /* 4652 ** Insert a new cell on pPage at cell index "i". pCell points to the 4653 ** content of the cell. 
4654 ** 4655 ** If the cell content will fit on the page, then put it there. If it 4656 ** will not fit, then make a copy of the cell content into pTemp if 4657 ** pTemp is not null. Regardless of pTemp, allocate a new entry 4658 ** in pPage->aOvfl[] and make it point to the cell content (either 4659 ** in pTemp or the original pCell) and also record its index. 4660 ** Allocating a new entry in pPage->aCell[] implies that 4661 ** pPage->nOverflow is incremented. 4662 ** 4663 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the 4664 ** cell. The caller will overwrite them after this function returns. If 4665 ** nSkip is non-zero, then pCell may not point to an invalid memory location 4666 ** (but pCell+nSkip is always valid). 4667 */ 4668 static int insertCell( 4669 MemPage *pPage, /* Page into which we are copying */ 4670 int i, /* New cell becomes the i-th cell of the page */ 4671 u8 *pCell, /* Content of the new cell */ 4672 int sz, /* Bytes of content in pCell */ 4673 u8 *pTemp, /* Temp storage space for pCell, if needed */ 4674 u8 nSkip /* Do not write the first nSkip bytes of the cell */ 4675 ){ 4676 int idx; /* Where to write new cell content in data[] */ 4677 int j; /* Loop counter */ 4678 int top; /* First byte of content for any cell in data[] */ 4679 int end; /* First byte past the last cell pointer in data[] */ 4680 int ins; /* Index in data[] where new cell pointer is inserted */ 4681 int hdr; /* Offset into data[] of the page header */ 4682 int cellOffset; /* Address of first cell pointer in data[] */ 4683 u8 *data; /* The content of the whole page */ 4684 u8 *ptr; /* Used for moving information around in data[] */ 4685 4686 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 4687 assert( sz==cellSizePtr(pPage, pCell) ); 4688 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4689 if( pPage->nOverflow || sz+2>pPage->nFree ){ 4690 if( pTemp ){ 4691 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); 4692 pCell = pTemp; 4693 } 4694 j = 
pPage->nOverflow++; 4695 assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) ); 4696 pPage->aOvfl[j].pCell = pCell; 4697 pPage->aOvfl[j].idx = i; 4698 pPage->nFree = 0; 4699 }else{ 4700 int rc = sqlite3PagerWrite(pPage->pDbPage); 4701 if( rc!=SQLITE_OK ){ 4702 return rc; 4703 } 4704 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 4705 data = pPage->aData; 4706 hdr = pPage->hdrOffset; 4707 top = get2byte(&data[hdr+5]); 4708 cellOffset = pPage->cellOffset; 4709 end = cellOffset + 2*pPage->nCell + 2; 4710 ins = cellOffset + 2*i; 4711 if( end > top - sz ){ 4712 defragmentPage(pPage); 4713 top = get2byte(&data[hdr+5]); 4714 assert( end + sz <= top ); 4715 } 4716 idx = allocateSpace(pPage, sz); 4717 assert( idx>0 ); 4718 assert( end <= get2byte(&data[hdr+5]) ); 4719 pPage->nCell++; 4720 pPage->nFree -= 2; 4721 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); 4722 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){ 4723 ptr[0] = ptr[-2]; 4724 ptr[1] = ptr[-1]; 4725 } 4726 put2byte(&data[ins], idx); 4727 put2byte(&data[hdr+3], pPage->nCell); 4728 pPage->idxShift = 1; 4729 #ifndef SQLITE_OMIT_AUTOVACUUM 4730 if( pPage->pBt->autoVacuum ){ 4731 /* The cell may contain a pointer to an overflow page. If so, write 4732 ** the entry for the overflow page into the pointer map. 4733 */ 4734 CellInfo info; 4735 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4736 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); 4737 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ 4738 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); 4739 rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno); 4740 if( rc!=SQLITE_OK ) return rc; 4741 } 4742 } 4743 #endif 4744 } 4745 4746 return SQLITE_OK; 4747 } 4748 4749 /* 4750 ** Add a list of cells to a page. The page should be initially empty. 4751 ** The cells are guaranteed to fit on the page. 
4752 */ 4753 static void assemblePage( 4754 MemPage *pPage, /* The page to be assemblied */ 4755 int nCell, /* The number of cells to add to this page */ 4756 u8 **apCell, /* Pointers to cell bodies */ 4757 u16 *aSize /* Sizes of the cells */ 4758 ){ 4759 int i; /* Loop counter */ 4760 int totalSize; /* Total size of all cells */ 4761 int hdr; /* Index of page header */ 4762 int cellptr; /* Address of next cell pointer */ 4763 int cellbody; /* Address of next cell body */ 4764 u8 *data; /* Data for the page */ 4765 4766 assert( pPage->nOverflow==0 ); 4767 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4768 totalSize = 0; 4769 for(i=0; i<nCell; i++){ 4770 totalSize += aSize[i]; 4771 } 4772 assert( totalSize+2*nCell<=pPage->nFree ); 4773 assert( pPage->nCell==0 ); 4774 cellptr = pPage->cellOffset; 4775 data = pPage->aData; 4776 hdr = pPage->hdrOffset; 4777 put2byte(&data[hdr+3], nCell); 4778 if( nCell ){ 4779 cellbody = allocateSpace(pPage, totalSize); 4780 assert( cellbody>0 ); 4781 assert( pPage->nFree >= 2*nCell ); 4782 pPage->nFree -= 2*nCell; 4783 for(i=0; i<nCell; i++){ 4784 put2byte(&data[cellptr], cellbody); 4785 memcpy(&data[cellbody], apCell[i], aSize[i]); 4786 cellptr += 2; 4787 cellbody += aSize[i]; 4788 } 4789 assert( cellbody==pPage->pBt->usableSize ); 4790 } 4791 pPage->nCell = nCell; 4792 } 4793 4794 /* 4795 ** The following parameters determine how many adjacent pages get involved 4796 ** in a balancing operation. NN is the number of neighbors on either side 4797 ** of the page that participate in the balancing operation. NB is the 4798 ** total number of pages that participate, including the target page and 4799 ** NN neighbors on either side. 4800 ** 4801 ** The minimum value of NN is 1 (of course). Increasing NN above 1 4802 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 4803 ** in exchange for a larger degradation in INSERT and UPDATE performance. 4804 ** The value of NN appears to give the best results overall. 
4805 */ 4806 #define NN 1 /* Number of neighbors on either side of pPage */ 4807 #define NB (NN*2+1) /* Total pages involved in the balance */ 4808 4809 /* Forward reference */ 4810 static int balance(MemPage*, int); 4811 4812 #ifndef SQLITE_OMIT_QUICKBALANCE 4813 /* 4814 ** This version of balance() handles the common special case where 4815 ** a new entry is being inserted on the extreme right-end of the 4816 ** tree, in other words, when the new entry will become the largest 4817 ** entry in the tree. 4818 ** 4819 ** Instead of trying balance the 3 right-most leaf pages, just add 4820 ** a new page to the right-hand side and put the one new entry in 4821 ** that page. This leaves the right side of the tree somewhat 4822 ** unbalanced. But odds are that we will be inserting new entries 4823 ** at the end soon afterwards so the nearly empty page will quickly 4824 ** fill up. On average. 4825 ** 4826 ** pPage is the leaf page which is the right-most page in the tree. 4827 ** pParent is its parent. pPage must have a single overflow entry 4828 ** which is also the right-most entry on the page. 4829 */ 4830 static int balance_quick(MemPage *pPage, MemPage *pParent){ 4831 int rc; 4832 MemPage *pNew; 4833 Pgno pgnoNew; 4834 u8 *pCell; 4835 u16 szCell; 4836 CellInfo info; 4837 BtShared *pBt = pPage->pBt; 4838 int parentIdx = pParent->nCell; /* pParent new divider cell index */ 4839 int parentSize; /* Size of new divider cell */ 4840 u8 parentCell[64]; /* Space for the new divider cell */ 4841 4842 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 4843 4844 /* Allocate a new page. Insert the overflow cell from pPage 4845 ** into it. Then remove the overflow cell from pPage. 
4846 */ 4847 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 4848 if( rc!=SQLITE_OK ){ 4849 return rc; 4850 } 4851 pCell = pPage->aOvfl[0].pCell; 4852 szCell = cellSizePtr(pPage, pCell); 4853 zeroPage(pNew, pPage->aData[0]); 4854 assemblePage(pNew, 1, &pCell, &szCell); 4855 pPage->nOverflow = 0; 4856 4857 /* Set the parent of the newly allocated page to pParent. */ 4858 pNew->pParent = pParent; 4859 sqlite3PagerRef(pParent->pDbPage); 4860 4861 /* pPage is currently the right-child of pParent. Change this 4862 ** so that the right-child is the new page allocated above and 4863 ** pPage is the next-to-right child. 4864 ** 4865 ** Ignore the return value of the call to fillInCell(). fillInCell() 4866 ** may only return other than SQLITE_OK if it is required to allocate 4867 ** one or more overflow pages. Since an internal table B-Tree cell 4868 ** may never spill over onto an overflow page (it is a maximum of 4869 ** 13 bytes in size), it is not neccessary to check the return code. 4870 ** 4871 ** Similarly, the insertCell() function cannot fail if the page 4872 ** being inserted into is already writable and the cell does not 4873 ** contain an overflow pointer. So ignore this return code too. 4874 */ 4875 assert( pPage->nCell>0 ); 4876 pCell = findCell(pPage, pPage->nCell-1); 4877 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 4878 fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize); 4879 assert( parentSize<64 ); 4880 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 4881 insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4); 4882 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno); 4883 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 4884 4885 /* If this is an auto-vacuum database, update the pointer map 4886 ** with entries for the new page, and any pointer from the 4887 ** cell on the page to an overflow page. 
4888 */ 4889 if( ISAUTOVACUUM ){ 4890 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno); 4891 if( rc==SQLITE_OK ){ 4892 rc = ptrmapPutOvfl(pNew, 0); 4893 } 4894 if( rc!=SQLITE_OK ){ 4895 releasePage(pNew); 4896 return rc; 4897 } 4898 } 4899 4900 /* Release the reference to the new page and balance the parent page, 4901 ** in case the divider cell inserted caused it to become overfull. 4902 */ 4903 releasePage(pNew); 4904 return balance(pParent, 0); 4905 } 4906 #endif /* SQLITE_OMIT_QUICKBALANCE */ 4907 4908 /* 4909 ** This routine redistributes Cells on pPage and up to NN*2 siblings 4910 ** of pPage so that all pages have about the same amount of free space. 4911 ** Usually NN siblings on either side of pPage is used in the balancing, 4912 ** though more siblings might come from one side if pPage is the first 4913 ** or last child of its parent. If pPage has fewer than 2*NN siblings 4914 ** (something which can only happen if pPage is the root page or a 4915 ** child of root) then all available siblings participate in the balancing. 4916 ** 4917 ** The number of siblings of pPage might be increased or decreased by one or 4918 ** two in an effort to keep pages nearly full but not over full. The root page 4919 ** is special and is allowed to be nearly empty. If pPage is 4920 ** the root page, then the depth of the tree might be increased 4921 ** or decreased by one, as necessary, to keep the root page from being 4922 ** overfull or completely empty. 4923 ** 4924 ** Note that when this routine is called, some of the Cells on pPage 4925 ** might not actually be stored in pPage->aData[]. This can happen 4926 ** if the page is overfull. Part of the job of this routine is to 4927 ** make sure all Cells for pPage once again fit in pPage->aData[]. 4928 ** 4929 ** In the course of balancing the siblings of pPage, the parent of pPage 4930 ** might become overfull or underfull. If that happens, then this routine 4931 ** is called recursively on the parent. 
/*
** This routine redistributes Cells on pPage and up to NN*2 siblings
** of pPage so that all pages have about the same amount of free space.
** Usually NN siblings on either side of pPage is used in the balancing,
** though more siblings might come from one side if pPage is the first
** or last child of its parent.  If pPage has fewer than 2*NN siblings
** (something which can only happen if pPage is the root page or a 
** child of root) then all available siblings participate in the balancing.
**
** The number of siblings of pPage might be increased or decreased by one or
** two in an effort to keep pages nearly full but not over full. The root page
** is special and is allowed to be nearly empty. If pPage is 
** the root page, then the depth of the tree might be increased
** or decreased by one, as necessary, to keep the root page from being
** overfull or completely empty.
**
** Note that when this routine is called, some of the Cells on pPage
** might not actually be stored in pPage->aData[].  This can happen
** if the page is overfull.  Part of the job of this routine is to
** make sure all Cells for pPage once again fit in pPage->aData[].
**
** In the course of balancing the siblings of pPage, the parent of pPage
** might become overfull or underfull.  If that happens, then this routine
** is called recursively on the parent (via balance()).
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if scratch or page memory
** cannot be obtained, or whatever error code a helper routine reports.
**
** If this routine fails for any reason, it might leave the database
** in a corrupted state.  So if this routine fails, the database should
** be rolled back.
*/
static int balance_nonroot(MemPage *pPage){
  MemPage *pParent;            /* The parent of pPage */
  BtShared *pBt;               /* The whole database */
  int nCell = 0;               /* Number of cells in apCell[] */
  int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
  int nOld;                    /* Number of pages in apOld[] */
  int nNew;                    /* Number of pages in apNew[] */
  int nDiv;                    /* Number of cells in apDiv[] */
  int i, j, k;                 /* Loop counters */
  int idx;                     /* Index of pPage in pParent->aCell[] */
  int nxDiv;                   /* Next divider slot in pParent->aCell[] */
  int rc;                      /* The return code */
  int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
  int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
  int usableSpace;             /* Bytes in pPage beyond the header */
  int pageFlags;               /* Value of pPage->aData[0] */
  int subtotal;                /* Subtotal of bytes in cells on one page */
  int iSpace1 = 0;             /* First unused byte of aSpace1[] */
  int iSpace2 = 0;             /* First unused byte of aSpace2[] */
  int szScratch;               /* Size of scratch memory requested */
  MemPage *apOld[NB];          /* pPage and up to two siblings */
  Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
  MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
  MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
  u8 *apDiv[NB];               /* Divider cells in pParent */
  int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
  int szNew[NB+2];             /* Combined size of cells placed on i-th page */
  u8 **apCell = 0;             /* All cells being balanced */
  u16 *szCell;                 /* Local size of all cells in apCell[] */
  u8 *aCopy[NB];               /* Space for holding data of apCopy[] */
  u8 *aSpace1;                 /* Space for copies of divider cells before balance */
  u8 *aSpace2 = 0;             /* Space for overflow divider cells after balance */
  u8 *aFrom = 0;               /* Auto-vacuum only: for each cell in apCell[],
                               ** the index into apCopy[] of the page it came
                               ** from, or 0xFF for overflow and divider cells */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  /* 
  ** Find the parent page and make it writable.
  */
  assert( pPage->isInit );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
  pBt = pPage->pBt;
  pParent = pPage->pParent;
  assert( pParent );
  if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
    return rc;
  }

  TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));

#ifndef SQLITE_OMIT_QUICKBALANCE
  /*
  ** A special case:  If a new entry has just been inserted into a
  ** table (that is, a btree with integer keys and all data at the leaves)
  ** and the new entry is the right-most entry in the tree (it has the
  ** largest key) then use the special balance_quick() routine for
  ** balancing.  balance_quick() is much faster and results in a tighter
  ** packing of data in the common case.
  */
  if( pPage->leaf &&
      pPage->intKey &&
      pPage->nOverflow==1 &&
      pPage->aOvfl[0].idx==pPage->nCell &&
      pPage->pParent->pgno!=1 &&
      get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
  ){
    assert( pPage->intKey );
    /*
    ** TODO: Check the siblings to the left of pPage. It may be that
    ** they are not full and no new page is required.
    */
    return balance_quick(pPage, pParent);
  }
#endif

  if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
    return rc;
  }

  /*
  ** Find the cell in the parent page whose left child points back
  ** to pPage.  The "idx" variable is the index of that cell.  If pPage
  ** is the rightmost child of pParent then set idx to pParent->nCell.
  **
  ** When pParent->idxShift is set, pPage->idxParent is not used and the
  ** parent is searched linearly for the child pointer instead.
  */
  if( pParent->idxShift ){
    Pgno pgno;
    pgno = pPage->pgno;
    assert( pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
    for(idx=0; idx<pParent->nCell; idx++){
      if( get4byte(findCell(pParent, idx))==pgno ){
        break;
      }
    }
    assert( idx<pParent->nCell
             || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno );
  }else{
    idx = pPage->idxParent;
  }

  /*
  ** Initialize variables so that it will be safe to jump
  ** directly to balance_cleanup at any moment.  The extra reference
  ** taken on pParent here is dropped by balance_cleanup.
  */
  nOld = nNew = 0;
  sqlite3PagerRef(pParent->pDbPage);

  /*
  ** Find sibling pages to pPage and the cells in pParent that divide
  ** the siblings.  An attempt is made to find NN siblings on either
  ** side of pPage.  More siblings are taken from one side, however, if
  ** there are fewer than NN siblings on the other side.  If pParent
  ** has NB or fewer children then all children of pParent are taken.
  */
  nxDiv = idx - NN;
  if( nxDiv + NB > pParent->nCell ){
    nxDiv = pParent->nCell - NB + 1;
  }
  if( nxDiv<0 ){
    nxDiv = 0;
  }
  nDiv = 0;
  for(i=0, k=nxDiv; i<NB; i++, k++){
    if( k<pParent->nCell ){
      /* Sibling i is the left child of divider cell k */
      apDiv[i] = findCell(pParent, k);
      nDiv++;
      assert( !pParent->leaf );
      pgnoOld[i] = get4byte(apDiv[i]);
    }else if( k==pParent->nCell ){
      /* Sibling i is the right-most child of pParent */
      pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
    }else{
      break;
    }
    rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
    if( rc ) goto balance_cleanup;
    apOld[i]->idxParent = k;
    apCopy[i] = 0;
    assert( i==nOld );
    nOld++;
    /* One slot per cell on the sibling plus one for a divider cell */
    nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
  }

  /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
  ** alignment */
  nMaxCells = (nMaxCells + 3)&~3;

  /*
  ** Allocate space for memory structures.  A single scratch allocation
  ** is carved up, in order, into apCell[], szCell[], the NB page copies
  ** (each a MemPage header followed by a page of data), aSpace1[] and,
  ** for auto-vacuum databases only, aFrom[].  aSpace2[] is obtained
  ** separately from the page allocator.
  */
  szScratch =
       nMaxCells*sizeof(u8*)                       /* apCell */
     + nMaxCells*sizeof(u16)                       /* szCell */
     + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB  /* aCopy */
     + pBt->pageSize                               /* aSpace1 */
     + (ISAUTOVACUUM ? nMaxCells : 0);             /* aFrom */
  apCell = sqlite3ScratchMalloc( szScratch ); 
  if( apCell==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }
  szCell = (u16*)&apCell[nMaxCells];
  aCopy[0] = (u8*)&szCell[nMaxCells];
  assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  for(i=1; i<NB; i++){
    aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
    assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  }
  aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
  assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  if( ISAUTOVACUUM ){
    aFrom = &aSpace1[pBt->pageSize];
  }
  aSpace2 = sqlite3PageMalloc(pBt->pageSize);
  if( aSpace2==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }
  
  /*
  ** Make copies of the content of pPage and its siblings into apCopy[].
  ** The rest of this function will use data from the copies rather
  ** than the original pages since the original pages will be in the
  ** process of being overwritten.
  */
  for(i=0; i<nOld; i++){
    MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
    memcpy(p, apOld[i], sizeof(MemPage));
    /* The copied page image lives immediately after the copied header */
    p->aData = (void*)&p[1];
    memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
  }

  /*
  ** Load pointers to all cells on sibling pages and the divider cells
  ** into the local apCell[] array.  Make copies of the divider cells
  ** into space obtained from aSpace1[] and remove the divider cells
  ** from pParent.
  **
  ** If the siblings are on leaf pages, then the child pointers of the
  ** divider cells are stripped from the cells before they are copied
  ** into aSpace1[].  In this way, all cells in apCell[] are without
  ** child pointers.  If siblings are not leaves, then all cells in
  ** apCell[] include child pointers.  Either way, all cells in apCell[]
  ** are alike.
  **
  ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
  **       leafData:  1 if pPage holds key+data and pParent holds only keys.
  */
  nCell = 0;
  leafCorrection = pPage->leaf*4;
  leafData = pPage->hasData;
  for(i=0; i<nOld; i++){
    MemPage *pOld = apCopy[i];
    int limit = pOld->nCell+pOld->nOverflow;
    for(j=0; j<limit; j++){
      assert( nCell<nMaxCells );
      apCell[nCell] = findOverflowCell(pOld, j);
      szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
      if( ISAUTOVACUUM ){
        int a;
        /* Record which sibling the cell came from.  Cells held in the
        ** aOvfl[] overflow array are tagged 0xFF instead. */
        aFrom[nCell] = i;
        for(a=0; a<pOld->nOverflow; a++){
          if( pOld->aOvfl[a].pCell==apCell[nCell] ){
            aFrom[nCell] = 0xFF;
            break;
          }
        }
      }
      nCell++;
    }
    if( i<nOld-1 ){
      u16 sz = cellSizePtr(pParent, apDiv[i]);
      if( leafData ){
        /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
        ** are duplicates of keys on the child pages.  We need to remove
        ** the divider cells from pParent, but the divider cells are not
        ** added to apCell[] because they are duplicates of child cells.
        */
        dropCell(pParent, nxDiv, sz);
      }else{
        u8 *pTemp;
        assert( nCell<nMaxCells );
        szCell[nCell] = sz;
        pTemp = &aSpace1[iSpace1];
        iSpace1 += sz;
        assert( sz<=pBt->pageSize/4 );
        assert( iSpace1<=pBt->pageSize );
        memcpy(pTemp, apDiv[i], sz);
        /* On leaf siblings, skip over the 4-byte child pointer */
        apCell[nCell] = pTemp+leafCorrection;
        if( ISAUTOVACUUM ){
          aFrom[nCell] = 0xFF;
        }
        dropCell(pParent, nxDiv, sz);
        szCell[nCell] -= leafCorrection;
        assert( get4byte(pTemp)==pgnoOld[i] );
        if( !pOld->leaf ){
          assert( leafCorrection==0 );
          /* The right pointer of the child page pOld becomes the left
          ** pointer of the divider cell */
          memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
        }else{
          assert( leafCorrection==4 );
          if( szCell[nCell]<4 ){
            /* Do not allow any cells smaller than 4 bytes. */
            szCell[nCell] = 4;
          }
        }
        nCell++;
      }
    }
  }

  /*
  ** Figure out the number of pages needed to hold all nCell cells.
  ** Store this number in "k".  Also compute szNew[] which is the total
  ** size of all cells on the i-th page and cntNew[] which is the index
  ** in apCell[] of the cell that divides page i from page i+1.  
  ** The last entry written to cntNew[] equals nCell.
  **
  ** Values computed by this block:
  **
  **           k: The total number of sibling pages
  **    szNew[i]: Space used on the i-th sibling page.
  **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
  **              the right of the i-th sibling page.
  ** usableSpace: Number of bytes of space available on each sibling.
  ** 
  */
  usableSpace = pBt->usableSize - 12 + leafCorrection;
  for(subtotal=k=i=0; i<nCell; i++){
    assert( i<nMaxCells );
    /* Each cell is charged its local size plus 2 bytes */
    subtotal += szCell[i] + 2;
    if( subtotal > usableSpace ){
      szNew[k] = subtotal - szCell[i];
      cntNew[k] = i;
      /* In a LEAFDATA tree there is no divider cell in apCell[], so the
      ** cell that did not fit is revisited as the first cell of the
      ** next page. */
      if( leafData ){ i--; }
      subtotal = 0;
      k++;
    }
  }
  szNew[k] = subtotal;
  cntNew[k] = nCell;
  k++;

  /*
  ** The packing computed by the previous block is biased toward the siblings
  ** on the left side.  The left siblings are always nearly full, while the
  ** right-most sibling might be nearly empty.  This block of code attempts
  ** to adjust the packing of siblings to get a better balance.
  **
  ** This adjustment is more than an optimization.  The packing above might
  ** be so out of balance as to be illegal.  For example, the right-most
  ** sibling might be completely empty.  This adjustment is not optional.
  */
  for(i=k-1; i>0; i--){
    int szRight = szNew[i];  /* Size of sibling on the right */
    int szLeft = szNew[i-1]; /* Size of sibling on the left */
    int r;              /* Index of right-most cell in left sibling */
    int d;              /* Index of first cell to the left of right sibling */

    r = cntNew[i-1] - 1;
    d = r + 1 - leafData;
    assert( d<nMaxCells );
    assert( r<nMaxCells );
    /* Shift cells rightward while the right page is empty or while doing
    ** so does not make the right page larger than the left page */
    while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
      szRight += szCell[d] + 2;
      szLeft -= szCell[r] + 2;
      cntNew[i-1]--;
      r = cntNew[i-1] - 1;
      d = r + 1 - leafData;
    }
    szNew[i] = szRight;
    szNew[i-1] = szLeft;
  }

  /* Either we found one or more cells (cntNew[0]>0) or we are
  ** a virtual root page.  A virtual root page is when the real root
  ** page is page 1 and we are the only child of that page.
  */
  assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );

  /*
  ** Allocate k new pages.  Reuse old pages where possible.  Ownership
  ** of a reused page moves from apOld[] to apNew[]; the apOld[] slot is
  ** zeroed so that balance_cleanup does not release it twice.
  */
  assert( pPage->pgno>1 );
  pageFlags = pPage->aData[0];
  for(i=0; i<k; i++){
    MemPage *pNew;
    if( i<nOld ){
      pNew = apNew[i] = apOld[i];
      pgnoNew[i] = pgnoOld[i];
      apOld[i] = 0;
      rc = sqlite3PagerWrite(pNew->pDbPage);
      /* nNew is bumped before the error check so that cleanup still
      ** releases the page just moved into apNew[] */
      nNew++;
      if( rc ) goto balance_cleanup;
    }else{
      assert( i>0 );
      rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
      if( rc ) goto balance_cleanup;
      apNew[i] = pNew;
      nNew++;
    }
  }

  /* Free any old pages that were not reused as new pages.
  */
  while( i<nOld ){
    rc = freePage(apOld[i]);
    if( rc ) goto balance_cleanup;
    releasePage(apOld[i]);
    apOld[i] = 0;
    i++;
  }

  /*
  ** Put the new pages in ascending order.  This helps to
  ** keep entries in the disk file in order so that a scan
  ** of the table is a linear scan through the file.  That
  ** in turn helps the operating system to deliver pages
  ** from the disk more rapidly.
  **
  ** An O(n^2) insertion sort algorithm is used, but since
  ** n is never more than NB (a small constant), that should
  ** not be a problem.
  **
  ** When NB==3, this one optimization makes the database
  ** about 25% faster for large insertions and deletions.
  */
  for(i=0; i<k-1; i++){
    int minV = pgnoNew[i];
    int minI = i;
    for(j=i+1; j<k; j++){
      if( pgnoNew[j]<(unsigned)minV ){
        minI = j;
        minV = pgnoNew[j];
      }
    }
    if( minI>i ){
      int t;
      MemPage *pT;
      t = pgnoNew[i];
      pT = apNew[i];
      pgnoNew[i] = pgnoNew[minI];
      apNew[i] = apNew[minI];
      pgnoNew[minI] = t;
      apNew[minI] = pT;
    }
  }
  TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
    pgnoOld[0], 
    nOld>=2 ? pgnoOld[1] : 0,
    nOld>=3 ? pgnoOld[2] : 0,
    pgnoNew[0], szNew[0],
    nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
    nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
    nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
    nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));

  /*
  ** Evenly distribute the data in apCell[] across the new pages.
  ** Insert divider cells into pParent as necessary.
  */
  j = 0;
  for(i=0; i<nNew; i++){
    /* Assemble the new sibling page. */
    MemPage *pNew = apNew[i];
    assert( j<nMaxCells );
    assert( pNew->pgno==pgnoNew[i] );
    zeroPage(pNew, pageFlags);
    assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
    assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
    assert( pNew->nOverflow==0 );

    /* If this is an auto-vacuum database, update the pointer map entries
    ** that point to the siblings that were rearranged. These can be: left
    ** children of cells, the right-child of the page, or overflow pages
    ** pointed to by cells.  Cells that stayed on the page they came from
    ** (per aFrom[]) are skipped.
    */
    if( ISAUTOVACUUM ){
      for(k=j; k<cntNew[i]; k++){
        assert( k<nMaxCells );
        if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
          rc = ptrmapPutOvfl(pNew, k-j);
          if( rc==SQLITE_OK && leafCorrection==0 ){
            rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
          }
          if( rc!=SQLITE_OK ){
            goto balance_cleanup;
          }
        }
      }
    }

    j = cntNew[i];

    /* If the sibling page assembled above was not the right-most sibling,
    ** insert a divider cell into the parent page.
    */
    if( i<nNew-1 && j<nCell ){
      u8 *pCell;
      u8 *pTemp;
      int sz;

      assert( j<nMaxCells );
      pCell = apCell[j];
      sz = szCell[j] + leafCorrection;
      pTemp = &aSpace2[iSpace2];
      if( !pNew->leaf ){
        /* The divider's left-child pointer becomes the right-child
        ** pointer of the page just assembled */
        memcpy(&pNew->aData[8], pCell, 4);
        if( ISAUTOVACUUM 
         && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
        ){
          rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
          if( rc!=SQLITE_OK ){
            goto balance_cleanup;
          }
        }
      }else if( leafData ){
        /* If the tree is a leaf-data tree, and the siblings are leaves, 
        ** then there is no divider cell in apCell[]. Instead, the divider 
        ** cell consists of the integer key for the right-most cell of 
        ** the sibling-page assembled above only.
        */
        CellInfo info;
        j--;
        sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
        pCell = pTemp;
        fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
        pTemp = 0;
      }else{
        pCell -= 4;
        /* Obscure case for non-leaf-data trees: If the cell at pCell was
        ** previously stored on a leaf node, and its reported size was 4
        ** bytes, then it may actually be smaller than this 
        ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
        ** any cell). But it is important to pass the correct size to 
        ** insertCell(), so reparse the cell now.
        **
        ** Note that this can never happen in an SQLite data file, as all
        ** cells are at least 4 bytes. It only happens in b-trees used
        ** to evaluate "IN (SELECT ...)" and similar clauses.
        */
        if( szCell[j]==4 ){
          assert(leafCorrection==4);
          sz = cellSizePtr(pParent, pCell);
        }
      }
      iSpace2 += sz;
      assert( sz<=pBt->pageSize/4 );
      assert( iSpace2<=pBt->pageSize );
      rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
      if( rc!=SQLITE_OK ) goto balance_cleanup;
      /* Point the new divider's left child at the page just assembled */
      put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);

      /* If this is an auto-vacuum database, and not a leaf-data tree,
      ** then update the pointer map with an entry for the overflow page
      ** that the cell just inserted points to (if any).
      */
      if( ISAUTOVACUUM && !leafData ){
        rc = ptrmapPutOvfl(pParent, nxDiv);
        if( rc!=SQLITE_OK ){
          goto balance_cleanup;
        }
      }
      j++;
      nxDiv++;
    }

    /* Set the pointer-map entry for the new sibling page. */
    if( ISAUTOVACUUM ){
      rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
      if( rc!=SQLITE_OK ){
        goto balance_cleanup;
      }
    }
  }
  assert( j==nCell );
  assert( nOld>0 );
  assert( nNew>0 );
  if( (pageFlags & PTF_LEAF)==0 ){
    /* Carry the right-child pointer of the last old sibling over to the
    ** last new sibling */
    u8 *zChild = &apCopy[nOld-1]->aData[8];
    memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
    if( ISAUTOVACUUM ){
      rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
      if( rc!=SQLITE_OK ){
        goto balance_cleanup;
      }
    }
  }
  if( nxDiv==pParent->nCell+pParent->nOverflow ){
    /* Right-most sibling is the right-most child of pParent */
    put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
  }else{
    /* Right-most sibling is the left child of the first entry in pParent
    ** past the right-most divider entry */
    put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
  }

  /*
  ** Reparent children of all cells.
  */
  for(i=0; i<nNew; i++){
    rc = reparentChildPages(apNew[i], 0);
    if( rc!=SQLITE_OK ) goto balance_cleanup;
  }
  rc = reparentChildPages(pParent, 0);
  if( rc!=SQLITE_OK ) goto balance_cleanup;

  /*
  ** Balance the parent page.  Note that the current page (pPage) might
  ** have been added to the freelist so it might no longer be initialized.
  ** But the parent page will always be initialized.
  **
  ** The scratch buffer is released before the call to balance(), and
  ** apCell is zeroed so that balance_cleanup does not free it again.
  */
  assert( pParent->isInit );
  sqlite3ScratchFree(apCell);
  apCell = 0;
  rc = balance(pParent, 0);
  
  /*
  ** Cleanup before returning.  Every pointer handled here is either a
  ** live reference or zero.
  */
balance_cleanup:
  sqlite3PageFree(aSpace2);
  sqlite3ScratchFree(apCell);
  for(i=0; i<nOld; i++){
    releasePage(apOld[i]);
  }
  for(i=0; i<nNew; i++){
    releasePage(apNew[i]);
  }
  releasePage(pParent);
  TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
          pPage->pgno, nOld, nNew, nCell));
  return rc;
}
5516 */ 5517 balance_cleanup: 5518 sqlite3PageFree(aSpace2); 5519 sqlite3ScratchFree(apCell); 5520 for(i=0; i<nOld; i++){ 5521 releasePage(apOld[i]); 5522 } 5523 for(i=0; i<nNew; i++){ 5524 releasePage(apNew[i]); 5525 } 5526 releasePage(pParent); 5527 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n", 5528 pPage->pgno, nOld, nNew, nCell)); 5529 return rc; 5530 } 5531 5532 /* 5533 ** This routine is called for the root page of a btree when the root 5534 ** page contains no cells. This is an opportunity to make the tree 5535 ** shallower by one level. 5536 */ 5537 static int balance_shallower(MemPage *pPage){ 5538 MemPage *pChild; /* The only child page of pPage */ 5539 Pgno pgnoChild; /* Page number for pChild */ 5540 int rc = SQLITE_OK; /* Return code from subprocedures */ 5541 BtShared *pBt; /* The main BTree structure */ 5542 int mxCellPerPage; /* Maximum number of cells per page */ 5543 u8 **apCell; /* All cells from pages being balanced */ 5544 u16 *szCell; /* Local size of all cells */ 5545 5546 assert( pPage->pParent==0 ); 5547 assert( pPage->nCell==0 ); 5548 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5549 pBt = pPage->pBt; 5550 mxCellPerPage = MX_CELL(pBt); 5551 apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) ); 5552 if( apCell==0 ) return SQLITE_NOMEM; 5553 szCell = (u16*)&apCell[mxCellPerPage]; 5554 if( pPage->leaf ){ 5555 /* The table is completely empty */ 5556 TRACE(("BALANCE: empty table %d\n", pPage->pgno)); 5557 }else{ 5558 /* The root page is empty but has one child. Transfer the 5559 ** information from that one child into the root page if it 5560 ** will fit. This reduces the depth of the tree by one. 5561 ** 5562 ** If the root page is page 1, it has less space available than 5563 ** its child (due to the 100 byte header that occurs at the beginning 5564 ** of the database fle), so it might not be able to hold all of the 5565 ** information currently contained in the child. 
If this is the 5566 ** case, then do not do the transfer. Leave page 1 empty except 5567 ** for the right-pointer to the child page. The child page becomes 5568 ** the virtual root of the tree. 5569 */ 5570 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]); 5571 assert( pgnoChild>0 ); 5572 assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) ); 5573 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0); 5574 if( rc ) goto end_shallow_balance; 5575 if( pPage->pgno==1 ){ 5576 rc = sqlite3BtreeInitPage(pChild, pPage); 5577 if( rc ) goto end_shallow_balance; 5578 assert( pChild->nOverflow==0 ); 5579 if( pChild->nFree>=100 ){ 5580 /* The child information will fit on the root page, so do the 5581 ** copy */ 5582 int i; 5583 zeroPage(pPage, pChild->aData[0]); 5584 for(i=0; i<pChild->nCell; i++){ 5585 apCell[i] = findCell(pChild,i); 5586 szCell[i] = cellSizePtr(pChild, apCell[i]); 5587 } 5588 assemblePage(pPage, pChild->nCell, apCell, szCell); 5589 /* Copy the right-pointer of the child to the parent. */ 5590 put4byte(&pPage->aData[pPage->hdrOffset+8], 5591 get4byte(&pChild->aData[pChild->hdrOffset+8])); 5592 freePage(pChild); 5593 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno)); 5594 }else{ 5595 /* The child has more information that will fit on the root. 5596 ** The tree is already balanced. Do nothing. 
*/ 5597 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno)); 5598 } 5599 }else{ 5600 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize); 5601 pPage->isInit = 0; 5602 pPage->pParent = 0; 5603 rc = sqlite3BtreeInitPage(pPage, 0); 5604 assert( rc==SQLITE_OK ); 5605 freePage(pChild); 5606 TRACE(("BALANCE: transfer child %d into root %d\n", 5607 pChild->pgno, pPage->pgno)); 5608 } 5609 rc = reparentChildPages(pPage, 1); 5610 assert( pPage->nOverflow==0 ); 5611 if( ISAUTOVACUUM ){ 5612 int i; 5613 for(i=0; i<pPage->nCell; i++){ 5614 rc = ptrmapPutOvfl(pPage, i); 5615 if( rc!=SQLITE_OK ){ 5616 goto end_shallow_balance; 5617 } 5618 } 5619 } 5620 releasePage(pChild); 5621 } 5622 end_shallow_balance: 5623 sqlite3_free(apCell); 5624 return rc; 5625 } 5626 5627 5628 /* 5629 ** The root page is overfull 5630 ** 5631 ** When this happens, Create a new child page and copy the 5632 ** contents of the root into the child. Then make the root 5633 ** page an empty page with rightChild pointing to the new 5634 ** child. Finally, call balance_internal() on the new child 5635 ** to cause it to split. 
5636 */ 5637 static int balance_deeper(MemPage *pPage){ 5638 int rc; /* Return value from subprocedures */ 5639 MemPage *pChild; /* Pointer to a new child page */ 5640 Pgno pgnoChild; /* Page number of the new child page */ 5641 BtShared *pBt; /* The BTree */ 5642 int usableSize; /* Total usable size of a page */ 5643 u8 *data; /* Content of the parent page */ 5644 u8 *cdata; /* Content of the child page */ 5645 int hdr; /* Offset to page header in parent */ 5646 int brk; /* Offset to content of first cell in parent */ 5647 5648 assert( pPage->pParent==0 ); 5649 assert( pPage->nOverflow>0 ); 5650 pBt = pPage->pBt; 5651 assert( sqlite3_mutex_held(pBt->mutex) ); 5652 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0); 5653 if( rc ) return rc; 5654 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 5655 usableSize = pBt->usableSize; 5656 data = pPage->aData; 5657 hdr = pPage->hdrOffset; 5658 brk = get2byte(&data[hdr+5]); 5659 cdata = pChild->aData; 5660 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr); 5661 memcpy(&cdata[brk], &data[brk], usableSize-brk); 5662 if( pChild->isInit ) return SQLITE_CORRUPT; 5663 rc = sqlite3BtreeInitPage(pChild, pPage); 5664 if( rc ) goto balancedeeper_out; 5665 memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0])); 5666 pChild->nOverflow = pPage->nOverflow; 5667 if( pChild->nOverflow ){ 5668 pChild->nFree = 0; 5669 } 5670 assert( pChild->nCell==pPage->nCell ); 5671 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF); 5672 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild); 5673 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno)); 5674 if( ISAUTOVACUUM ){ 5675 int i; 5676 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno); 5677 if( rc ) goto balancedeeper_out; 5678 for(i=0; i<pChild->nCell; i++){ 5679 rc = ptrmapPutOvfl(pChild, i); 5680 if( rc!=SQLITE_OK ){ 5681 goto balancedeeper_out; 5682 } 5683 } 5684 rc = reparentChildPages(pChild, 1); 5685 } 5686 if( 
rc==SQLITE_OK ){ 5687 rc = balance_nonroot(pChild); 5688 } 5689 5690 balancedeeper_out: 5691 releasePage(pChild); 5692 return rc; 5693 } 5694 5695 /* 5696 ** Decide if the page pPage needs to be balanced. If balancing is 5697 ** required, call the appropriate balancing routine. 5698 */ 5699 static int balance(MemPage *pPage, int insert){ 5700 int rc = SQLITE_OK; 5701 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5702 if( pPage->pParent==0 ){ 5703 rc = sqlite3PagerWrite(pPage->pDbPage); 5704 if( rc==SQLITE_OK && pPage->nOverflow>0 ){ 5705 rc = balance_deeper(pPage); 5706 } 5707 if( rc==SQLITE_OK && pPage->nCell==0 ){ 5708 rc = balance_shallower(pPage); 5709 } 5710 }else{ 5711 if( pPage->nOverflow>0 || 5712 (!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){ 5713 rc = balance_nonroot(pPage); 5714 } 5715 } 5716 return rc; 5717 } 5718 5719 /* 5720 ** This routine checks all cursors that point to table pgnoRoot. 5721 ** If any of those cursors were opened with wrFlag==0 in a different 5722 ** database connection (a database connection that shares the pager 5723 ** cache with the current connection) and that other connection 5724 ** is not in the ReadUncommmitted state, then this routine returns 5725 ** SQLITE_LOCKED. 5726 ** 5727 ** As well as cursors with wrFlag==0, cursors with wrFlag==1 and 5728 ** isIncrblobHandle==1 are also considered 'read' cursors. Incremental 5729 ** blob cursors are used for both reading and writing. 5730 ** 5731 ** When pgnoRoot is the root page of an intkey table, this function is also 5732 ** responsible for invalidating incremental blob cursors when the table row 5733 ** on which they are opened is deleted or modified. Cursors are invalidated 5734 ** according to the following rules: 5735 ** 5736 ** 1) When BtreeClearTable() is called to completely delete the contents 5737 ** of a B-Tree table, pExclude is set to zero and parameter iRow is 5738 ** set to non-zero. 
In this case all incremental blob cursors open 5739 ** on the table rooted at pgnoRoot are invalidated. 5740 ** 5741 ** 2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to 5742 ** modify a table row via an SQL statement, pExclude is set to the 5743 ** write cursor used to do the modification and parameter iRow is set 5744 ** to the integer row id of the B-Tree entry being modified. Unless 5745 ** pExclude is itself an incremental blob cursor, then all incremental 5746 ** blob cursors open on row iRow of the B-Tree are invalidated. 5747 ** 5748 ** 3) If both pExclude and iRow are set to zero, no incremental blob 5749 ** cursors are invalidated. 5750 */ 5751 static int checkReadLocks( 5752 Btree *pBtree, 5753 Pgno pgnoRoot, 5754 BtCursor *pExclude, 5755 i64 iRow 5756 ){ 5757 BtCursor *p; 5758 BtShared *pBt = pBtree->pBt; 5759 sqlite3 *db = pBtree->db; 5760 assert( sqlite3BtreeHoldsMutex(pBtree) ); 5761 for(p=pBt->pCursor; p; p=p->pNext){ 5762 if( p==pExclude ) continue; 5763 if( p->pgnoRoot!=pgnoRoot ) continue; 5764 #ifndef SQLITE_OMIT_INCRBLOB 5765 if( p->isIncrblobHandle && ( 5766 (!pExclude && iRow) 5767 || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow) 5768 )){ 5769 p->eState = CURSOR_INVALID; 5770 } 5771 #endif 5772 if( p->eState!=CURSOR_VALID ) continue; 5773 if( p->wrFlag==0 5774 #ifndef SQLITE_OMIT_INCRBLOB 5775 || p->isIncrblobHandle 5776 #endif 5777 ){ 5778 sqlite3 *dbOther = p->pBtree->db; 5779 if( dbOther==0 || 5780 (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){ 5781 return SQLITE_LOCKED; 5782 } 5783 } 5784 } 5785 return SQLITE_OK; 5786 } 5787 5788 /* 5789 ** Insert a new record into the BTree. The key is given by (pKey,nKey) 5790 ** and the data is given by (pData,nData). The cursor is used only to 5791 ** define what table the record should be inserted into. The cursor 5792 ** is left pointing at a random location. 5793 ** 5794 ** For an INTKEY table, only the nKey value of the key is used. 
pKey is 5795 ** ignored. For a ZERODATA table, the pData and nData are both ignored. 5796 */ 5797 int sqlite3BtreeInsert( 5798 BtCursor *pCur, /* Insert data into the table of this cursor */ 5799 const void *pKey, i64 nKey, /* The key of the new record */ 5800 const void *pData, int nData, /* The data of the new record */ 5801 int nZero, /* Number of extra 0 bytes to append to data */ 5802 int appendBias /* True if this is likely an append */ 5803 ){ 5804 int rc; 5805 int loc; 5806 int szNew; 5807 MemPage *pPage; 5808 Btree *p = pCur->pBtree; 5809 BtShared *pBt = p->pBt; 5810 unsigned char *oldCell; 5811 unsigned char *newCell = 0; 5812 5813 assert( cursorHoldsMutex(pCur) ); 5814 if( pBt->inTransaction!=TRANS_WRITE ){ 5815 /* Must start a transaction before doing an insert */ 5816 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 5817 return rc; 5818 } 5819 assert( !pBt->readOnly ); 5820 if( !pCur->wrFlag ){ 5821 return SQLITE_PERM; /* Cursor not open for writing */ 5822 } 5823 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){ 5824 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 5825 } 5826 if( pCur->eState==CURSOR_FAULT ){ 5827 return pCur->skip; 5828 } 5829 5830 /* Save the positions of any other cursors open on this table */ 5831 clearCursorPosition(pCur); 5832 if( 5833 SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) || 5834 SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, 0, nKey, appendBias, &loc)) 5835 ){ 5836 return rc; 5837 } 5838 5839 pPage = pCur->pPage; 5840 assert( pPage->intKey || nKey>=0 ); 5841 assert( pPage->leaf || !pPage->intKey ); 5842 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", 5843 pCur->pgnoRoot, nKey, nData, pPage->pgno, 5844 loc==0 ? 
"overwrite" : "new entry")); 5845 assert( pPage->isInit ); 5846 allocateTempSpace(pBt); 5847 newCell = pBt->pTmpSpace; 5848 if( newCell==0 ) return SQLITE_NOMEM; 5849 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); 5850 if( rc ) goto end_insert; 5851 assert( szNew==cellSizePtr(pPage, newCell) ); 5852 assert( szNew<=MX_CELL_SIZE(pBt) ); 5853 if( loc==0 && CURSOR_VALID==pCur->eState ){ 5854 u16 szOld; 5855 assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); 5856 rc = sqlite3PagerWrite(pPage->pDbPage); 5857 if( rc ){ 5858 goto end_insert; 5859 } 5860 oldCell = findCell(pPage, pCur->idx); 5861 if( !pPage->leaf ){ 5862 memcpy(newCell, oldCell, 4); 5863 } 5864 szOld = cellSizePtr(pPage, oldCell); 5865 rc = clearCell(pPage, oldCell); 5866 if( rc ) goto end_insert; 5867 dropCell(pPage, pCur->idx, szOld); 5868 }else if( loc<0 && pPage->nCell>0 ){ 5869 assert( pPage->leaf ); 5870 pCur->idx++; 5871 pCur->info.nSize = 0; 5872 pCur->validNKey = 0; 5873 }else{ 5874 assert( pPage->leaf ); 5875 } 5876 rc = insertCell(pPage, pCur->idx, newCell, szNew, 0, 0); 5877 if( rc!=SQLITE_OK ) goto end_insert; 5878 rc = balance(pPage, 1); 5879 if( rc==SQLITE_OK ){ 5880 moveToRoot(pCur); 5881 } 5882 end_insert: 5883 return rc; 5884 } 5885 5886 /* 5887 ** Delete the entry that the cursor is pointing to. The cursor 5888 ** is left pointing at a random location. 5889 */ 5890 int sqlite3BtreeDelete(BtCursor *pCur){ 5891 MemPage *pPage = pCur->pPage; 5892 unsigned char *pCell; 5893 int rc; 5894 Pgno pgnoChild = 0; 5895 Btree *p = pCur->pBtree; 5896 BtShared *pBt = p->pBt; 5897 5898 assert( cursorHoldsMutex(pCur) ); 5899 assert( pPage->isInit ); 5900 if( pBt->inTransaction!=TRANS_WRITE ){ 5901 /* Must start a transaction before doing a delete */ 5902 rc = pBt->readOnly ? 
SQLITE_READONLY : SQLITE_ERROR; 5903 return rc; 5904 } 5905 assert( !pBt->readOnly ); 5906 if( pCur->eState==CURSOR_FAULT ){ 5907 return pCur->skip; 5908 } 5909 if( pCur->idx >= pPage->nCell ){ 5910 return SQLITE_ERROR; /* The cursor is not pointing to anything */ 5911 } 5912 if( !pCur->wrFlag ){ 5913 return SQLITE_PERM; /* Did not open this cursor for writing */ 5914 } 5915 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){ 5916 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 5917 } 5918 5919 /* Restore the current cursor position (a no-op if the cursor is not in 5920 ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors 5921 ** open on the same table. Then call sqlite3PagerWrite() on the page 5922 ** that the entry will be deleted from. 5923 */ 5924 if( 5925 (rc = restoreCursorPosition(pCur))!=0 || 5926 (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 || 5927 (rc = sqlite3PagerWrite(pPage->pDbPage))!=0 5928 ){ 5929 return rc; 5930 } 5931 5932 /* Locate the cell within its page and leave pCell pointing to the 5933 ** data. The clearCell() call frees any overflow pages associated with the 5934 ** cell. The cell itself is still intact. 5935 */ 5936 pCell = findCell(pPage, pCur->idx); 5937 if( !pPage->leaf ){ 5938 pgnoChild = get4byte(pCell); 5939 } 5940 rc = clearCell(pPage, pCell); 5941 if( rc ){ 5942 return rc; 5943 } 5944 5945 if( !pPage->leaf ){ 5946 /* 5947 ** The entry we are about to delete is not a leaf so if we do not 5948 ** do something we will leave a hole on an internal page. 5949 ** We have to fill the hole by moving in a cell from a leaf. The 5950 ** next Cell after the one to be deleted is guaranteed to exist and 5951 ** to be a leaf so we can use it. 
5952 */ 5953 BtCursor leafCur; 5954 unsigned char *pNext; 5955 int notUsed; 5956 unsigned char *tempCell = 0; 5957 assert( !pPage->intKey ); 5958 sqlite3BtreeGetTempCursor(pCur, &leafCur); 5959 rc = sqlite3BtreeNext(&leafCur, ¬Used); 5960 if( rc==SQLITE_OK ){ 5961 rc = sqlite3PagerWrite(leafCur.pPage->pDbPage); 5962 } 5963 if( rc==SQLITE_OK ){ 5964 u16 szNext; 5965 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n", 5966 pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno)); 5967 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell)); 5968 pNext = findCell(leafCur.pPage, leafCur.idx); 5969 szNext = cellSizePtr(leafCur.pPage, pNext); 5970 assert( MX_CELL_SIZE(pBt)>=szNext+4 ); 5971 allocateTempSpace(pBt); 5972 tempCell = pBt->pTmpSpace; 5973 if( tempCell==0 ){ 5974 rc = SQLITE_NOMEM; 5975 } 5976 if( rc==SQLITE_OK ){ 5977 rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0); 5978 } 5979 if( rc==SQLITE_OK ){ 5980 put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild); 5981 rc = balance(pPage, 0); 5982 } 5983 if( rc==SQLITE_OK ){ 5984 dropCell(leafCur.pPage, leafCur.idx, szNext); 5985 rc = balance(leafCur.pPage, 0); 5986 } 5987 } 5988 sqlite3BtreeReleaseTempCursor(&leafCur); 5989 }else{ 5990 TRACE(("DELETE: table=%d delete from leaf %d\n", 5991 pCur->pgnoRoot, pPage->pgno)); 5992 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell)); 5993 rc = balance(pPage, 0); 5994 } 5995 if( rc==SQLITE_OK ){ 5996 moveToRoot(pCur); 5997 } 5998 return rc; 5999 } 6000 6001 /* 6002 ** Create a new BTree table. Write into *piTable the page 6003 ** number for the root page of the new table. 6004 ** 6005 ** The type of type is determined by the flags parameter. Only the 6006 ** following values of flags are currently in use. 
Other values for 6007 ** flags might not work: 6008 ** 6009 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 6010 ** BTREE_ZERODATA Used for SQL indices 6011 */ 6012 static int btreeCreateTable(Btree *p, int *piTable, int flags){ 6013 BtShared *pBt = p->pBt; 6014 MemPage *pRoot; 6015 Pgno pgnoRoot; 6016 int rc; 6017 6018 assert( sqlite3BtreeHoldsMutex(p) ); 6019 if( pBt->inTransaction!=TRANS_WRITE ){ 6020 /* Must start a transaction first */ 6021 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 6022 return rc; 6023 } 6024 assert( !pBt->readOnly ); 6025 6026 #ifdef SQLITE_OMIT_AUTOVACUUM 6027 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 6028 if( rc ){ 6029 return rc; 6030 } 6031 #else 6032 if( pBt->autoVacuum ){ 6033 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 6034 MemPage *pPageMove; /* The page to move to. */ 6035 6036 /* Creating a new table may probably require moving an existing database 6037 ** to make room for the new tables root page. In case this page turns 6038 ** out to be an overflow page, delete all overflow page-map caches 6039 ** held by open cursors. 6040 */ 6041 invalidateAllOverflowCache(pBt); 6042 6043 /* Read the value of meta[3] from the database to determine where the 6044 ** root page of the new table should go. meta[3] is the largest root-page 6045 ** created so far, so the new root-page is (meta[3]+1). 6046 */ 6047 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot); 6048 if( rc!=SQLITE_OK ){ 6049 return rc; 6050 } 6051 pgnoRoot++; 6052 6053 /* The new root-page may not be allocated on a pointer-map page, or the 6054 ** PENDING_BYTE page. 6055 */ 6056 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 6057 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 6058 pgnoRoot++; 6059 } 6060 assert( pgnoRoot>=3 ); 6061 6062 /* Allocate a page. The page that currently resides at pgnoRoot will 6063 ** be moved to the allocated page (unless the allocated page happens 6064 ** to reside at pgnoRoot). 
6065 */ 6066 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1); 6067 if( rc!=SQLITE_OK ){ 6068 return rc; 6069 } 6070 6071 if( pgnoMove!=pgnoRoot ){ 6072 /* pgnoRoot is the page that will be used for the root-page of 6073 ** the new table (assuming an error did not occur). But we were 6074 ** allocated pgnoMove. If required (i.e. if it was not allocated 6075 ** by extending the file), the current page at position pgnoMove 6076 ** is already journaled. 6077 */ 6078 u8 eType; 6079 Pgno iPtrPage; 6080 6081 releasePage(pPageMove); 6082 6083 /* Move the page currently at pgnoRoot to pgnoMove. */ 6084 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); 6085 if( rc!=SQLITE_OK ){ 6086 return rc; 6087 } 6088 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 6089 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 6090 releasePage(pRoot); 6091 return rc; 6092 } 6093 assert( eType!=PTRMAP_ROOTPAGE ); 6094 assert( eType!=PTRMAP_FREEPAGE ); 6095 rc = sqlite3PagerWrite(pRoot->pDbPage); 6096 if( rc!=SQLITE_OK ){ 6097 releasePage(pRoot); 6098 return rc; 6099 } 6100 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); 6101 releasePage(pRoot); 6102 6103 /* Obtain the page at pgnoRoot */ 6104 if( rc!=SQLITE_OK ){ 6105 return rc; 6106 } 6107 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); 6108 if( rc!=SQLITE_OK ){ 6109 return rc; 6110 } 6111 rc = sqlite3PagerWrite(pRoot->pDbPage); 6112 if( rc!=SQLITE_OK ){ 6113 releasePage(pRoot); 6114 return rc; 6115 } 6116 }else{ 6117 pRoot = pPageMove; 6118 } 6119 6120 /* Update the pointer-map and meta-data with the new root-page number. 
*/ 6121 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0); 6122 if( rc ){ 6123 releasePage(pRoot); 6124 return rc; 6125 } 6126 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 6127 if( rc ){ 6128 releasePage(pRoot); 6129 return rc; 6130 } 6131 6132 }else{ 6133 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 6134 if( rc ) return rc; 6135 } 6136 #endif 6137 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 6138 zeroPage(pRoot, flags | PTF_LEAF); 6139 sqlite3PagerUnref(pRoot->pDbPage); 6140 *piTable = (int)pgnoRoot; 6141 return SQLITE_OK; 6142 } 6143 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ 6144 int rc; 6145 sqlite3BtreeEnter(p); 6146 p->pBt->db = p->db; 6147 rc = btreeCreateTable(p, piTable, flags); 6148 sqlite3BtreeLeave(p); 6149 return rc; 6150 } 6151 6152 /* 6153 ** Erase the given database page and all its children. Return 6154 ** the page to the freelist. 6155 */ 6156 static int clearDatabasePage( 6157 BtShared *pBt, /* The BTree that contains the table */ 6158 Pgno pgno, /* Page number to clear */ 6159 MemPage *pParent, /* Parent page. 
NULL for the root */ 6160 int freePageFlag /* Deallocate page if true */ 6161 ){ 6162 MemPage *pPage = 0; 6163 int rc; 6164 unsigned char *pCell; 6165 int i; 6166 6167 assert( sqlite3_mutex_held(pBt->mutex) ); 6168 if( pgno>pagerPagecount(pBt->pPager) ){ 6169 return SQLITE_CORRUPT_BKPT; 6170 } 6171 6172 rc = getAndInitPage(pBt, pgno, &pPage, pParent); 6173 if( rc ) goto cleardatabasepage_out; 6174 for(i=0; i<pPage->nCell; i++){ 6175 pCell = findCell(pPage, i); 6176 if( !pPage->leaf ){ 6177 rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1); 6178 if( rc ) goto cleardatabasepage_out; 6179 } 6180 rc = clearCell(pPage, pCell); 6181 if( rc ) goto cleardatabasepage_out; 6182 } 6183 if( !pPage->leaf ){ 6184 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1); 6185 if( rc ) goto cleardatabasepage_out; 6186 } 6187 if( freePageFlag ){ 6188 rc = freePage(pPage); 6189 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 6190 zeroPage(pPage, pPage->aData[0] | PTF_LEAF); 6191 } 6192 6193 cleardatabasepage_out: 6194 releasePage(pPage); 6195 return rc; 6196 } 6197 6198 /* 6199 ** Delete all information from a single table in the database. iTable is 6200 ** the page number of the root of the table. After this routine returns, 6201 ** the root page is empty, but still exists. 6202 ** 6203 ** This routine will fail with SQLITE_LOCKED if there are any open 6204 ** read cursors on the table. Open write cursors are moved to the 6205 ** root of the table. 6206 */ 6207 int sqlite3BtreeClearTable(Btree *p, int iTable){ 6208 int rc; 6209 BtShared *pBt = p->pBt; 6210 sqlite3BtreeEnter(p); 6211 pBt->db = p->db; 6212 if( p->inTrans!=TRANS_WRITE ){ 6213 rc = pBt->readOnly ? 
SQLITE_READONLY : SQLITE_ERROR; 6214 }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){ 6215 /* nothing to do */ 6216 }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){ 6217 /* nothing to do */ 6218 }else{ 6219 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0); 6220 } 6221 sqlite3BtreeLeave(p); 6222 return rc; 6223 } 6224 6225 /* 6226 ** Erase all information in a table and add the root of the table to 6227 ** the freelist. Except, the root of the principle table (the one on 6228 ** page 1) is never added to the freelist. 6229 ** 6230 ** This routine will fail with SQLITE_LOCKED if there are any open 6231 ** cursors on the table. 6232 ** 6233 ** If AUTOVACUUM is enabled and the page at iTable is not the last 6234 ** root page in the database file, then the last root page 6235 ** in the database file is moved into the slot formerly occupied by 6236 ** iTable and that last slot formerly occupied by the last root page 6237 ** is added to the freelist instead of iTable. In this say, all 6238 ** root pages are kept at the beginning of the database file, which 6239 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 6240 ** page number that used to be the last root page in the file before 6241 ** the move. If no page gets moved, *piMoved is set to 0. 6242 ** The last root page is recorded in meta[3] and the value of 6243 ** meta[3] is updated by this procedure. 6244 */ 6245 static int btreeDropTable(Btree *p, int iTable, int *piMoved){ 6246 int rc; 6247 MemPage *pPage = 0; 6248 BtShared *pBt = p->pBt; 6249 6250 assert( sqlite3BtreeHoldsMutex(p) ); 6251 if( p->inTrans!=TRANS_WRITE ){ 6252 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 6253 } 6254 6255 /* It is illegal to drop a table if any cursors are open on the 6256 ** database. This is because in auto-vacuum mode the backend may 6257 ** need to move another root-page to fill a gap left by the deleted 6258 ** root page. 
If an open cursor was using this page a problem would 6259 ** occur. 6260 */ 6261 if( pBt->pCursor ){ 6262 return SQLITE_LOCKED; 6263 } 6264 6265 rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 6266 if( rc ) return rc; 6267 rc = sqlite3BtreeClearTable(p, iTable); 6268 if( rc ){ 6269 releasePage(pPage); 6270 return rc; 6271 } 6272 6273 *piMoved = 0; 6274 6275 if( iTable>1 ){ 6276 #ifdef SQLITE_OMIT_AUTOVACUUM 6277 rc = freePage(pPage); 6278 releasePage(pPage); 6279 #else 6280 if( pBt->autoVacuum ){ 6281 Pgno maxRootPgno; 6282 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno); 6283 if( rc!=SQLITE_OK ){ 6284 releasePage(pPage); 6285 return rc; 6286 } 6287 6288 if( iTable==maxRootPgno ){ 6289 /* If the table being dropped is the table with the largest root-page 6290 ** number in the database, put the root page on the free list. 6291 */ 6292 rc = freePage(pPage); 6293 releasePage(pPage); 6294 if( rc!=SQLITE_OK ){ 6295 return rc; 6296 } 6297 }else{ 6298 /* The table being dropped does not have the largest root-page 6299 ** number in the database. So move the page that does into the 6300 ** gap left by the deleted root-page. 6301 */ 6302 MemPage *pMove; 6303 releasePage(pPage); 6304 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); 6305 if( rc!=SQLITE_OK ){ 6306 return rc; 6307 } 6308 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); 6309 releasePage(pMove); 6310 if( rc!=SQLITE_OK ){ 6311 return rc; 6312 } 6313 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); 6314 if( rc!=SQLITE_OK ){ 6315 return rc; 6316 } 6317 rc = freePage(pMove); 6318 releasePage(pMove); 6319 if( rc!=SQLITE_OK ){ 6320 return rc; 6321 } 6322 *piMoved = maxRootPgno; 6323 } 6324 6325 /* Set the new 'max-root-page' value in the database header. This 6326 ** is the old value less one, less one more if that happens to 6327 ** be a root-page number, less one again if that is the 6328 ** PENDING_BYTE_PAGE. 
6329 */ 6330 maxRootPgno--; 6331 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){ 6332 maxRootPgno--; 6333 } 6334 if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){ 6335 maxRootPgno--; 6336 } 6337 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 6338 6339 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 6340 }else{ 6341 rc = freePage(pPage); 6342 releasePage(pPage); 6343 } 6344 #endif 6345 }else{ 6346 /* If sqlite3BtreeDropTable was called on page 1. */ 6347 zeroPage(pPage, PTF_INTKEY|PTF_LEAF ); 6348 releasePage(pPage); 6349 } 6350 return rc; 6351 } 6352 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 6353 int rc; 6354 sqlite3BtreeEnter(p); 6355 p->pBt->db = p->db; 6356 rc = btreeDropTable(p, iTable, piMoved); 6357 sqlite3BtreeLeave(p); 6358 return rc; 6359 } 6360 6361 6362 /* 6363 ** Read the meta-information out of a database file. Meta[0] 6364 ** is the number of free pages currently in the database. Meta[1] 6365 ** through meta[15] are available for use by higher layers. Meta[0] 6366 ** is read-only, the others are read/write. 6367 ** 6368 ** The schema layer numbers meta values differently. At the schema 6369 ** layer (and the SetCookie and ReadCookie opcodes) the number of 6370 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 6371 */ 6372 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 6373 DbPage *pDbPage; 6374 int rc; 6375 unsigned char *pP1; 6376 BtShared *pBt = p->pBt; 6377 6378 sqlite3BtreeEnter(p); 6379 pBt->db = p->db; 6380 6381 /* Reading a meta-data value requires a read-lock on page 1 (and hence 6382 ** the sqlite_master table. We grab this lock regardless of whether or 6383 ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page 6384 ** 1 is treated as a special case by queryTableLock() and lockTable()). 
6385 */ 6386 rc = queryTableLock(p, 1, READ_LOCK); 6387 if( rc!=SQLITE_OK ){ 6388 sqlite3BtreeLeave(p); 6389 return rc; 6390 } 6391 6392 assert( idx>=0 && idx<=15 ); 6393 rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage); 6394 if( rc ){ 6395 sqlite3BtreeLeave(p); 6396 return rc; 6397 } 6398 pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage); 6399 *pMeta = get4byte(&pP1[36 + idx*4]); 6400 sqlite3PagerUnref(pDbPage); 6401 6402 /* If autovacuumed is disabled in this build but we are trying to 6403 ** access an autovacuumed database, then make the database readonly. 6404 */ 6405 #ifdef SQLITE_OMIT_AUTOVACUUM 6406 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1; 6407 #endif 6408 6409 /* Grab the read-lock on page 1. */ 6410 rc = lockTable(p, 1, READ_LOCK); 6411 sqlite3BtreeLeave(p); 6412 return rc; 6413 } 6414 6415 /* 6416 ** Write meta-information back into the database. Meta[0] is 6417 ** read-only and may not be written. 6418 */ 6419 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 6420 BtShared *pBt = p->pBt; 6421 unsigned char *pP1; 6422 int rc; 6423 assert( idx>=1 && idx<=15 ); 6424 sqlite3BtreeEnter(p); 6425 pBt->db = p->db; 6426 if( p->inTrans!=TRANS_WRITE ){ 6427 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 6428 }else{ 6429 assert( pBt->pPage1!=0 ); 6430 pP1 = pBt->pPage1->aData; 6431 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 6432 if( rc==SQLITE_OK ){ 6433 put4byte(&pP1[36 + idx*4], iMeta); 6434 #ifndef SQLITE_OMIT_AUTOVACUUM 6435 if( idx==7 ){ 6436 assert( pBt->autoVacuum || iMeta==0 ); 6437 assert( iMeta==0 || iMeta==1 ); 6438 pBt->incrVacuum = iMeta; 6439 } 6440 #endif 6441 } 6442 } 6443 sqlite3BtreeLeave(p); 6444 return rc; 6445 } 6446 6447 /* 6448 ** Return the flag byte at the beginning of the page that the cursor 6449 ** is currently pointing to. 6450 */ 6451 int sqlite3BtreeFlags(BtCursor *pCur){ 6452 /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call 6453 ** restoreCursorPosition() here. 
6454 */ 6455 MemPage *pPage; 6456 restoreCursorPosition(pCur); 6457 pPage = pCur->pPage; 6458 assert( cursorHoldsMutex(pCur) ); 6459 assert( pPage->pBt==pCur->pBt ); 6460 return pPage ? pPage->aData[pPage->hdrOffset] : 0; 6461 } 6462 6463 6464 /* 6465 ** Return the pager associated with a BTree. This routine is used for 6466 ** testing and debugging only. 6467 */ 6468 Pager *sqlite3BtreePager(Btree *p){ 6469 return p->pBt->pPager; 6470 } 6471 6472 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6473 /* 6474 ** Append a message to the error message string. 6475 */ 6476 static void checkAppendMsg( 6477 IntegrityCk *pCheck, 6478 char *zMsg1, 6479 const char *zFormat, 6480 ... 6481 ){ 6482 va_list ap; 6483 if( !pCheck->mxErr ) return; 6484 pCheck->mxErr--; 6485 pCheck->nErr++; 6486 va_start(ap, zFormat); 6487 if( pCheck->errMsg.nChar ){ 6488 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1); 6489 } 6490 if( zMsg1 ){ 6491 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1); 6492 } 6493 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap); 6494 va_end(ap); 6495 } 6496 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6497 6498 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6499 /* 6500 ** Add 1 to the reference count for page iPage. If this is the second 6501 ** reference to the page, add an error message to pCheck->zErrMsg. 6502 ** Return 1 if there are 2 ore more references to the page and 0 if 6503 ** if this is the first reference to the page. 6504 ** 6505 ** Also check that the page number is in bounds. 
6506 */ 6507 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){ 6508 if( iPage==0 ) return 1; 6509 if( iPage>pCheck->nPage || iPage<0 ){ 6510 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage); 6511 return 1; 6512 } 6513 if( pCheck->anRef[iPage]==1 ){ 6514 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage); 6515 return 1; 6516 } 6517 return (pCheck->anRef[iPage]++)>1; 6518 } 6519 6520 #ifndef SQLITE_OMIT_AUTOVACUUM 6521 /* 6522 ** Check that the entry in the pointer-map for page iChild maps to 6523 ** page iParent, pointer type ptrType. If not, append an error message 6524 ** to pCheck. 6525 */ 6526 static void checkPtrmap( 6527 IntegrityCk *pCheck, /* Integrity check context */ 6528 Pgno iChild, /* Child page number */ 6529 u8 eType, /* Expected pointer map type */ 6530 Pgno iParent, /* Expected pointer map parent page number */ 6531 char *zContext /* Context description (used for error msg) */ 6532 ){ 6533 int rc; 6534 u8 ePtrmapType; 6535 Pgno iPtrmapParent; 6536 6537 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 6538 if( rc!=SQLITE_OK ){ 6539 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild); 6540 return; 6541 } 6542 6543 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 6544 checkAppendMsg(pCheck, zContext, 6545 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 6546 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 6547 } 6548 } 6549 #endif 6550 6551 /* 6552 ** Check the integrity of the freelist or of an overflow page list. 6553 ** Verify that the number of pages on the list is N. 6554 */ 6555 static void checkList( 6556 IntegrityCk *pCheck, /* Integrity checking context */ 6557 int isFreeList, /* True for a freelist. 
False for overflow page list */ 6558 int iPage, /* Page number for first page in the list */ 6559 int N, /* Expected number of pages in the list */ 6560 char *zContext /* Context for error messages */ 6561 ){ 6562 int i; 6563 int expected = N; 6564 int iFirst = iPage; 6565 while( N-- > 0 && pCheck->mxErr ){ 6566 DbPage *pOvflPage; 6567 unsigned char *pOvflData; 6568 if( iPage<1 ){ 6569 checkAppendMsg(pCheck, zContext, 6570 "%d of %d pages missing from overflow list starting at %d", 6571 N+1, expected, iFirst); 6572 break; 6573 } 6574 if( checkRef(pCheck, iPage, zContext) ) break; 6575 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){ 6576 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage); 6577 break; 6578 } 6579 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 6580 if( isFreeList ){ 6581 int n = get4byte(&pOvflData[4]); 6582 #ifndef SQLITE_OMIT_AUTOVACUUM 6583 if( pCheck->pBt->autoVacuum ){ 6584 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext); 6585 } 6586 #endif 6587 if( n>pCheck->pBt->usableSize/4-2 ){ 6588 checkAppendMsg(pCheck, zContext, 6589 "freelist leaf count too big on page %d", iPage); 6590 N--; 6591 }else{ 6592 for(i=0; i<n; i++){ 6593 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 6594 #ifndef SQLITE_OMIT_AUTOVACUUM 6595 if( pCheck->pBt->autoVacuum ){ 6596 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext); 6597 } 6598 #endif 6599 checkRef(pCheck, iFreePage, zContext); 6600 } 6601 N -= n; 6602 } 6603 } 6604 #ifndef SQLITE_OMIT_AUTOVACUUM 6605 else{ 6606 /* If this database supports auto-vacuum and iPage is not the last 6607 ** page in this overflow list, check that the pointer-map entry for 6608 ** the following page matches iPage. 
6609 */ 6610 if( pCheck->pBt->autoVacuum && N>0 ){ 6611 i = get4byte(pOvflData); 6612 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext); 6613 } 6614 } 6615 #endif 6616 iPage = get4byte(pOvflData); 6617 sqlite3PagerUnref(pOvflPage); 6618 } 6619 } 6620 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6621 6622 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6623 /* 6624 ** Do various sanity checks on a single page of a tree. Return 6625 ** the tree depth. Root pages return 0. Parents of root pages 6626 ** return 1, and so forth. 6627 ** 6628 ** These checks are done: 6629 ** 6630 ** 1. Make sure that cells and freeblocks do not overlap 6631 ** but combine to completely cover the page. 6632 ** NO 2. Make sure cell keys are in order. 6633 ** NO 3. Make sure no key is less than or equal to zLowerBound. 6634 ** NO 4. Make sure no key is greater than or equal to zUpperBound. 6635 ** 5. Check the integrity of overflow pages. 6636 ** 6. Recursively call checkTreePage on all children. 6637 ** 7. Verify that the depth of all children is the same. 6638 ** 8. Make sure this page is at least 33% full or else it is 6639 ** the root of the tree. 6640 */ 6641 static int checkTreePage( 6642 IntegrityCk *pCheck, /* Context for the sanity check */ 6643 int iPage, /* Page number of the page to check */ 6644 MemPage *pParent, /* Parent page */ 6645 char *zParentContext /* Parent context */ 6646 ){ 6647 MemPage *pPage; 6648 int i, rc, depth, d2, pgno, cnt; 6649 int hdr, cellStart; 6650 int nCell; 6651 u8 *data; 6652 BtShared *pBt; 6653 int usableSize; 6654 char zContext[100]; 6655 char *hit; 6656 6657 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage); 6658 6659 /* Check that the page exists 6660 */ 6661 pBt = pCheck->pBt; 6662 usableSize = pBt->usableSize; 6663 if( iPage==0 ) return 0; 6664 if( checkRef(pCheck, iPage, zParentContext) ) return 0; 6665 if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ 6666 checkAppendMsg(pCheck, zContext, 6667 "unable to get the page. 
error code=%d", rc); 6668 return 0; 6669 } 6670 if( (rc = sqlite3BtreeInitPage(pPage, pParent))!=0 ){ 6671 checkAppendMsg(pCheck, zContext, 6672 "sqlite3BtreeInitPage() returns error code %d", rc); 6673 releasePage(pPage); 6674 return 0; 6675 } 6676 6677 /* Check out all the cells. 6678 */ 6679 depth = 0; 6680 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){ 6681 u8 *pCell; 6682 int sz; 6683 CellInfo info; 6684 6685 /* Check payload overflow pages 6686 */ 6687 sqlite3_snprintf(sizeof(zContext), zContext, 6688 "On tree page %d cell %d: ", iPage, i); 6689 pCell = findCell(pPage,i); 6690 sqlite3BtreeParseCellPtr(pPage, pCell, &info); 6691 sz = info.nData; 6692 if( !pPage->intKey ) sz += info.nKey; 6693 assert( sz==info.nPayload ); 6694 if( sz>info.nLocal ){ 6695 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); 6696 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); 6697 #ifndef SQLITE_OMIT_AUTOVACUUM 6698 if( pBt->autoVacuum ){ 6699 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext); 6700 } 6701 #endif 6702 checkList(pCheck, 0, pgnoOvfl, nPage, zContext); 6703 } 6704 6705 /* Check sanity of left child page. 
6706 */ 6707 if( !pPage->leaf ){ 6708 pgno = get4byte(pCell); 6709 #ifndef SQLITE_OMIT_AUTOVACUUM 6710 if( pBt->autoVacuum ){ 6711 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); 6712 } 6713 #endif 6714 d2 = checkTreePage(pCheck,pgno,pPage,zContext); 6715 if( i>0 && d2!=depth ){ 6716 checkAppendMsg(pCheck, zContext, "Child page depth differs"); 6717 } 6718 depth = d2; 6719 } 6720 } 6721 if( !pPage->leaf ){ 6722 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 6723 sqlite3_snprintf(sizeof(zContext), zContext, 6724 "On page %d at right child: ", iPage); 6725 #ifndef SQLITE_OMIT_AUTOVACUUM 6726 if( pBt->autoVacuum ){ 6727 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0); 6728 } 6729 #endif 6730 checkTreePage(pCheck, pgno, pPage, zContext); 6731 } 6732 6733 /* Check for complete coverage of the page 6734 */ 6735 data = pPage->aData; 6736 hdr = pPage->hdrOffset; 6737 hit = sqlite3PageMalloc( pBt->pageSize ); 6738 if( hit ){ 6739 memset(hit, 0, usableSize ); 6740 memset(hit, 1, get2byte(&data[hdr+5])); 6741 nCell = get2byte(&data[hdr+3]); 6742 cellStart = hdr + 12 - 4*pPage->leaf; 6743 for(i=0; i<nCell; i++){ 6744 int pc = get2byte(&data[cellStart+i*2]); 6745 u16 size = cellSizePtr(pPage, &data[pc]); 6746 int j; 6747 if( (pc+size-1)>=usableSize || pc<0 ){ 6748 checkAppendMsg(pCheck, 0, 6749 "Corruption detected in cell %d on page %d",i,iPage,0); 6750 }else{ 6751 for(j=pc+size-1; j>=pc; j--) hit[j]++; 6752 } 6753 } 6754 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000; 6755 cnt++){ 6756 int size = get2byte(&data[i+2]); 6757 int j; 6758 if( (i+size-1)>=usableSize || i<0 ){ 6759 checkAppendMsg(pCheck, 0, 6760 "Corruption detected in cell %d on page %d",i,iPage,0); 6761 }else{ 6762 for(j=i+size-1; j>=i; j--) hit[j]++; 6763 } 6764 i = get2byte(&data[i]); 6765 } 6766 for(i=cnt=0; i<usableSize; i++){ 6767 if( hit[i]==0 ){ 6768 cnt++; 6769 }else if( hit[i]>1 ){ 6770 checkAppendMsg(pCheck, 0, 6771 "Multiple uses for byte %d of page %d", i, 
iPage); 6772 break; 6773 } 6774 } 6775 if( cnt!=data[hdr+7] ){ 6776 checkAppendMsg(pCheck, 0, 6777 "Fragmented space is %d byte reported as %d on page %d", 6778 cnt, data[hdr+7], iPage); 6779 } 6780 } 6781 sqlite3PageFree(hit); 6782 6783 releasePage(pPage); 6784 return depth+1; 6785 } 6786 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6787 6788 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 6789 /* 6790 ** This routine does a complete check of the given BTree file. aRoot[] is 6791 ** an array of pages numbers were each page number is the root page of 6792 ** a table. nRoot is the number of entries in aRoot. 6793 ** 6794 ** If everything checks out, this routine returns NULL. If something is 6795 ** amiss, an error message is written into memory obtained from malloc() 6796 ** and a pointer to that error message is returned. The calling function 6797 ** is responsible for freeing the error message when it is done. 6798 */ 6799 char *sqlite3BtreeIntegrityCheck( 6800 Btree *p, /* The btree to be checked */ 6801 int *aRoot, /* An array of root pages numbers for individual trees */ 6802 int nRoot, /* Number of entries in aRoot[] */ 6803 int mxErr, /* Stop reporting errors after this many */ 6804 int *pnErr /* Write number of errors seen to this variable */ 6805 ){ 6806 int i; 6807 int nRef; 6808 IntegrityCk sCheck; 6809 BtShared *pBt = p->pBt; 6810 char zErr[100]; 6811 6812 sqlite3BtreeEnter(p); 6813 pBt->db = p->db; 6814 nRef = sqlite3PagerRefcount(pBt->pPager); 6815 if( lockBtreeWithRetry(p)!=SQLITE_OK ){ 6816 sqlite3BtreeLeave(p); 6817 return sqlite3DbStrDup(0, "Unable to acquire a read lock on the database"); 6818 } 6819 sCheck.pBt = pBt; 6820 sCheck.pPager = pBt->pPager; 6821 sCheck.nPage = pagerPagecount(sCheck.pPager); 6822 sCheck.mxErr = mxErr; 6823 sCheck.nErr = 0; 6824 *pnErr = 0; 6825 #ifndef SQLITE_OMIT_AUTOVACUUM 6826 if( pBt->nTrunc!=0 ){ 6827 sCheck.nPage = pBt->nTrunc; 6828 } 6829 #endif 6830 if( sCheck.nPage==0 ){ 6831 unlockBtreeIfUnused(pBt); 6832 
sqlite3BtreeLeave(p); 6833 return 0; 6834 } 6835 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); 6836 if( !sCheck.anRef ){ 6837 unlockBtreeIfUnused(pBt); 6838 *pnErr = 1; 6839 sqlite3BtreeLeave(p); 6840 return sqlite3MPrintf(p->db, "Unable to malloc %d bytes", 6841 (sCheck.nPage+1)*sizeof(sCheck.anRef[0])); 6842 } 6843 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } 6844 i = PENDING_BYTE_PAGE(pBt); 6845 if( i<=sCheck.nPage ){ 6846 sCheck.anRef[i] = 1; 6847 } 6848 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000); 6849 6850 /* Check the integrity of the freelist 6851 */ 6852 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 6853 get4byte(&pBt->pPage1->aData[36]), "Main freelist: "); 6854 6855 /* Check all the tables. 6856 */ 6857 for(i=0; i<nRoot && sCheck.mxErr; i++){ 6858 if( aRoot[i]==0 ) continue; 6859 #ifndef SQLITE_OMIT_AUTOVACUUM 6860 if( pBt->autoVacuum && aRoot[i]>1 ){ 6861 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0); 6862 } 6863 #endif 6864 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: "); 6865 } 6866 6867 /* Make sure every page in the file is referenced 6868 */ 6869 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ 6870 #ifdef SQLITE_OMIT_AUTOVACUUM 6871 if( sCheck.anRef[i]==0 ){ 6872 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 6873 } 6874 #else 6875 /* If the database supports auto-vacuum, make sure no tables contain 6876 ** references to pointer-map pages. 
6877 */ 6878 if( sCheck.anRef[i]==0 && 6879 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 6880 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 6881 } 6882 if( sCheck.anRef[i]!=0 && 6883 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 6884 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i); 6885 } 6886 #endif 6887 } 6888 6889 /* Make sure this analysis did not leave any unref() pages 6890 */ 6891 unlockBtreeIfUnused(pBt); 6892 if( nRef != sqlite3PagerRefcount(pBt->pPager) ){ 6893 checkAppendMsg(&sCheck, 0, 6894 "Outstanding page count goes from %d to %d during this analysis", 6895 nRef, sqlite3PagerRefcount(pBt->pPager) 6896 ); 6897 } 6898 6899 /* Clean up and report errors. 6900 */ 6901 sqlite3BtreeLeave(p); 6902 sqlite3_free(sCheck.anRef); 6903 *pnErr = sCheck.nErr; 6904 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg); 6905 return sqlite3StrAccumFinish(&sCheck.errMsg); 6906 } 6907 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 6908 6909 /* 6910 ** Return the full pathname of the underlying database file. 6911 ** 6912 ** The pager filename is invariant as long as the pager is 6913 ** open so it is safe to access without the BtShared mutex. 6914 */ 6915 const char *sqlite3BtreeGetFilename(Btree *p){ 6916 assert( p->pBt->pPager!=0 ); 6917 return sqlite3PagerFilename(p->pBt->pPager); 6918 } 6919 6920 /* 6921 ** Return the pathname of the directory that contains the database file. 6922 ** 6923 ** The pager directory name is invariant as long as the pager is 6924 ** open so it is safe to access without the BtShared mutex. 6925 */ 6926 const char *sqlite3BtreeGetDirname(Btree *p){ 6927 assert( p->pBt->pPager!=0 ); 6928 return sqlite3PagerDirname(p->pBt->pPager); 6929 } 6930 6931 /* 6932 ** Return the pathname of the journal file for this database. The return 6933 ** value of this routine is the same regardless of whether the journal file 6934 ** has been created or not. 
6935 ** 6936 ** The pager journal filename is invariant as long as the pager is 6937 ** open so it is safe to access without the BtShared mutex. 6938 */ 6939 const char *sqlite3BtreeGetJournalname(Btree *p){ 6940 assert( p->pBt->pPager!=0 ); 6941 return sqlite3PagerJournalname(p->pBt->pPager); 6942 } 6943 6944 #ifndef SQLITE_OMIT_VACUUM 6945 /* 6946 ** Copy the complete content of pBtFrom into pBtTo. A transaction 6947 ** must be active for both files. 6948 ** 6949 ** The size of file pTo may be reduced by this operation. 6950 ** If anything goes wrong, the transaction on pTo is rolled back. 6951 ** 6952 ** If successful, CommitPhaseOne() may be called on pTo before returning. 6953 ** The caller should finish committing the transaction on pTo by calling 6954 ** sqlite3BtreeCommit(). 6955 */ 6956 static int btreeCopyFile(Btree *pTo, Btree *pFrom){ 6957 int rc = SQLITE_OK; 6958 Pgno i; 6959 6960 Pgno nFromPage; /* Number of pages in pFrom */ 6961 Pgno nToPage; /* Number of pages in pTo */ 6962 Pgno nNewPage; /* Number of pages in pTo after the copy */ 6963 6964 Pgno iSkip; /* Pending byte page in pTo */ 6965 int nToPageSize; /* Page size of pTo in bytes */ 6966 int nFromPageSize; /* Page size of pFrom in bytes */ 6967 6968 BtShared *pBtTo = pTo->pBt; 6969 BtShared *pBtFrom = pFrom->pBt; 6970 pBtTo->db = pTo->db; 6971 pBtFrom->db = pFrom->db; 6972 6973 nToPageSize = pBtTo->pageSize; 6974 nFromPageSize = pBtFrom->pageSize; 6975 6976 if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){ 6977 return SQLITE_ERROR; 6978 } 6979 if( pBtTo->pCursor ){ 6980 return SQLITE_BUSY; 6981 } 6982 6983 nToPage = pagerPagecount(pBtTo->pPager); 6984 nFromPage = pagerPagecount(pBtFrom->pPager); 6985 iSkip = PENDING_BYTE_PAGE(pBtTo); 6986 6987 /* Variable nNewPage is the number of pages required to store the 6988 ** contents of pFrom using the current page-size of pTo. 
6989 */ 6990 nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) / 6991 (i64)nToPageSize; 6992 6993 for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){ 6994 6995 /* Journal the original page. 6996 ** 6997 ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE) 6998 ** in database *pTo (before the copy). This page is never written 6999 ** into the journal file. Unless i==iSkip or the page was not 7000 ** present in pTo before the copy operation, journal page i from pTo. 7001 */ 7002 if( i!=iSkip && i<=nToPage ){ 7003 DbPage *pDbPage = 0; 7004 rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage); 7005 if( rc==SQLITE_OK ){ 7006 rc = sqlite3PagerWrite(pDbPage); 7007 if( rc==SQLITE_OK && i>nFromPage ){ 7008 /* Yeah. It seems wierd to call DontWrite() right after Write(). But 7009 ** that is because the names of those procedures do not exactly 7010 ** represent what they do. Write() really means "put this page in the 7011 ** rollback journal and mark it as dirty so that it will be written 7012 ** to the database file later." DontWrite() undoes the second part of 7013 ** that and prevents the page from being written to the database. The 7014 ** page is still on the rollback journal, though. And that is the 7015 ** whole point of this block: to put pages on the rollback journal. 
7016 */ 7017 sqlite3PagerDontWrite(pDbPage); 7018 } 7019 sqlite3PagerUnref(pDbPage); 7020 } 7021 } 7022 7023 /* Overwrite the data in page i of the target database */ 7024 if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){ 7025 7026 DbPage *pToPage = 0; 7027 sqlite3_int64 iOff; 7028 7029 rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage); 7030 if( rc==SQLITE_OK ){ 7031 rc = sqlite3PagerWrite(pToPage); 7032 } 7033 7034 for( 7035 iOff=(i-1)*nToPageSize; 7036 rc==SQLITE_OK && iOff<i*nToPageSize; 7037 iOff += nFromPageSize 7038 ){ 7039 DbPage *pFromPage = 0; 7040 Pgno iFrom = (iOff/nFromPageSize)+1; 7041 7042 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){ 7043 continue; 7044 } 7045 7046 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage); 7047 if( rc==SQLITE_OK ){ 7048 char *zTo = sqlite3PagerGetData(pToPage); 7049 char *zFrom = sqlite3PagerGetData(pFromPage); 7050 int nCopy; 7051 7052 if( nFromPageSize>=nToPageSize ){ 7053 zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize)); 7054 nCopy = nToPageSize; 7055 }else{ 7056 zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize); 7057 nCopy = nFromPageSize; 7058 } 7059 7060 memcpy(zTo, zFrom, nCopy); 7061 sqlite3PagerUnref(pFromPage); 7062 } 7063 } 7064 7065 if( pToPage ) sqlite3PagerUnref(pToPage); 7066 } 7067 } 7068 7069 /* If things have worked so far, the database file may need to be 7070 ** truncated. The complex part is that it may need to be truncated to 7071 ** a size that is not an integer multiple of nToPageSize - the current 7072 ** page size used by the pager associated with B-Tree pTo. 7073 ** 7074 ** For example, say the page-size of pTo is 2048 bytes and the original 7075 ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024 7076 ** bytes and 9 pages, then the file needs to be truncated to 9KB. 
7077 */ 7078 if( rc==SQLITE_OK ){ 7079 if( nFromPageSize!=nToPageSize ){ 7080 sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager); 7081 i64 iSize = (i64)nFromPageSize * (i64)nFromPage; 7082 i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize; 7083 i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize; 7084 7085 assert( iSize<=iNow ); 7086 7087 /* Commit phase one syncs the journal file associated with pTo 7088 ** containing the original data. It does not sync the database file 7089 ** itself. After doing this it is safe to use OsTruncate() and other 7090 ** file APIs on the database file directly. 7091 */ 7092 pBtTo->db = pTo->db; 7093 rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1); 7094 if( iSize<iNow && rc==SQLITE_OK ){ 7095 rc = sqlite3OsTruncate(pFile, iSize); 7096 } 7097 7098 /* The loop that copied data from database pFrom to pTo did not 7099 ** populate the locking page of database pTo. If the page-size of 7100 ** pFrom is smaller than that of pTo, this means some data will 7101 ** not have been copied. 7102 ** 7103 ** This block copies the missing data from database pFrom to pTo 7104 ** using file APIs. This is safe because at this point we know that 7105 ** all of the original data from pTo has been synced into the 7106 ** journal file. At this point it would be safe to do anything at 7107 ** all to the database file except truncate it to zero bytes. 
7108 */ 7109 if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){ 7110 i64 iOff; 7111 for( 7112 iOff=iPending; 7113 rc==SQLITE_OK && iOff<(iPending+nToPageSize); 7114 iOff += nFromPageSize 7115 ){ 7116 DbPage *pFromPage = 0; 7117 Pgno iFrom = (iOff/nFromPageSize)+1; 7118 7119 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){ 7120 continue; 7121 } 7122 7123 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage); 7124 if( rc==SQLITE_OK ){ 7125 char *zFrom = sqlite3PagerGetData(pFromPage); 7126 rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff); 7127 sqlite3PagerUnref(pFromPage); 7128 } 7129 } 7130 } 7131 7132 /* Sync the database file */ 7133 if( rc==SQLITE_OK ){ 7134 rc = sqlite3PagerSync(pBtTo->pPager); 7135 } 7136 }else{ 7137 rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage); 7138 } 7139 if( rc==SQLITE_OK ){ 7140 pBtTo->pageSizeFixed = 0; 7141 } 7142 } 7143 7144 if( rc ){ 7145 sqlite3BtreeRollback(pTo); 7146 } 7147 7148 return rc; 7149 } 7150 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){ 7151 int rc; 7152 sqlite3BtreeEnter(pTo); 7153 sqlite3BtreeEnter(pFrom); 7154 rc = btreeCopyFile(pTo, pFrom); 7155 sqlite3BtreeLeave(pFrom); 7156 sqlite3BtreeLeave(pTo); 7157 return rc; 7158 } 7159 7160 #endif /* SQLITE_OMIT_VACUUM */ 7161 7162 /* 7163 ** Return non-zero if a transaction is active. 7164 */ 7165 int sqlite3BtreeIsInTrans(Btree *p){ 7166 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 7167 return (p && (p->inTrans==TRANS_WRITE)); 7168 } 7169 7170 /* 7171 ** Return non-zero if a statement transaction is active. 7172 */ 7173 int sqlite3BtreeIsInStmt(Btree *p){ 7174 assert( sqlite3BtreeHoldsMutex(p) ); 7175 return (p->pBt && p->pBt->inStmt); 7176 } 7177 7178 /* 7179 ** Return non-zero if a read (or write) transaction is active. 
7180 */ 7181 int sqlite3BtreeIsInReadTrans(Btree *p){ 7182 assert( sqlite3_mutex_held(p->db->mutex) ); 7183 return (p && (p->inTrans!=TRANS_NONE)); 7184 } 7185 7186 /* 7187 ** This function returns a pointer to a blob of memory associated with 7188 ** a single shared-btree. The memory is used by client code for its own 7189 ** purposes (for example, to store a high-level schema associated with 7190 ** the shared-btree). The btree layer manages reference counting issues. 7191 ** 7192 ** The first time this is called on a shared-btree, nBytes bytes of memory 7193 ** are allocated, zeroed, and returned to the caller. For each subsequent 7194 ** call the nBytes parameter is ignored and a pointer to the same blob 7195 ** of memory returned. 7196 ** 7197 ** If the nBytes parameter is 0 and the blob of memory has not yet been 7198 ** allocated, a null pointer is returned. If the blob has already been 7199 ** allocated, it is returned as normal. 7200 ** 7201 ** Just before the shared-btree is closed, the function passed as the 7202 ** xFree argument when the memory allocation was made is invoked on the 7203 ** blob of allocated memory. This function should not call sqlite3_free() 7204 ** on the memory, the btree layer does that. 7205 */ 7206 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 7207 BtShared *pBt = p->pBt; 7208 sqlite3BtreeEnter(p); 7209 if( !pBt->pSchema && nBytes ){ 7210 pBt->pSchema = sqlite3MallocZero(nBytes); 7211 pBt->xFreeSchema = xFree; 7212 } 7213 sqlite3BtreeLeave(p); 7214 return pBt->pSchema; 7215 } 7216 7217 /* 7218 ** Return true if another user of the same shared btree as the argument 7219 ** handle holds an exclusive lock on the sqlite_master table. 
7220 */ 7221 int sqlite3BtreeSchemaLocked(Btree *p){ 7222 int rc; 7223 assert( sqlite3_mutex_held(p->db->mutex) ); 7224 sqlite3BtreeEnter(p); 7225 rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK); 7226 sqlite3BtreeLeave(p); 7227 return rc; 7228 } 7229 7230 7231 #ifndef SQLITE_OMIT_SHARED_CACHE 7232 /* 7233 ** Obtain a lock on the table whose root page is iTab. The 7234 ** lock is a write lock if isWritelock is true or a read lock 7235 ** if it is false. 7236 */ 7237 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 7238 int rc = SQLITE_OK; 7239 if( p->sharable ){ 7240 u8 lockType = READ_LOCK + isWriteLock; 7241 assert( READ_LOCK+1==WRITE_LOCK ); 7242 assert( isWriteLock==0 || isWriteLock==1 ); 7243 sqlite3BtreeEnter(p); 7244 rc = queryTableLock(p, iTab, lockType); 7245 if( rc==SQLITE_OK ){ 7246 rc = lockTable(p, iTab, lockType); 7247 } 7248 sqlite3BtreeLeave(p); 7249 } 7250 return rc; 7251 } 7252 #endif 7253 7254 #ifndef SQLITE_OMIT_INCRBLOB 7255 /* 7256 ** Argument pCsr must be a cursor opened for writing on an 7257 ** INTKEY table currently pointing at a valid table entry. 7258 ** This function modifies the data stored as part of that entry. 7259 ** Only the data content may only be modified, it is not possible 7260 ** to change the length of the data stored. 7261 */ 7262 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 7263 assert( cursorHoldsMutex(pCsr) ); 7264 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 7265 assert(pCsr->isIncrblobHandle); 7266 7267 restoreCursorPosition(pCsr); 7268 assert( pCsr->eState!=CURSOR_REQUIRESEEK ); 7269 if( pCsr->eState!=CURSOR_VALID ){ 7270 return SQLITE_ABORT; 7271 } 7272 7273 /* Check some preconditions: 7274 ** (a) the cursor is open for writing, 7275 ** (b) there is no read-lock on the table being modified and 7276 ** (c) the cursor points at a valid row of an intKey table. 
7277 */ 7278 if( !pCsr->wrFlag ){ 7279 return SQLITE_READONLY; 7280 } 7281 assert( !pCsr->pBt->readOnly 7282 && pCsr->pBt->inTransaction==TRANS_WRITE ); 7283 if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){ 7284 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 7285 } 7286 if( pCsr->eState==CURSOR_INVALID || !pCsr->pPage->intKey ){ 7287 return SQLITE_ERROR; 7288 } 7289 7290 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1); 7291 } 7292 7293 /* 7294 ** Set a flag on this cursor to cache the locations of pages from the 7295 ** overflow list for the current row. This is used by cursors opened 7296 ** for incremental blob IO only. 7297 ** 7298 ** This function sets a flag only. The actual page location cache 7299 ** (stored in BtCursor.aOverflow[]) is allocated and used by function 7300 ** accessPayload() (the worker function for sqlite3BtreeData() and 7301 ** sqlite3BtreePutData()). 7302 */ 7303 void sqlite3BtreeCacheOverflow(BtCursor *pCur){ 7304 assert( cursorHoldsMutex(pCur) ); 7305 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 7306 assert(!pCur->isIncrblobHandle); 7307 assert(!pCur->aOverflow); 7308 pCur->isIncrblobHandle = 1; 7309 } 7310 #endif 7311