1 /* 2 ** 2004 April 6 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** This file implements a external (disk-based) database using BTrees. 13 ** See the header comment on "btreeInt.h" for additional information. 14 ** Including a description of file format and an overview of operation. 15 */ 16 #include "btreeInt.h" 17 18 /* 19 ** The header string that appears at the beginning of every 20 ** SQLite database. 21 */ 22 static const char zMagicHeader[] = SQLITE_FILE_HEADER; 23 24 /* 25 ** Set this global variable to 1 to enable tracing using the TRACE 26 ** macro. 27 */ 28 #if 0 29 int sqlite3BtreeTrace=1; /* True to enable tracing */ 30 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} 31 #else 32 # define TRACE(X) 33 #endif 34 35 /* 36 ** Extract a 2-byte big-endian integer from an array of unsigned bytes. 37 ** But if the value is zero, make it 65536. 38 ** 39 ** This routine is used to extract the "offset to cell content area" value 40 ** from the header of a btree page. If the page size is 65536 and the page 41 ** is empty, the offset should be 65536, but the 2-byte value stores zero. 42 ** This routine makes the necessary adjustment to 65536. 43 */ 44 #define get2byteNotZero(X) (((((int)get2byte(X))-1)&0xffff)+1) 45 46 #ifndef SQLITE_OMIT_SHARED_CACHE 47 /* 48 ** A list of BtShared objects that are eligible for participation 49 ** in shared cache. This variable has file scope during normal builds, 50 ** but the test harness needs to access it so we make it global for 51 ** test builds. 52 ** 53 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER. 
54 */ 55 #ifdef SQLITE_TEST 56 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 57 #else 58 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 59 #endif 60 #endif /* SQLITE_OMIT_SHARED_CACHE */ 61 62 #ifndef SQLITE_OMIT_SHARED_CACHE 63 /* 64 ** Enable or disable the shared pager and schema features. 65 ** 66 ** This routine has no effect on existing database connections. 67 ** The shared cache setting effects only future calls to 68 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). 69 */ 70 int sqlite3_enable_shared_cache(int enable){ 71 sqlite3GlobalConfig.sharedCacheEnabled = enable; 72 return SQLITE_OK; 73 } 74 #endif 75 76 77 78 #ifdef SQLITE_OMIT_SHARED_CACHE 79 /* 80 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(), 81 ** and clearAllSharedCacheTableLocks() 82 ** manipulate entries in the BtShared.pLock linked list used to store 83 ** shared-cache table level locks. If the library is compiled with the 84 ** shared-cache feature disabled, then there is only ever one user 85 ** of each BtShared structure and so this locking is not necessary. 86 ** So define the lock related functions as no-ops. 87 */ 88 #define querySharedCacheTableLock(a,b,c) SQLITE_OK 89 #define setSharedCacheTableLock(a,b,c) SQLITE_OK 90 #define clearAllSharedCacheTableLocks(a) 91 #define downgradeAllSharedCacheTableLocks(a) 92 #define hasSharedCacheTableLock(a,b,c,d) 1 93 #define hasReadConflicts(a, b) 0 94 #endif 95 96 #ifndef SQLITE_OMIT_SHARED_CACHE 97 98 #ifdef SQLITE_DEBUG 99 /* 100 **** This function is only used as part of an assert() statement. *** 101 ** 102 ** Check to see if pBtree holds the required locks to read or write to the 103 ** table with root page iRoot. Return 1 if it does and 0 if not. 
104 ** 105 ** For example, when writing to a table with root-page iRoot via 106 ** Btree connection pBtree: 107 ** 108 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) ); 109 ** 110 ** When writing to an index that resides in a sharable database, the 111 ** caller should have first obtained a lock specifying the root page of 112 ** the corresponding table. This makes things a bit more complicated, 113 ** as this module treats each table as a separate structure. To determine 114 ** the table corresponding to the index being written, this 115 ** function has to search through the database schema. 116 ** 117 ** Instead of a lock on the table/index rooted at page iRoot, the caller may 118 ** hold a write-lock on the schema table (root page 1). This is also 119 ** acceptable. 120 */ 121 static int hasSharedCacheTableLock( 122 Btree *pBtree, /* Handle that must hold lock */ 123 Pgno iRoot, /* Root page of b-tree */ 124 int isIndex, /* True if iRoot is the root of an index b-tree */ 125 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */ 126 ){ 127 Schema *pSchema = (Schema *)pBtree->pBt->pSchema; 128 Pgno iTab = 0; 129 BtLock *pLock; 130 131 /* If this database is not shareable, or if the client is reading 132 ** and has the read-uncommitted flag set, then no lock is required. 133 ** Return true immediately. 134 */ 135 if( (pBtree->sharable==0) 136 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted)) 137 ){ 138 return 1; 139 } 140 141 /* If the client is reading or writing an index and the schema is 142 ** not loaded, then it is too difficult to actually check to see if 143 ** the correct locks are held. So do not bother - just return true. 144 ** This case does not come up very often anyhow. 145 */ 146 if( isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0) ){ 147 return 1; 148 } 149 150 /* Figure out the root-page that the lock should be held on. 
For table 151 ** b-trees, this is just the root page of the b-tree being read or 152 ** written. For index b-trees, it is the root page of the associated 153 ** table. */ 154 if( isIndex ){ 155 HashElem *p; 156 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ 157 Index *pIdx = (Index *)sqliteHashData(p); 158 if( pIdx->tnum==(int)iRoot ){ 159 iTab = pIdx->pTable->tnum; 160 } 161 } 162 }else{ 163 iTab = iRoot; 164 } 165 166 /* Search for the required lock. Either a write-lock on root-page iTab, a 167 ** write-lock on the schema table, or (if the client is reading) a 168 ** read-lock on iTab will suffice. Return 1 if any of these are found. */ 169 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){ 170 if( pLock->pBtree==pBtree 171 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)) 172 && pLock->eLock>=eLockType 173 ){ 174 return 1; 175 } 176 } 177 178 /* Failed to find the required lock. */ 179 return 0; 180 } 181 #endif /* SQLITE_DEBUG */ 182 183 #ifdef SQLITE_DEBUG 184 /* 185 **** This function may be used as part of assert() statements only. **** 186 ** 187 ** Return true if it would be illegal for pBtree to write into the 188 ** table or index rooted at iRoot because other shared connections are 189 ** simultaneously reading that same table or index. 190 ** 191 ** It is illegal for pBtree to write if some other Btree object that 192 ** shares the same BtShared object is currently reading or writing 193 ** the iRoot table. Except, if the other Btree object has the 194 ** read-uncommitted flag set, then it is OK for the other object to 195 ** have a read cursor. 
196 ** 197 ** For example, before writing to any part of the table or index 198 ** rooted at page iRoot, one should call: 199 ** 200 ** assert( !hasReadConflicts(pBtree, iRoot) ); 201 */ 202 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ 203 BtCursor *p; 204 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 205 if( p->pgnoRoot==iRoot 206 && p->pBtree!=pBtree 207 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted) 208 ){ 209 return 1; 210 } 211 } 212 return 0; 213 } 214 #endif /* #ifdef SQLITE_DEBUG */ 215 216 /* 217 ** Query to see if Btree handle p may obtain a lock of type eLock 218 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return 219 ** SQLITE_OK if the lock may be obtained (by calling 220 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not. 221 */ 222 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ 223 BtShared *pBt = p->pBt; 224 BtLock *pIter; 225 226 assert( sqlite3BtreeHoldsMutex(p) ); 227 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 228 assert( p->db!=0 ); 229 assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 ); 230 231 /* If requesting a write-lock, then the Btree must have an open write 232 ** transaction on this file. And, obviously, for this to be so there 233 ** must be an open write transaction on the file itself. 234 */ 235 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); 236 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); 237 238 /* This routine is a no-op if the shared-cache is not enabled */ 239 if( !p->sharable ){ 240 return SQLITE_OK; 241 } 242 243 /* If some other connection is holding an exclusive lock, the 244 ** requested lock may not be obtained. 245 */ 246 if( pBt->pWriter!=p && pBt->isExclusive ){ 247 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db); 248 return SQLITE_LOCKED_SHAREDCACHE; 249 } 250 251 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 252 /* The condition (pIter->eLock!=eLock) in the following if(...) 
253 ** statement is a simplification of: 254 ** 255 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) 256 ** 257 ** since we know that if eLock==WRITE_LOCK, then no other connection 258 ** may hold a WRITE_LOCK on any table in this file (since there can 259 ** only be a single writer). 260 */ 261 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK ); 262 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK); 263 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){ 264 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db); 265 if( eLock==WRITE_LOCK ){ 266 assert( p==pBt->pWriter ); 267 pBt->isPending = 1; 268 } 269 return SQLITE_LOCKED_SHAREDCACHE; 270 } 271 } 272 return SQLITE_OK; 273 } 274 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 275 276 #ifndef SQLITE_OMIT_SHARED_CACHE 277 /* 278 ** Add a lock on the table with root-page iTable to the shared-btree used 279 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 280 ** WRITE_LOCK. 281 ** 282 ** This function assumes the following: 283 ** 284 ** (a) The specified Btree object p is connected to a sharable 285 ** database (one with the BtShared.sharable flag set), and 286 ** 287 ** (b) No other Btree objects hold a lock that conflicts 288 ** with the requested lock (i.e. querySharedCacheTableLock() has 289 ** already been called and returned SQLITE_OK). 290 ** 291 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 292 ** is returned if a malloc attempt fails. 293 */ 294 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ 295 BtShared *pBt = p->pBt; 296 BtLock *pLock = 0; 297 BtLock *pIter; 298 299 assert( sqlite3BtreeHoldsMutex(p) ); 300 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 301 assert( p->db!=0 ); 302 303 /* A connection with the read-uncommitted flag set will never try to 304 ** obtain a read-lock using this function. 
The only read-lock obtained 305 ** by a connection in read-uncommitted mode is on the sqlite_master 306 ** table, and that lock is obtained in BtreeBeginTrans(). */ 307 assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK ); 308 309 /* This function should only be called on a sharable b-tree after it 310 ** has been determined that no other b-tree holds a conflicting lock. */ 311 assert( p->sharable ); 312 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); 313 314 /* First search the list for an existing lock on this table. */ 315 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 316 if( pIter->iTable==iTable && pIter->pBtree==p ){ 317 pLock = pIter; 318 break; 319 } 320 } 321 322 /* If the above search did not find a BtLock struct associating Btree p 323 ** with table iTable, allocate one and link it into the list. 324 */ 325 if( !pLock ){ 326 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 327 if( !pLock ){ 328 return SQLITE_NOMEM; 329 } 330 pLock->iTable = iTable; 331 pLock->pBtree = p; 332 pLock->pNext = pBt->pLock; 333 pBt->pLock = pLock; 334 } 335 336 /* Set the BtLock.eLock variable to the maximum of the current lock 337 ** and the requested lock. This means if a write-lock was already held 338 ** and a read-lock requested, we don't incorrectly downgrade the lock. 339 */ 340 assert( WRITE_LOCK>READ_LOCK ); 341 if( eLock>pLock->eLock ){ 342 pLock->eLock = eLock; 343 } 344 345 return SQLITE_OK; 346 } 347 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 348 349 #ifndef SQLITE_OMIT_SHARED_CACHE 350 /* 351 ** Release all the table locks (locks obtained via calls to 352 ** the setSharedCacheTableLock() procedure) held by Btree object p. 353 ** 354 ** This function assumes that Btree p has an open read or write 355 ** transaction. If it does not, then the BtShared.isPending variable 356 ** may be incorrectly cleared. 
357 */ 358 static void clearAllSharedCacheTableLocks(Btree *p){ 359 BtShared *pBt = p->pBt; 360 BtLock **ppIter = &pBt->pLock; 361 362 assert( sqlite3BtreeHoldsMutex(p) ); 363 assert( p->sharable || 0==*ppIter ); 364 assert( p->inTrans>0 ); 365 366 while( *ppIter ){ 367 BtLock *pLock = *ppIter; 368 assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree ); 369 assert( pLock->pBtree->inTrans>=pLock->eLock ); 370 if( pLock->pBtree==p ){ 371 *ppIter = pLock->pNext; 372 assert( pLock->iTable!=1 || pLock==&p->lock ); 373 if( pLock->iTable!=1 ){ 374 sqlite3_free(pLock); 375 } 376 }else{ 377 ppIter = &pLock->pNext; 378 } 379 } 380 381 assert( pBt->isPending==0 || pBt->pWriter ); 382 if( pBt->pWriter==p ){ 383 pBt->pWriter = 0; 384 pBt->isExclusive = 0; 385 pBt->isPending = 0; 386 }else if( pBt->nTransaction==2 ){ 387 /* This function is called when Btree p is concluding its 388 ** transaction. If there currently exists a writer, and p is not 389 ** that writer, then the number of locks held by connections other 390 ** than the writer must be about to drop to zero. In this case 391 ** set the isPending flag to 0. 392 ** 393 ** If there is not currently a writer, then BtShared.isPending must 394 ** be zero already. So this next line is harmless in that case. 395 */ 396 pBt->isPending = 0; 397 } 398 } 399 400 /* 401 ** This function changes all write-locks held by Btree p into read-locks. 
402 */ 403 static void downgradeAllSharedCacheTableLocks(Btree *p){ 404 BtShared *pBt = p->pBt; 405 if( pBt->pWriter==p ){ 406 BtLock *pLock; 407 pBt->pWriter = 0; 408 pBt->isExclusive = 0; 409 pBt->isPending = 0; 410 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ 411 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p ); 412 pLock->eLock = READ_LOCK; 413 } 414 } 415 } 416 417 #endif /* SQLITE_OMIT_SHARED_CACHE */ 418 419 static void releasePage(MemPage *pPage); /* Forward reference */ 420 421 /* 422 ***** This routine is used inside of assert() only **** 423 ** 424 ** Verify that the cursor holds the mutex on its BtShared 425 */ 426 #ifdef SQLITE_DEBUG 427 static int cursorHoldsMutex(BtCursor *p){ 428 return sqlite3_mutex_held(p->pBt->mutex); 429 } 430 #endif 431 432 433 #ifndef SQLITE_OMIT_INCRBLOB 434 /* 435 ** Invalidate the overflow page-list cache for cursor pCur, if any. 436 */ 437 static void invalidateOverflowCache(BtCursor *pCur){ 438 assert( cursorHoldsMutex(pCur) ); 439 sqlite3_free(pCur->aOverflow); 440 pCur->aOverflow = 0; 441 } 442 443 /* 444 ** Invalidate the overflow page-list cache for all cursors opened 445 ** on the shared btree structure pBt. 446 */ 447 static void invalidateAllOverflowCache(BtShared *pBt){ 448 BtCursor *p; 449 assert( sqlite3_mutex_held(pBt->mutex) ); 450 for(p=pBt->pCursor; p; p=p->pNext){ 451 invalidateOverflowCache(p); 452 } 453 } 454 455 /* 456 ** This function is called before modifying the contents of a table 457 ** to invalidate any incrblob cursors that are open on the 458 ** row or one of the rows being modified. 459 ** 460 ** If argument isClearTable is true, then the entire contents of the 461 ** table is about to be deleted. In this case invalidate all incrblob 462 ** cursors open on any row within the table with root-page pgnoRoot. 463 ** 464 ** Otherwise, if argument isClearTable is false, then the row with 465 ** rowid iRow is being replaced or deleted. 
In this case invalidate 466 ** only those incrblob cursors open on that specific row. 467 */ 468 static void invalidateIncrblobCursors( 469 Btree *pBtree, /* The database file to check */ 470 i64 iRow, /* The rowid that might be changing */ 471 int isClearTable /* True if all rows are being deleted */ 472 ){ 473 BtCursor *p; 474 BtShared *pBt = pBtree->pBt; 475 assert( sqlite3BtreeHoldsMutex(pBtree) ); 476 for(p=pBt->pCursor; p; p=p->pNext){ 477 if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){ 478 p->eState = CURSOR_INVALID; 479 } 480 } 481 } 482 483 #else 484 /* Stub functions when INCRBLOB is omitted */ 485 #define invalidateOverflowCache(x) 486 #define invalidateAllOverflowCache(x) 487 #define invalidateIncrblobCursors(x,y,z) 488 #endif /* SQLITE_OMIT_INCRBLOB */ 489 490 /* 491 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 492 ** when a page that previously contained data becomes a free-list leaf 493 ** page. 494 ** 495 ** The BtShared.pHasContent bitvec exists to work around an obscure 496 ** bug caused by the interaction of two useful IO optimizations surrounding 497 ** free-list leaf pages: 498 ** 499 ** 1) When all data is deleted from a page and the page becomes 500 ** a free-list leaf page, the page is not written to the database 501 ** (as free-list leaf pages contain no meaningful data). Sometimes 502 ** such a page is not even journalled (as it will not be modified, 503 ** why bother journalling it?). 504 ** 505 ** 2) When a free-list leaf page is reused, its content is not read 506 ** from the database or written to the journal file (why should it 507 ** be, if it is not at all meaningful?). 508 ** 509 ** By themselves, these optimizations work fine and provide a handy 510 ** performance boost to bulk delete or insert operations. However, if 511 ** a page is moved to the free-list and then reused within the same 512 ** transaction, a problem comes up. 
If the page is not journalled when 513 ** it is moved to the free-list and it is also not journalled when it 514 ** is extracted from the free-list and reused, then the original data 515 ** may be lost. In the event of a rollback, it may not be possible 516 ** to restore the database to its original configuration. 517 ** 518 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 519 ** moved to become a free-list leaf page, the corresponding bit is 520 ** set in the bitvec. Whenever a leaf page is extracted from the free-list, 521 ** optimization 2 above is omitted if the corresponding bit is already 522 ** set in BtShared.pHasContent. The contents of the bitvec are cleared 523 ** at the end of every transaction. 524 */ 525 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ 526 int rc = SQLITE_OK; 527 if( !pBt->pHasContent ){ 528 assert( pgno<=pBt->nPage ); 529 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage); 530 if( !pBt->pHasContent ){ 531 rc = SQLITE_NOMEM; 532 } 533 } 534 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){ 535 rc = sqlite3BitvecSet(pBt->pHasContent, pgno); 536 } 537 return rc; 538 } 539 540 /* 541 ** Query the BtShared.pHasContent vector. 542 ** 543 ** This function is called when a free-list leaf page is removed from the 544 ** free-list for reuse. It returns false if it is safe to retrieve the 545 ** page from the pager layer with the 'no-content' flag set. True otherwise. 546 */ 547 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ 548 Bitvec *p = pBt->pHasContent; 549 return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno))); 550 } 551 552 /* 553 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be 554 ** invoked at the conclusion of each write-transaction. 
555 */ 556 static void btreeClearHasContent(BtShared *pBt){ 557 sqlite3BitvecDestroy(pBt->pHasContent); 558 pBt->pHasContent = 0; 559 } 560 561 /* 562 ** Save the current cursor position in the variables BtCursor.nKey 563 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 564 ** 565 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID) 566 ** prior to calling this routine. 567 */ 568 static int saveCursorPosition(BtCursor *pCur){ 569 int rc; 570 571 assert( CURSOR_VALID==pCur->eState ); 572 assert( 0==pCur->pKey ); 573 assert( cursorHoldsMutex(pCur) ); 574 575 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey); 576 assert( rc==SQLITE_OK ); /* KeySize() cannot fail */ 577 578 /* If this is an intKey table, then the above call to BtreeKeySize() 579 ** stores the integer key in pCur->nKey. In this case this value is 580 ** all that is required. Otherwise, if pCur is not open on an intKey 581 ** table, then malloc space for and store the pCur->nKey bytes of key 582 ** data. 583 */ 584 if( 0==pCur->apPage[0]->intKey ){ 585 void *pKey = sqlite3Malloc( (int)pCur->nKey ); 586 if( pKey ){ 587 rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey); 588 if( rc==SQLITE_OK ){ 589 pCur->pKey = pKey; 590 }else{ 591 sqlite3_free(pKey); 592 } 593 }else{ 594 rc = SQLITE_NOMEM; 595 } 596 } 597 assert( !pCur->apPage[0]->intKey || !pCur->pKey ); 598 599 if( rc==SQLITE_OK ){ 600 int i; 601 for(i=0; i<=pCur->iPage; i++){ 602 releasePage(pCur->apPage[i]); 603 pCur->apPage[i] = 0; 604 } 605 pCur->iPage = -1; 606 pCur->eState = CURSOR_REQUIRESEEK; 607 } 608 609 invalidateOverflowCache(pCur); 610 return rc; 611 } 612 613 /* 614 ** Save the positions of all cursors (except pExcept) that are open on 615 ** the table with root-page iRoot. Usually, this is called just before cursor 616 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()). 
617 */ 618 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 619 BtCursor *p; 620 assert( sqlite3_mutex_held(pBt->mutex) ); 621 assert( pExcept==0 || pExcept->pBt==pBt ); 622 for(p=pBt->pCursor; p; p=p->pNext){ 623 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 624 p->eState==CURSOR_VALID ){ 625 int rc = saveCursorPosition(p); 626 if( SQLITE_OK!=rc ){ 627 return rc; 628 } 629 } 630 } 631 return SQLITE_OK; 632 } 633 634 /* 635 ** Clear the current cursor position. 636 */ 637 void sqlite3BtreeClearCursor(BtCursor *pCur){ 638 assert( cursorHoldsMutex(pCur) ); 639 sqlite3_free(pCur->pKey); 640 pCur->pKey = 0; 641 pCur->eState = CURSOR_INVALID; 642 } 643 644 /* 645 ** In this version of BtreeMoveto, pKey is a packed index record 646 ** such as is generated by the OP_MakeRecord opcode. Unpack the 647 ** record and then call BtreeMovetoUnpacked() to do the work. 648 */ 649 static int btreeMoveto( 650 BtCursor *pCur, /* Cursor open on the btree to be searched */ 651 const void *pKey, /* Packed key if the btree is an index */ 652 i64 nKey, /* Integer key for tables. Size of pKey for indices */ 653 int bias, /* Bias search to the high end */ 654 int *pRes /* Write search results here */ 655 ){ 656 int rc; /* Status code */ 657 UnpackedRecord *pIdxKey; /* Unpacked index key */ 658 char aSpace[150]; /* Temp space for pIdxKey - to avoid a malloc */ 659 660 if( pKey ){ 661 assert( nKey==(i64)(int)nKey ); 662 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, 663 aSpace, sizeof(aSpace)); 664 if( pIdxKey==0 ) return SQLITE_NOMEM; 665 }else{ 666 pIdxKey = 0; 667 } 668 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes); 669 if( pKey ){ 670 sqlite3VdbeDeleteUnpackedRecord(pIdxKey); 671 } 672 return rc; 673 } 674 675 /* 676 ** Restore the cursor to the position it was in (or as close to as possible) 677 ** when saveCursorPosition() was called. 
Note that this call deletes the 678 ** saved position info stored by saveCursorPosition(), so there can be 679 ** at most one effective restoreCursorPosition() call after each 680 ** saveCursorPosition(). 681 */ 682 static int btreeRestoreCursorPosition(BtCursor *pCur){ 683 int rc; 684 assert( cursorHoldsMutex(pCur) ); 685 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 686 if( pCur->eState==CURSOR_FAULT ){ 687 return pCur->skipNext; 688 } 689 pCur->eState = CURSOR_INVALID; 690 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext); 691 if( rc==SQLITE_OK ){ 692 sqlite3_free(pCur->pKey); 693 pCur->pKey = 0; 694 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 695 } 696 return rc; 697 } 698 699 #define restoreCursorPosition(p) \ 700 (p->eState>=CURSOR_REQUIRESEEK ? \ 701 btreeRestoreCursorPosition(p) : \ 702 SQLITE_OK) 703 704 /* 705 ** Determine whether or not a cursor has moved from the position it 706 ** was last placed at. Cursors can move when the row they are pointing 707 ** at is deleted out from under them. 708 ** 709 ** This routine returns an error code if something goes wrong. The 710 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not. 711 */ 712 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){ 713 int rc; 714 715 rc = restoreCursorPosition(pCur); 716 if( rc ){ 717 *pHasMoved = 1; 718 return rc; 719 } 720 if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){ 721 *pHasMoved = 1; 722 }else{ 723 *pHasMoved = 0; 724 } 725 return SQLITE_OK; 726 } 727 728 #ifndef SQLITE_OMIT_AUTOVACUUM 729 /* 730 ** Given a page number of a regular database page, return the page 731 ** number for the pointer-map page that contains the entry for the 732 ** input page number. 733 ** 734 ** Return 0 (not a valid page) for pgno==1 since there is 735 ** no pointer map associated with page 1. The integrity_check logic 736 ** requires that ptrmapPageno(*,1)!=1. 
737 */ 738 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 739 int nPagesPerMapPage; 740 Pgno iPtrMap, ret; 741 assert( sqlite3_mutex_held(pBt->mutex) ); 742 if( pgno<2 ) return 0; 743 nPagesPerMapPage = (pBt->usableSize/5)+1; 744 iPtrMap = (pgno-2)/nPagesPerMapPage; 745 ret = (iPtrMap*nPagesPerMapPage) + 2; 746 if( ret==PENDING_BYTE_PAGE(pBt) ){ 747 ret++; 748 } 749 return ret; 750 } 751 752 /* 753 ** Write an entry into the pointer map. 754 ** 755 ** This routine updates the pointer map entry for page number 'key' 756 ** so that it maps to type 'eType' and parent page number 'pgno'. 757 ** 758 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is 759 ** a no-op. If an error occurs, the appropriate error code is written 760 ** into *pRC. 761 */ 762 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){ 763 DbPage *pDbPage; /* The pointer map page */ 764 u8 *pPtrmap; /* The pointer map data */ 765 Pgno iPtrmap; /* The pointer map page number */ 766 int offset; /* Offset in pointer map page */ 767 int rc; /* Return code from subfunctions */ 768 769 if( *pRC ) return; 770 771 assert( sqlite3_mutex_held(pBt->mutex) ); 772 /* The master-journal page number must never be used as a pointer map page */ 773 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 774 775 assert( pBt->autoVacuum ); 776 if( key==0 ){ 777 *pRC = SQLITE_CORRUPT_BKPT; 778 return; 779 } 780 iPtrmap = PTRMAP_PAGENO(pBt, key); 781 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 782 if( rc!=SQLITE_OK ){ 783 *pRC = rc; 784 return; 785 } 786 offset = PTRMAP_PTROFFSET(iPtrmap, key); 787 if( offset<0 ){ 788 *pRC = SQLITE_CORRUPT_BKPT; 789 goto ptrmap_exit; 790 } 791 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 792 793 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 794 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); 795 *pRC= rc = sqlite3PagerWrite(pDbPage); 796 if( rc==SQLITE_OK ){ 797 pPtrmap[offset] = eType; 798 
put4byte(&pPtrmap[offset+1], parent); 799 } 800 } 801 802 ptrmap_exit: 803 sqlite3PagerUnref(pDbPage); 804 } 805 806 /* 807 ** Read an entry from the pointer map. 808 ** 809 ** This routine retrieves the pointer map entry for page 'key', writing 810 ** the type and parent page number to *pEType and *pPgno respectively. 811 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 812 */ 813 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 814 DbPage *pDbPage; /* The pointer map page */ 815 int iPtrmap; /* Pointer map page index */ 816 u8 *pPtrmap; /* Pointer map page data */ 817 int offset; /* Offset of entry in pointer map */ 818 int rc; 819 820 assert( sqlite3_mutex_held(pBt->mutex) ); 821 822 iPtrmap = PTRMAP_PAGENO(pBt, key); 823 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 824 if( rc!=0 ){ 825 return rc; 826 } 827 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 828 829 offset = PTRMAP_PTROFFSET(iPtrmap, key); 830 assert( pEType!=0 ); 831 *pEType = pPtrmap[offset]; 832 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 833 834 sqlite3PagerUnref(pDbPage); 835 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; 836 return SQLITE_OK; 837 } 838 839 #else /* if defined SQLITE_OMIT_AUTOVACUUM */ 840 #define ptrmapPut(w,x,y,z,rc) 841 #define ptrmapGet(w,x,y,z) SQLITE_OK 842 #define ptrmapPutOvflPtr(x, y, rc) 843 #endif 844 845 /* 846 ** Given a btree page and a cell index (0 means the first cell on 847 ** the page, 1 means the second cell, and so forth) return a pointer 848 ** to the cell content. 849 ** 850 ** This routine works only for pages that do not contain overflow cells. 851 */ 852 #define findCell(P,I) \ 853 ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)]))) 854 855 /* 856 ** This a more complex version of findCell() that works for 857 ** pages that do contain overflow cells. 
858 */ 859 static u8 *findOverflowCell(MemPage *pPage, int iCell){ 860 int i; 861 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 862 for(i=pPage->nOverflow-1; i>=0; i--){ 863 int k; 864 struct _OvflCell *pOvfl; 865 pOvfl = &pPage->aOvfl[i]; 866 k = pOvfl->idx; 867 if( k<=iCell ){ 868 if( k==iCell ){ 869 return pOvfl->pCell; 870 } 871 iCell--; 872 } 873 } 874 return findCell(pPage, iCell); 875 } 876 877 /* 878 ** Parse a cell content block and fill in the CellInfo structure. There 879 ** are two versions of this function. btreeParseCell() takes a 880 ** cell index as the second argument and btreeParseCellPtr() 881 ** takes a pointer to the body of the cell as its second argument. 882 ** 883 ** Within this file, the parseCell() macro can be called instead of 884 ** btreeParseCellPtr(). Using some compilers, this will be faster. 885 */ 886 static void btreeParseCellPtr( 887 MemPage *pPage, /* Page containing the cell */ 888 u8 *pCell, /* Pointer to the cell text. */ 889 CellInfo *pInfo /* Fill in this structure */ 890 ){ 891 u16 n; /* Number bytes in cell content header */ 892 u32 nPayload; /* Number of bytes of cell payload */ 893 894 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 895 896 pInfo->pCell = pCell; 897 assert( pPage->leaf==0 || pPage->leaf==1 ); 898 n = pPage->childPtrSize; 899 assert( n==4-4*pPage->leaf ); 900 if( pPage->intKey ){ 901 if( pPage->hasData ){ 902 n += getVarint32(&pCell[n], nPayload); 903 }else{ 904 nPayload = 0; 905 } 906 n += getVarint(&pCell[n], (u64*)&pInfo->nKey); 907 pInfo->nData = nPayload; 908 }else{ 909 pInfo->nData = 0; 910 n += getVarint32(&pCell[n], nPayload); 911 pInfo->nKey = nPayload; 912 } 913 pInfo->nPayload = nPayload; 914 pInfo->nHeader = n; 915 testcase( nPayload==pPage->maxLocal ); 916 testcase( nPayload==pPage->maxLocal+1 ); 917 if( likely(nPayload<=pPage->maxLocal) ){ 918 /* This is the (easy) common case where the entire payload fits 919 ** on the local page. No overflow is required. 
920 */ 921 int nSize; /* Total size of cell content in bytes */ 922 nSize = nPayload + n; 923 pInfo->nLocal = (u16)nPayload; 924 pInfo->iOverflow = 0; 925 if( (nSize & ~3)==0 ){ 926 nSize = 4; /* Minimum cell size is 4 */ 927 } 928 pInfo->nSize = (u16)nSize; 929 }else{ 930 /* If the payload will not fit completely on the local page, we have 931 ** to decide how much to store locally and how much to spill onto 932 ** overflow pages. The strategy is to minimize the amount of unused 933 ** space on overflow pages while keeping the amount of local storage 934 ** in between minLocal and maxLocal. 935 ** 936 ** Warning: changing the way overflow payload is distributed in any 937 ** way will result in an incompatible file format. 938 */ 939 int minLocal; /* Minimum amount of payload held locally */ 940 int maxLocal; /* Maximum amount of payload held locally */ 941 int surplus; /* Overflow payload available for local storage */ 942 943 minLocal = pPage->minLocal; 944 maxLocal = pPage->maxLocal; 945 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4); 946 testcase( surplus==maxLocal ); 947 testcase( surplus==maxLocal+1 ); 948 if( surplus <= maxLocal ){ 949 pInfo->nLocal = (u16)surplus; 950 }else{ 951 pInfo->nLocal = (u16)minLocal; 952 } 953 pInfo->iOverflow = (u16)(pInfo->nLocal + n); 954 pInfo->nSize = pInfo->iOverflow + 4; 955 } 956 } 957 #define parseCell(pPage, iCell, pInfo) \ 958 btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) 959 static void btreeParseCell( 960 MemPage *pPage, /* Page containing the cell */ 961 int iCell, /* The cell index. First cell is 0 */ 962 CellInfo *pInfo /* Fill in this structure */ 963 ){ 964 parseCell(pPage, iCell, pInfo); 965 } 966 967 /* 968 ** Compute the total number of bytes that a Cell needs in the cell 969 ** data area of the btree-page. The return number includes the cell 970 ** data header and the local payload, but not any overflow page or 971 ** the space used by the cell pointer. 
972 */ 973 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ 974 u8 *pIter = &pCell[pPage->childPtrSize]; 975 u32 nSize; 976 977 #ifdef SQLITE_DEBUG 978 /* The value returned by this function should always be the same as 979 ** the (CellInfo.nSize) value found by doing a full parse of the 980 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 981 ** this function verifies that this invariant is not violated. */ 982 CellInfo debuginfo; 983 btreeParseCellPtr(pPage, pCell, &debuginfo); 984 #endif 985 986 if( pPage->intKey ){ 987 u8 *pEnd; 988 if( pPage->hasData ){ 989 pIter += getVarint32(pIter, nSize); 990 }else{ 991 nSize = 0; 992 } 993 994 /* pIter now points at the 64-bit integer key value, a variable length 995 ** integer. The following block moves pIter to point at the first byte 996 ** past the end of the key value. */ 997 pEnd = &pIter[9]; 998 while( (*pIter++)&0x80 && pIter<pEnd ); 999 }else{ 1000 pIter += getVarint32(pIter, nSize); 1001 } 1002 1003 testcase( nSize==pPage->maxLocal ); 1004 testcase( nSize==pPage->maxLocal+1 ); 1005 if( nSize>pPage->maxLocal ){ 1006 int minLocal = pPage->minLocal; 1007 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 1008 testcase( nSize==pPage->maxLocal ); 1009 testcase( nSize==pPage->maxLocal+1 ); 1010 if( nSize>pPage->maxLocal ){ 1011 nSize = minLocal; 1012 } 1013 nSize += 4; 1014 } 1015 nSize += (u32)(pIter - pCell); 1016 1017 /* The minimum size of any cell is 4 bytes. */ 1018 if( nSize<4 ){ 1019 nSize = 4; 1020 } 1021 1022 assert( nSize==debuginfo.nSize ); 1023 return (u16)nSize; 1024 } 1025 1026 #ifdef SQLITE_DEBUG 1027 /* This variation on cellSizePtr() is used inside of assert() statements 1028 ** only. 
*/ 1029 static u16 cellSize(MemPage *pPage, int iCell){ 1030 return cellSizePtr(pPage, findCell(pPage, iCell)); 1031 } 1032 #endif 1033 1034 #ifndef SQLITE_OMIT_AUTOVACUUM 1035 /* 1036 ** If the cell pCell, part of page pPage contains a pointer 1037 ** to an overflow page, insert an entry into the pointer-map 1038 ** for the overflow page. 1039 */ 1040 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){ 1041 CellInfo info; 1042 if( *pRC ) return; 1043 assert( pCell!=0 ); 1044 btreeParseCellPtr(pPage, pCell, &info); 1045 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); 1046 if( info.iOverflow ){ 1047 Pgno ovfl = get4byte(&pCell[info.iOverflow]); 1048 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); 1049 } 1050 } 1051 #endif 1052 1053 1054 /* 1055 ** Defragment the page given. All Cells are moved to the 1056 ** end of the page and all free space is collected into one 1057 ** big FreeBlk that occurs in between the header and cell 1058 ** pointer array and the cell content area. 
1059 */ 1060 static int defragmentPage(MemPage *pPage){ 1061 int i; /* Loop counter */ 1062 int pc; /* Address of a i-th cell */ 1063 int hdr; /* Offset to the page header */ 1064 int size; /* Size of a cell */ 1065 int usableSize; /* Number of usable bytes on a page */ 1066 int cellOffset; /* Offset to the cell pointer array */ 1067 int cbrk; /* Offset to the cell content area */ 1068 int nCell; /* Number of cells on the page */ 1069 unsigned char *data; /* The page data */ 1070 unsigned char *temp; /* Temp area for cell content */ 1071 int iCellFirst; /* First allowable cell index */ 1072 int iCellLast; /* Last possible cell index */ 1073 1074 1075 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1076 assert( pPage->pBt!=0 ); 1077 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); 1078 assert( pPage->nOverflow==0 ); 1079 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1080 temp = sqlite3PagerTempSpace(pPage->pBt->pPager); 1081 data = pPage->aData; 1082 hdr = pPage->hdrOffset; 1083 cellOffset = pPage->cellOffset; 1084 nCell = pPage->nCell; 1085 assert( nCell==get2byte(&data[hdr+3]) ); 1086 usableSize = pPage->pBt->usableSize; 1087 cbrk = get2byte(&data[hdr+5]); 1088 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk); 1089 cbrk = usableSize; 1090 iCellFirst = cellOffset + 2*nCell; 1091 iCellLast = usableSize - 4; 1092 for(i=0; i<nCell; i++){ 1093 u8 *pAddr; /* The i-th cell pointer */ 1094 pAddr = &data[cellOffset + i*2]; 1095 pc = get2byte(pAddr); 1096 testcase( pc==iCellFirst ); 1097 testcase( pc==iCellLast ); 1098 #if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) 1099 /* These conditions have already been verified in btreeInitPage() 1100 ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined 1101 */ 1102 if( pc<iCellFirst || pc>iCellLast ){ 1103 return SQLITE_CORRUPT_BKPT; 1104 } 1105 #endif 1106 assert( pc>=iCellFirst && pc<=iCellLast ); 1107 size = cellSizePtr(pPage, &temp[pc]); 1108 cbrk -= size; 1109 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) 1110 
if( cbrk<iCellFirst ){ 1111 return SQLITE_CORRUPT_BKPT; 1112 } 1113 #else 1114 if( cbrk<iCellFirst || pc+size>usableSize ){ 1115 return SQLITE_CORRUPT_BKPT; 1116 } 1117 #endif 1118 assert( cbrk+size<=usableSize && cbrk>=iCellFirst ); 1119 testcase( cbrk+size==usableSize ); 1120 testcase( pc+size==usableSize ); 1121 memcpy(&data[cbrk], &temp[pc], size); 1122 put2byte(pAddr, cbrk); 1123 } 1124 assert( cbrk>=iCellFirst ); 1125 put2byte(&data[hdr+5], cbrk); 1126 data[hdr+1] = 0; 1127 data[hdr+2] = 0; 1128 data[hdr+7] = 0; 1129 memset(&data[iCellFirst], 0, cbrk-iCellFirst); 1130 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1131 if( cbrk-iCellFirst!=pPage->nFree ){ 1132 return SQLITE_CORRUPT_BKPT; 1133 } 1134 return SQLITE_OK; 1135 } 1136 1137 /* 1138 ** Allocate nByte bytes of space from within the B-Tree page passed 1139 ** as the first argument. Write into *pIdx the index into pPage->aData[] 1140 ** of the first byte of allocated space. Return either SQLITE_OK or 1141 ** an error code (usually SQLITE_CORRUPT). 1142 ** 1143 ** The caller guarantees that there is sufficient space to make the 1144 ** allocation. This routine might need to defragment in order to bring 1145 ** all the space together, however. This routine will avoid using 1146 ** the first two bytes past the cell pointer area since presumably this 1147 ** allocation is being made in order to insert a new cell, so we will 1148 ** also end up needing a new cell pointer. 
1149 */ 1150 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ 1151 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ 1152 u8 * const data = pPage->aData; /* Local cache of pPage->aData */ 1153 int nFrag; /* Number of fragmented bytes on pPage */ 1154 int top; /* First byte of cell content area */ 1155 int gap; /* First byte of gap between cell pointers and cell content */ 1156 int rc; /* Integer return code */ 1157 int usableSize; /* Usable size of the page */ 1158 1159 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1160 assert( pPage->pBt ); 1161 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1162 assert( nByte>=0 ); /* Minimum cell size is 4 */ 1163 assert( pPage->nFree>=nByte ); 1164 assert( pPage->nOverflow==0 ); 1165 usableSize = pPage->pBt->usableSize; 1166 assert( nByte < usableSize-8 ); 1167 1168 nFrag = data[hdr+7]; 1169 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); 1170 gap = pPage->cellOffset + 2*pPage->nCell; 1171 top = get2byteNotZero(&data[hdr+5]); 1172 if( gap>top ) return SQLITE_CORRUPT_BKPT; 1173 testcase( gap+2==top ); 1174 testcase( gap+1==top ); 1175 testcase( gap==top ); 1176 1177 if( nFrag>=60 ){ 1178 /* Always defragment highly fragmented pages */ 1179 rc = defragmentPage(pPage); 1180 if( rc ) return rc; 1181 top = get2byteNotZero(&data[hdr+5]); 1182 }else if( gap+2<=top ){ 1183 /* Search the freelist looking for a free slot big enough to satisfy 1184 ** the request. The allocation is made from the first free slot in 1185 ** the list that is large enough to accomadate it. 1186 */ 1187 int pc, addr; 1188 for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){ 1189 int size; /* Size of the free slot */ 1190 if( pc>usableSize-4 || pc<addr+4 ){ 1191 return SQLITE_CORRUPT_BKPT; 1192 } 1193 size = get2byte(&data[pc+2]); 1194 if( size>=nByte ){ 1195 int x = size - nByte; 1196 testcase( x==4 ); 1197 testcase( x==3 ); 1198 if( x<4 ){ 1199 /* Remove the slot from the free-list. 
Update the number of 1200 ** fragmented bytes within the page. */ 1201 memcpy(&data[addr], &data[pc], 2); 1202 data[hdr+7] = (u8)(nFrag + x); 1203 }else if( size+pc > usableSize ){ 1204 return SQLITE_CORRUPT_BKPT; 1205 }else{ 1206 /* The slot remains on the free-list. Reduce its size to account 1207 ** for the portion used by the new allocation. */ 1208 put2byte(&data[pc+2], x); 1209 } 1210 *pIdx = pc + x; 1211 return SQLITE_OK; 1212 } 1213 } 1214 } 1215 1216 /* Check to make sure there is enough space in the gap to satisfy 1217 ** the allocation. If not, defragment. 1218 */ 1219 testcase( gap+2+nByte==top ); 1220 if( gap+2+nByte>top ){ 1221 rc = defragmentPage(pPage); 1222 if( rc ) return rc; 1223 top = get2byteNotZero(&data[hdr+5]); 1224 assert( gap+nByte<=top ); 1225 } 1226 1227 1228 /* Allocate memory from the gap in between the cell pointer array 1229 ** and the cell content area. The btreeInitPage() call has already 1230 ** validated the freelist. Given that the freelist is valid, there 1231 ** is no way that the allocation can extend off the end of the page. 1232 ** The assert() below verifies the previous sentence. 1233 */ 1234 top -= nByte; 1235 put2byte(&data[hdr+5], top); 1236 assert( top+nByte <= pPage->pBt->usableSize ); 1237 *pIdx = top; 1238 return SQLITE_OK; 1239 } 1240 1241 /* 1242 ** Return a section of the pPage->aData to the freelist. 1243 ** The first byte of the new free block is pPage->aDisk[start] 1244 ** and the size of the block is "size" bytes. 1245 ** 1246 ** Most of the effort here is involved in coalesing adjacent 1247 ** free blocks into a single big free block. 
1248 */ 1249 static int freeSpace(MemPage *pPage, int start, int size){ 1250 int addr, pbegin, hdr; 1251 int iLast; /* Largest possible freeblock offset */ 1252 unsigned char *data = pPage->aData; 1253 1254 assert( pPage->pBt!=0 ); 1255 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1256 assert( start>=pPage->hdrOffset+6+pPage->childPtrSize ); 1257 assert( (start + size)<=pPage->pBt->usableSize ); 1258 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1259 assert( size>=0 ); /* Minimum cell size is 4 */ 1260 1261 if( pPage->pBt->secureDelete ){ 1262 /* Overwrite deleted information with zeros when the secure_delete 1263 ** option is enabled */ 1264 memset(&data[start], 0, size); 1265 } 1266 1267 /* Add the space back into the linked list of freeblocks. Note that 1268 ** even though the freeblock list was checked by btreeInitPage(), 1269 ** btreeInitPage() did not detect overlapping cells or 1270 ** freeblocks that overlapped cells. Nor does it detect when the 1271 ** cell content area exceeds the value in the page header. If these 1272 ** situations arise, then subsequent insert operations might corrupt 1273 ** the freelist. So we do need to check for corruption while scanning 1274 ** the freelist. 
1275 */ 1276 hdr = pPage->hdrOffset; 1277 addr = hdr + 1; 1278 iLast = pPage->pBt->usableSize - 4; 1279 assert( start<=iLast ); 1280 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){ 1281 if( pbegin<addr+4 ){ 1282 return SQLITE_CORRUPT_BKPT; 1283 } 1284 addr = pbegin; 1285 } 1286 if( pbegin>iLast ){ 1287 return SQLITE_CORRUPT_BKPT; 1288 } 1289 assert( pbegin>addr || pbegin==0 ); 1290 put2byte(&data[addr], start); 1291 put2byte(&data[start], pbegin); 1292 put2byte(&data[start+2], size); 1293 pPage->nFree = pPage->nFree + (u16)size; 1294 1295 /* Coalesce adjacent free blocks */ 1296 addr = hdr + 1; 1297 while( (pbegin = get2byte(&data[addr]))>0 ){ 1298 int pnext, psize, x; 1299 assert( pbegin>addr ); 1300 assert( pbegin<=pPage->pBt->usableSize-4 ); 1301 pnext = get2byte(&data[pbegin]); 1302 psize = get2byte(&data[pbegin+2]); 1303 if( pbegin + psize + 3 >= pnext && pnext>0 ){ 1304 int frag = pnext - (pbegin+psize); 1305 if( (frag<0) || (frag>(int)data[hdr+7]) ){ 1306 return SQLITE_CORRUPT_BKPT; 1307 } 1308 data[hdr+7] -= (u8)frag; 1309 x = get2byte(&data[pnext]); 1310 put2byte(&data[pbegin], x); 1311 x = pnext + get2byte(&data[pnext+2]) - pbegin; 1312 put2byte(&data[pbegin+2], x); 1313 }else{ 1314 addr = pbegin; 1315 } 1316 } 1317 1318 /* If the cell content area begins with a freeblock, remove it. */ 1319 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){ 1320 int top; 1321 pbegin = get2byte(&data[hdr+1]); 1322 memcpy(&data[hdr+1], &data[pbegin], 2); 1323 top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]); 1324 put2byte(&data[hdr+5], top); 1325 } 1326 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1327 return SQLITE_OK; 1328 } 1329 1330 /* 1331 ** Decode the flags byte (the first byte of the header) for a page 1332 ** and initialize fields of the MemPage structure accordingly. 1333 ** 1334 ** Only the following combinations are supported. 
Anything different 1335 ** indicates a corrupt database files: 1336 ** 1337 ** PTF_ZERODATA 1338 ** PTF_ZERODATA | PTF_LEAF 1339 ** PTF_LEAFDATA | PTF_INTKEY 1340 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF 1341 */ 1342 static int decodeFlags(MemPage *pPage, int flagByte){ 1343 BtShared *pBt; /* A copy of pPage->pBt */ 1344 1345 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) ); 1346 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1347 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 ); 1348 flagByte &= ~PTF_LEAF; 1349 pPage->childPtrSize = 4-4*pPage->leaf; 1350 pBt = pPage->pBt; 1351 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ 1352 pPage->intKey = 1; 1353 pPage->hasData = pPage->leaf; 1354 pPage->maxLocal = pBt->maxLeaf; 1355 pPage->minLocal = pBt->minLeaf; 1356 }else if( flagByte==PTF_ZERODATA ){ 1357 pPage->intKey = 0; 1358 pPage->hasData = 0; 1359 pPage->maxLocal = pBt->maxLocal; 1360 pPage->minLocal = pBt->minLocal; 1361 }else{ 1362 return SQLITE_CORRUPT_BKPT; 1363 } 1364 return SQLITE_OK; 1365 } 1366 1367 /* 1368 ** Initialize the auxiliary information for a disk block. 1369 ** 1370 ** Return SQLITE_OK on success. If we see that the page does 1371 ** not contain a well-formed database page, then return 1372 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 1373 ** guarantee that the page is well-formed. It only shows that 1374 ** we failed to detect any corruption. 
1375 */ 1376 static int btreeInitPage(MemPage *pPage){ 1377 1378 assert( pPage->pBt!=0 ); 1379 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1380 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 1381 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 1382 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 1383 1384 if( !pPage->isInit ){ 1385 u16 pc; /* Address of a freeblock within pPage->aData[] */ 1386 u8 hdr; /* Offset to beginning of page header */ 1387 u8 *data; /* Equal to pPage->aData */ 1388 BtShared *pBt; /* The main btree structure */ 1389 int usableSize; /* Amount of usable space on each page */ 1390 u16 cellOffset; /* Offset from start of page to first cell pointer */ 1391 int nFree; /* Number of unused bytes on the page */ 1392 int top; /* First byte of the cell content area */ 1393 int iCellFirst; /* First allowable cell or freeblock offset */ 1394 int iCellLast; /* Last possible cell or freeblock offset */ 1395 1396 pBt = pPage->pBt; 1397 1398 hdr = pPage->hdrOffset; 1399 data = pPage->aData; 1400 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT; 1401 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 1402 pPage->maskPage = (u16)(pBt->pageSize - 1); 1403 pPage->nOverflow = 0; 1404 usableSize = pBt->usableSize; 1405 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; 1406 top = get2byteNotZero(&data[hdr+5]); 1407 pPage->nCell = get2byte(&data[hdr+3]); 1408 if( pPage->nCell>MX_CELL(pBt) ){ 1409 /* To many cells for a single page. The page must be corrupt */ 1410 return SQLITE_CORRUPT_BKPT; 1411 } 1412 testcase( pPage->nCell==MX_CELL(pBt) ); 1413 1414 /* A malformed database page might cause us to read past the end 1415 ** of page when parsing a cell. 1416 ** 1417 ** The following block of code checks early to see if a cell extends 1418 ** past the end of a page boundary and causes SQLITE_CORRUPT to be 1419 ** returned if it does. 
1420 */ 1421 iCellFirst = cellOffset + 2*pPage->nCell; 1422 iCellLast = usableSize - 4; 1423 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) 1424 { 1425 int i; /* Index into the cell pointer array */ 1426 int sz; /* Size of a cell */ 1427 1428 if( !pPage->leaf ) iCellLast--; 1429 for(i=0; i<pPage->nCell; i++){ 1430 pc = get2byte(&data[cellOffset+i*2]); 1431 testcase( pc==iCellFirst ); 1432 testcase( pc==iCellLast ); 1433 if( pc<iCellFirst || pc>iCellLast ){ 1434 return SQLITE_CORRUPT_BKPT; 1435 } 1436 sz = cellSizePtr(pPage, &data[pc]); 1437 testcase( pc+sz==usableSize ); 1438 if( pc+sz>usableSize ){ 1439 return SQLITE_CORRUPT_BKPT; 1440 } 1441 } 1442 if( !pPage->leaf ) iCellLast++; 1443 } 1444 #endif 1445 1446 /* Compute the total free space on the page */ 1447 pc = get2byte(&data[hdr+1]); 1448 nFree = data[hdr+7] + top; 1449 while( pc>0 ){ 1450 u16 next, size; 1451 if( pc<iCellFirst || pc>iCellLast ){ 1452 /* Start of free block is off the page */ 1453 return SQLITE_CORRUPT_BKPT; 1454 } 1455 next = get2byte(&data[pc]); 1456 size = get2byte(&data[pc+2]); 1457 if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){ 1458 /* Free blocks must be in ascending order. And the last byte of 1459 ** the free-block must lie on the database page. */ 1460 return SQLITE_CORRUPT_BKPT; 1461 } 1462 nFree = nFree + size; 1463 pc = next; 1464 } 1465 1466 /* At this point, nFree contains the sum of the offset to the start 1467 ** of the cell-content area plus the number of free bytes within 1468 ** the cell-content area. If this is greater than the usable-size 1469 ** of the page, then the page must be corrupted. This check also 1470 ** serves to verify that the offset to the start of the cell-content 1471 ** area, according to the page header, lies within the page. 
1472 */ 1473 if( nFree>usableSize ){ 1474 return SQLITE_CORRUPT_BKPT; 1475 } 1476 pPage->nFree = (u16)(nFree - iCellFirst); 1477 pPage->isInit = 1; 1478 } 1479 return SQLITE_OK; 1480 } 1481 1482 /* 1483 ** Set up a raw page so that it looks like a database page holding 1484 ** no entries. 1485 */ 1486 static void zeroPage(MemPage *pPage, int flags){ 1487 unsigned char *data = pPage->aData; 1488 BtShared *pBt = pPage->pBt; 1489 u8 hdr = pPage->hdrOffset; 1490 u16 first; 1491 1492 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); 1493 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1494 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 1495 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1496 assert( sqlite3_mutex_held(pBt->mutex) ); 1497 if( pBt->secureDelete ){ 1498 memset(&data[hdr], 0, pBt->usableSize - hdr); 1499 } 1500 data[hdr] = (char)flags; 1501 first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0); 1502 memset(&data[hdr+1], 0, 4); 1503 data[hdr+7] = 0; 1504 put2byte(&data[hdr+5], pBt->usableSize); 1505 pPage->nFree = (u16)(pBt->usableSize - first); 1506 decodeFlags(pPage, flags); 1507 pPage->hdrOffset = hdr; 1508 pPage->cellOffset = first; 1509 pPage->nOverflow = 0; 1510 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 1511 pPage->maskPage = (u16)(pBt->pageSize - 1); 1512 pPage->nCell = 0; 1513 pPage->isInit = 1; 1514 } 1515 1516 1517 /* 1518 ** Convert a DbPage obtained from the pager into a MemPage used by 1519 ** the btree layer. 1520 */ 1521 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ 1522 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 1523 pPage->aData = sqlite3PagerGetData(pDbPage); 1524 pPage->pDbPage = pDbPage; 1525 pPage->pBt = pBt; 1526 pPage->pgno = pgno; 1527 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; 1528 return pPage; 1529 } 1530 1531 /* 1532 ** Get a page from the pager. Initialize the MemPage.pBt and 1533 ** MemPage.aData elements if needed. 
1534 ** 1535 ** If the noContent flag is set, it means that we do not care about 1536 ** the content of the page at this time. So do not go to the disk 1537 ** to fetch the content. Just fill in the content with zeros for now. 1538 ** If in the future we call sqlite3PagerWrite() on this page, that 1539 ** means we have started to be concerned about content and the disk 1540 ** read should occur at that point. 1541 */ 1542 static int btreeGetPage( 1543 BtShared *pBt, /* The btree */ 1544 Pgno pgno, /* Number of the page to fetch */ 1545 MemPage **ppPage, /* Return the page in this parameter */ 1546 int noContent /* Do not load page content if true */ 1547 ){ 1548 int rc; 1549 DbPage *pDbPage; 1550 1551 assert( sqlite3_mutex_held(pBt->mutex) ); 1552 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent); 1553 if( rc ) return rc; 1554 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); 1555 return SQLITE_OK; 1556 } 1557 1558 /* 1559 ** Retrieve a page from the pager cache. If the requested page is not 1560 ** already in the pager cache return NULL. Initialize the MemPage.pBt and 1561 ** MemPage.aData elements if needed. 1562 */ 1563 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ 1564 DbPage *pDbPage; 1565 assert( sqlite3_mutex_held(pBt->mutex) ); 1566 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 1567 if( pDbPage ){ 1568 return btreePageFromDbPage(pDbPage, pgno, pBt); 1569 } 1570 return 0; 1571 } 1572 1573 /* 1574 ** Return the size of the database file in pages. If there is any kind of 1575 ** error, return ((unsigned int)-1). 1576 */ 1577 static Pgno btreePagecount(BtShared *pBt){ 1578 return pBt->nPage; 1579 } 1580 u32 sqlite3BtreeLastPage(Btree *p){ 1581 assert( sqlite3BtreeHoldsMutex(p) ); 1582 assert( ((p->pBt->nPage)&0x8000000)==0 ); 1583 return (int)btreePagecount(p->pBt); 1584 } 1585 1586 /* 1587 ** Get a page from the pager and initialize it. 
This routine is just a 1588 ** convenience wrapper around separate calls to btreeGetPage() and 1589 ** btreeInitPage(). 1590 ** 1591 ** If an error occurs, then the value *ppPage is set to is undefined. It 1592 ** may remain unchanged, or it may be set to an invalid value. 1593 */ 1594 static int getAndInitPage( 1595 BtShared *pBt, /* The database file */ 1596 Pgno pgno, /* Number of the page to get */ 1597 MemPage **ppPage /* Write the page pointer here */ 1598 ){ 1599 int rc; 1600 assert( sqlite3_mutex_held(pBt->mutex) ); 1601 1602 if( pgno>btreePagecount(pBt) ){ 1603 rc = SQLITE_CORRUPT_BKPT; 1604 }else{ 1605 rc = btreeGetPage(pBt, pgno, ppPage, 0); 1606 if( rc==SQLITE_OK ){ 1607 rc = btreeInitPage(*ppPage); 1608 if( rc!=SQLITE_OK ){ 1609 releasePage(*ppPage); 1610 } 1611 } 1612 } 1613 1614 testcase( pgno==0 ); 1615 assert( pgno!=0 || rc==SQLITE_CORRUPT ); 1616 return rc; 1617 } 1618 1619 /* 1620 ** Release a MemPage. This should be called once for each prior 1621 ** call to btreeGetPage. 1622 */ 1623 static void releasePage(MemPage *pPage){ 1624 if( pPage ){ 1625 assert( pPage->aData ); 1626 assert( pPage->pBt ); 1627 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1628 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 1629 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1630 sqlite3PagerUnref(pPage->pDbPage); 1631 } 1632 } 1633 1634 /* 1635 ** During a rollback, when the pager reloads information into the cache 1636 ** so that the cache is restored to its original state at the start of 1637 ** the transaction, for each page restored this routine is called. 1638 ** 1639 ** This routine needs to reset the extra data section at the end of the 1640 ** page to agree with the restored data. 
1641 */ 1642 static void pageReinit(DbPage *pData){ 1643 MemPage *pPage; 1644 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 1645 assert( sqlite3PagerPageRefcount(pData)>0 ); 1646 if( pPage->isInit ){ 1647 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1648 pPage->isInit = 0; 1649 if( sqlite3PagerPageRefcount(pData)>1 ){ 1650 /* pPage might not be a btree page; it might be an overflow page 1651 ** or ptrmap page or a free page. In those cases, the following 1652 ** call to btreeInitPage() will likely return SQLITE_CORRUPT. 1653 ** But no harm is done by this. And it is very important that 1654 ** btreeInitPage() be called on every btree page so we make 1655 ** the call for every page that comes in for re-initing. */ 1656 btreeInitPage(pPage); 1657 } 1658 } 1659 } 1660 1661 /* 1662 ** Invoke the busy handler for a btree. 1663 */ 1664 static int btreeInvokeBusyHandler(void *pArg){ 1665 BtShared *pBt = (BtShared*)pArg; 1666 assert( pBt->db ); 1667 assert( sqlite3_mutex_held(pBt->db->mutex) ); 1668 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); 1669 } 1670 1671 /* 1672 ** Open a database file. 1673 ** 1674 ** zFilename is the name of the database file. If zFilename is NULL 1675 ** then an ephemeral database is created. The ephemeral database might 1676 ** be exclusively in memory, or it might use a disk-based memory cache. 1677 ** Either way, the ephemeral database will be automatically deleted 1678 ** when sqlite3BtreeClose() is called. 1679 ** 1680 ** If zFilename is ":memory:" then an in-memory database is created 1681 ** that is automatically destroyed when it is closed. 1682 ** 1683 ** The "flags" parameter is a bitmask that might contain bits 1684 ** BTREE_OMIT_JOURNAL and/or BTREE_NO_READLOCK. The BTREE_NO_READLOCK 1685 ** bit is also set if the SQLITE_NoReadlock flags is set in db->flags. 1686 ** These flags are passed through into sqlite3PagerOpen() and must 1687 ** be the same values as PAGER_OMIT_JOURNAL and PAGER_NO_READLOCK. 
1688 ** 1689 ** If the database is already opened in the same database connection 1690 ** and we are in shared cache mode, then the open will fail with an 1691 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared 1692 ** objects in the same database connection since doing so will lead 1693 ** to problems with locking. 1694 */ 1695 int sqlite3BtreeOpen( 1696 const char *zFilename, /* Name of the file containing the BTree database */ 1697 sqlite3 *db, /* Associated database handle */ 1698 Btree **ppBtree, /* Pointer to new Btree object written here */ 1699 int flags, /* Options */ 1700 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ 1701 ){ 1702 sqlite3_vfs *pVfs; /* The VFS to use for this btree */ 1703 BtShared *pBt = 0; /* Shared part of btree structure */ 1704 Btree *p; /* Handle to return */ 1705 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */ 1706 int rc = SQLITE_OK; /* Result code from this function */ 1707 u8 nReserve; /* Byte of unused space on each page */ 1708 unsigned char zDbHeader[100]; /* Database header content */ 1709 1710 /* True if opening an ephemeral, temporary database */ 1711 const int isTempDb = zFilename==0 || zFilename[0]==0; 1712 1713 /* Set the variable isMemdb to true for an in-memory database, or 1714 ** false for a file-based database. This symbol is only required if 1715 ** either of the shared-data or autovacuum features are compiled 1716 ** into the library. 
1717 */ 1718 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM) 1719 #ifdef SQLITE_OMIT_MEMORYDB 1720 const int isMemdb = 0; 1721 #else 1722 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0) 1723 || (isTempDb && sqlite3TempInMemory(db)); 1724 #endif 1725 #endif 1726 1727 assert( db!=0 ); 1728 assert( sqlite3_mutex_held(db->mutex) ); 1729 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */ 1730 1731 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */ 1732 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 ); 1733 1734 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */ 1735 assert( (flags & BTREE_SINGLE)==0 || isTempDb ); 1736 1737 if( db->flags & SQLITE_NoReadlock ){ 1738 flags |= BTREE_NO_READLOCK; 1739 } 1740 if( isMemdb ){ 1741 flags |= BTREE_MEMORY; 1742 } 1743 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){ 1744 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB; 1745 } 1746 pVfs = db->pVfs; 1747 p = sqlite3MallocZero(sizeof(Btree)); 1748 if( !p ){ 1749 return SQLITE_NOMEM; 1750 } 1751 p->inTrans = TRANS_NONE; 1752 p->db = db; 1753 #ifndef SQLITE_OMIT_SHARED_CACHE 1754 p->lock.pBtree = p; 1755 p->lock.iTable = 1; 1756 #endif 1757 1758 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1759 /* 1760 ** If this Btree is a candidate for shared cache, try to find an 1761 ** existing BtShared object that we can share with 1762 */ 1763 if( isMemdb==0 && isTempDb==0 ){ 1764 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){ 1765 int nFullPathname = pVfs->mxPathname+1; 1766 char *zFullPathname = sqlite3Malloc(nFullPathname); 1767 sqlite3_mutex *mutexShared; 1768 p->sharable = 1; 1769 if( !zFullPathname ){ 1770 sqlite3_free(p); 1771 return SQLITE_NOMEM; 1772 } 1773 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname); 1774 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN); 1775 sqlite3_mutex_enter(mutexOpen); 1776 
mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1777 sqlite3_mutex_enter(mutexShared); 1778 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){ 1779 assert( pBt->nRef>0 ); 1780 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager)) 1781 && sqlite3PagerVfs(pBt->pPager)==pVfs ){ 1782 int iDb; 1783 for(iDb=db->nDb-1; iDb>=0; iDb--){ 1784 Btree *pExisting = db->aDb[iDb].pBt; 1785 if( pExisting && pExisting->pBt==pBt ){ 1786 sqlite3_mutex_leave(mutexShared); 1787 sqlite3_mutex_leave(mutexOpen); 1788 sqlite3_free(zFullPathname); 1789 sqlite3_free(p); 1790 return SQLITE_CONSTRAINT; 1791 } 1792 } 1793 p->pBt = pBt; 1794 pBt->nRef++; 1795 break; 1796 } 1797 } 1798 sqlite3_mutex_leave(mutexShared); 1799 sqlite3_free(zFullPathname); 1800 } 1801 #ifdef SQLITE_DEBUG 1802 else{ 1803 /* In debug mode, we mark all persistent databases as sharable 1804 ** even when they are not. This exercises the locking code and 1805 ** gives more opportunity for asserts(sqlite3_mutex_held()) 1806 ** statements to find locking problems. 1807 */ 1808 p->sharable = 1; 1809 } 1810 #endif 1811 } 1812 #endif 1813 if( pBt==0 ){ 1814 /* 1815 ** The following asserts make sure that structures used by the btree are 1816 ** the right size. This is to guard against size changes that result 1817 ** when compiling on a different architecture. 
1818 */ 1819 assert( sizeof(i64)==8 || sizeof(i64)==4 ); 1820 assert( sizeof(u64)==8 || sizeof(u64)==4 ); 1821 assert( sizeof(u32)==4 ); 1822 assert( sizeof(u16)==2 ); 1823 assert( sizeof(Pgno)==4 ); 1824 1825 pBt = sqlite3MallocZero( sizeof(*pBt) ); 1826 if( pBt==0 ){ 1827 rc = SQLITE_NOMEM; 1828 goto btree_open_out; 1829 } 1830 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, 1831 EXTRA_SIZE, flags, vfsFlags, pageReinit); 1832 if( rc==SQLITE_OK ){ 1833 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); 1834 } 1835 if( rc!=SQLITE_OK ){ 1836 goto btree_open_out; 1837 } 1838 pBt->openFlags = (u8)flags; 1839 pBt->db = db; 1840 sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt); 1841 p->pBt = pBt; 1842 1843 pBt->pCursor = 0; 1844 pBt->pPage1 = 0; 1845 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager); 1846 #ifdef SQLITE_SECURE_DELETE 1847 pBt->secureDelete = 1; 1848 #endif 1849 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16); 1850 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE 1851 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ 1852 pBt->pageSize = 0; 1853 #ifndef SQLITE_OMIT_AUTOVACUUM 1854 /* If the magic name ":memory:" will create an in-memory database, then 1855 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if 1856 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if 1857 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a 1858 ** regular file-name. In this case the auto-vacuum applies as per normal. 1859 */ 1860 if( zFilename && !isMemdb ){ 1861 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); 1862 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 
1 : 0); 1863 } 1864 #endif 1865 nReserve = 0; 1866 }else{ 1867 nReserve = zDbHeader[20]; 1868 pBt->pageSizeFixed = 1; 1869 #ifndef SQLITE_OMIT_AUTOVACUUM 1870 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); 1871 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); 1872 #endif 1873 } 1874 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 1875 if( rc ) goto btree_open_out; 1876 pBt->usableSize = pBt->pageSize - nReserve; 1877 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ 1878 1879 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1880 /* Add the new BtShared object to the linked list sharable BtShareds. 1881 */ 1882 if( p->sharable ){ 1883 sqlite3_mutex *mutexShared; 1884 pBt->nRef = 1; 1885 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1886 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){ 1887 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST); 1888 if( pBt->mutex==0 ){ 1889 rc = SQLITE_NOMEM; 1890 db->mallocFailed = 0; 1891 goto btree_open_out; 1892 } 1893 } 1894 sqlite3_mutex_enter(mutexShared); 1895 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList); 1896 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt; 1897 sqlite3_mutex_leave(mutexShared); 1898 } 1899 #endif 1900 } 1901 1902 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1903 /* If the new Btree uses a sharable pBtShared, then link the new 1904 ** Btree into the list of all sharable Btrees for the same connection. 1905 ** The list is kept in ascending order by pBt address. 
1906 */ 1907 if( p->sharable ){ 1908 int i; 1909 Btree *pSib; 1910 for(i=0; i<db->nDb; i++){ 1911 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ 1912 while( pSib->pPrev ){ pSib = pSib->pPrev; } 1913 if( p->pBt<pSib->pBt ){ 1914 p->pNext = pSib; 1915 p->pPrev = 0; 1916 pSib->pPrev = p; 1917 }else{ 1918 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){ 1919 pSib = pSib->pNext; 1920 } 1921 p->pNext = pSib->pNext; 1922 p->pPrev = pSib; 1923 if( p->pNext ){ 1924 p->pNext->pPrev = p; 1925 } 1926 pSib->pNext = p; 1927 } 1928 break; 1929 } 1930 } 1931 } 1932 #endif 1933 *ppBtree = p; 1934 1935 btree_open_out: 1936 if( rc!=SQLITE_OK ){ 1937 if( pBt && pBt->pPager ){ 1938 sqlite3PagerClose(pBt->pPager); 1939 } 1940 sqlite3_free(pBt); 1941 sqlite3_free(p); 1942 *ppBtree = 0; 1943 }else{ 1944 /* If the B-Tree was successfully opened, set the pager-cache size to the 1945 ** default value. Except, when opening on an existing shared pager-cache, 1946 ** do not change the pager-cache size. 1947 */ 1948 if( sqlite3BtreeSchema(p, 0, 0)==0 ){ 1949 sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE); 1950 } 1951 } 1952 if( mutexOpen ){ 1953 assert( sqlite3_mutex_held(mutexOpen) ); 1954 sqlite3_mutex_leave(mutexOpen); 1955 } 1956 return rc; 1957 } 1958 1959 /* 1960 ** Decrement the BtShared.nRef counter. When it reaches zero, 1961 ** remove the BtShared structure from the sharing list. Return 1962 ** true if the BtShared.nRef counter reaches zero and return 1963 ** false if it is still positive. 
1964 */ 1965 static int removeFromSharingList(BtShared *pBt){ 1966 #ifndef SQLITE_OMIT_SHARED_CACHE 1967 sqlite3_mutex *pMaster; 1968 BtShared *pList; 1969 int removed = 0; 1970 1971 assert( sqlite3_mutex_notheld(pBt->mutex) ); 1972 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1973 sqlite3_mutex_enter(pMaster); 1974 pBt->nRef--; 1975 if( pBt->nRef<=0 ){ 1976 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){ 1977 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; 1978 }else{ 1979 pList = GLOBAL(BtShared*,sqlite3SharedCacheList); 1980 while( ALWAYS(pList) && pList->pNext!=pBt ){ 1981 pList=pList->pNext; 1982 } 1983 if( ALWAYS(pList) ){ 1984 pList->pNext = pBt->pNext; 1985 } 1986 } 1987 if( SQLITE_THREADSAFE ){ 1988 sqlite3_mutex_free(pBt->mutex); 1989 } 1990 removed = 1; 1991 } 1992 sqlite3_mutex_leave(pMaster); 1993 return removed; 1994 #else 1995 return 1; 1996 #endif 1997 } 1998 1999 /* 2000 ** Make sure pBt->pTmpSpace points to an allocation of 2001 ** MX_CELL_SIZE(pBt) bytes. 2002 */ 2003 static void allocateTempSpace(BtShared *pBt){ 2004 if( !pBt->pTmpSpace ){ 2005 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); 2006 } 2007 } 2008 2009 /* 2010 ** Free the pBt->pTmpSpace allocation 2011 */ 2012 static void freeTempSpace(BtShared *pBt){ 2013 sqlite3PageFree( pBt->pTmpSpace); 2014 pBt->pTmpSpace = 0; 2015 } 2016 2017 /* 2018 ** Close an open database and invalidate all cursors. 2019 */ 2020 int sqlite3BtreeClose(Btree *p){ 2021 BtShared *pBt = p->pBt; 2022 BtCursor *pCur; 2023 2024 /* Close all cursors opened via this handle. */ 2025 assert( sqlite3_mutex_held(p->db->mutex) ); 2026 sqlite3BtreeEnter(p); 2027 pCur = pBt->pCursor; 2028 while( pCur ){ 2029 BtCursor *pTmp = pCur; 2030 pCur = pCur->pNext; 2031 if( pTmp->pBtree==p ){ 2032 sqlite3BtreeCloseCursor(pTmp); 2033 } 2034 } 2035 2036 /* Rollback any active transaction and free the handle structure. 
2037 ** The call to sqlite3BtreeRollback() drops any table-locks held by 2038 ** this handle. 2039 */ 2040 sqlite3BtreeRollback(p); 2041 sqlite3BtreeLeave(p); 2042 2043 /* If there are still other outstanding references to the shared-btree 2044 ** structure, return now. The remainder of this procedure cleans 2045 ** up the shared-btree. 2046 */ 2047 assert( p->wantToLock==0 && p->locked==0 ); 2048 if( !p->sharable || removeFromSharingList(pBt) ){ 2049 /* The pBt is no longer on the sharing list, so we can access 2050 ** it without having to hold the mutex. 2051 ** 2052 ** Clean out and delete the BtShared object. 2053 */ 2054 assert( !pBt->pCursor ); 2055 sqlite3PagerClose(pBt->pPager); 2056 if( pBt->xFreeSchema && pBt->pSchema ){ 2057 pBt->xFreeSchema(pBt->pSchema); 2058 } 2059 sqlite3DbFree(0, pBt->pSchema); 2060 freeTempSpace(pBt); 2061 sqlite3_free(pBt); 2062 } 2063 2064 #ifndef SQLITE_OMIT_SHARED_CACHE 2065 assert( p->wantToLock==0 ); 2066 assert( p->locked==0 ); 2067 if( p->pPrev ) p->pPrev->pNext = p->pNext; 2068 if( p->pNext ) p->pNext->pPrev = p->pPrev; 2069 #endif 2070 2071 sqlite3_free(p); 2072 return SQLITE_OK; 2073 } 2074 2075 /* 2076 ** Change the limit on the number of pages allowed in the cache. 2077 ** 2078 ** The maximum number of cache pages is set to the absolute 2079 ** value of mxPage. If mxPage is negative, the pager will 2080 ** operate asynchronously - it will not stop to do fsync()s 2081 ** to insure data is written to the disk surface before 2082 ** continuing. Transactions still work if synchronous is off, 2083 ** and the database cannot be corrupted if this program 2084 ** crashes. But if the operating system crashes or there is 2085 ** an abrupt power failure when synchronous is off, the database 2086 ** could be left in an inconsistent and unrecoverable state. 2087 ** Synchronous is on by default so database corruption is not 2088 ** normally a worry. 
2089 */ 2090 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 2091 BtShared *pBt = p->pBt; 2092 assert( sqlite3_mutex_held(p->db->mutex) ); 2093 sqlite3BtreeEnter(p); 2094 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 2095 sqlite3BtreeLeave(p); 2096 return SQLITE_OK; 2097 } 2098 2099 /* 2100 ** Change the way data is synced to disk in order to increase or decrease 2101 ** how well the database resists damage due to OS crashes and power 2102 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 2103 ** there is a high probability of damage) Level 2 is the default. There 2104 ** is a very low but non-zero probability of damage. Level 3 reduces the 2105 ** probability of damage to near zero but with a write performance reduction. 2106 */ 2107 #ifndef SQLITE_OMIT_PAGER_PRAGMAS 2108 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){ 2109 BtShared *pBt = p->pBt; 2110 assert( sqlite3_mutex_held(p->db->mutex) ); 2111 sqlite3BtreeEnter(p); 2112 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync); 2113 sqlite3BtreeLeave(p); 2114 return SQLITE_OK; 2115 } 2116 #endif 2117 2118 /* 2119 ** Return TRUE if the given btree is set to safety level 1. In other 2120 ** words, return TRUE if no sync() occurs on the disk files. 2121 */ 2122 int sqlite3BtreeSyncDisabled(Btree *p){ 2123 BtShared *pBt = p->pBt; 2124 int rc; 2125 assert( sqlite3_mutex_held(p->db->mutex) ); 2126 sqlite3BtreeEnter(p); 2127 assert( pBt && pBt->pPager ); 2128 rc = sqlite3PagerNosync(pBt->pPager); 2129 sqlite3BtreeLeave(p); 2130 return rc; 2131 } 2132 2133 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) 2134 /* 2135 ** Change the default pages size and the number of reserved bytes per page. 2136 ** Or, if the page size has already been fixed, return SQLITE_READONLY 2137 ** without changing anything. 2138 ** 2139 ** The page size must be a power of 2 between 512 and 65536. 
If the page 2140 ** size supplied does not meet this constraint then the page size is not 2141 ** changed. 2142 ** 2143 ** Page sizes are constrained to be a power of two so that the region 2144 ** of the database file used for locking (beginning at PENDING_BYTE, 2145 ** the first byte past the 1GB boundary, 0x40000000) needs to occur 2146 ** at the beginning of a page. 2147 ** 2148 ** If parameter nReserve is less than zero, then the number of reserved 2149 ** bytes per page is left unchanged. 2150 ** 2151 ** If the iFix!=0 then the pageSizeFixed flag is set so that the page size 2152 ** and autovacuum mode can no longer be changed. 2153 */ 2154 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ 2155 int rc = SQLITE_OK; 2156 BtShared *pBt = p->pBt; 2157 assert( nReserve>=-1 && nReserve<=255 ); 2158 sqlite3BtreeEnter(p); 2159 if( pBt->pageSizeFixed ){ 2160 sqlite3BtreeLeave(p); 2161 return SQLITE_READONLY; 2162 } 2163 if( nReserve<0 ){ 2164 nReserve = pBt->pageSize - pBt->usableSize; 2165 } 2166 assert( nReserve>=0 && nReserve<=255 ); 2167 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 2168 ((pageSize-1)&pageSize)==0 ){ 2169 assert( (pageSize & 7)==0 ); 2170 assert( !pBt->pPage1 && !pBt->pCursor ); 2171 pBt->pageSize = (u32)pageSize; 2172 freeTempSpace(pBt); 2173 } 2174 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 2175 pBt->usableSize = pBt->pageSize - (u16)nReserve; 2176 if( iFix ) pBt->pageSizeFixed = 1; 2177 sqlite3BtreeLeave(p); 2178 return rc; 2179 } 2180 2181 /* 2182 ** Return the currently defined page size 2183 */ 2184 int sqlite3BtreeGetPageSize(Btree *p){ 2185 return p->pBt->pageSize; 2186 } 2187 2188 /* 2189 ** Return the number of bytes of space at the end of every page that 2190 ** are intentually left unused. This is the "reserved" space that is 2191 ** sometimes used by extensions. 
2192 */ 2193 int sqlite3BtreeGetReserve(Btree *p){ 2194 int n; 2195 sqlite3BtreeEnter(p); 2196 n = p->pBt->pageSize - p->pBt->usableSize; 2197 sqlite3BtreeLeave(p); 2198 return n; 2199 } 2200 2201 /* 2202 ** Set the maximum page count for a database if mxPage is positive. 2203 ** No changes are made if mxPage is 0 or negative. 2204 ** Regardless of the value of mxPage, return the maximum page count. 2205 */ 2206 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){ 2207 int n; 2208 sqlite3BtreeEnter(p); 2209 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 2210 sqlite3BtreeLeave(p); 2211 return n; 2212 } 2213 2214 /* 2215 ** Set the secureDelete flag if newFlag is 0 or 1. If newFlag is -1, 2216 ** then make no changes. Always return the value of the secureDelete 2217 ** setting after the change. 2218 */ 2219 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){ 2220 int b; 2221 if( p==0 ) return 0; 2222 sqlite3BtreeEnter(p); 2223 if( newFlag>=0 ){ 2224 p->pBt->secureDelete = (newFlag!=0) ? 1 : 0; 2225 } 2226 b = p->pBt->secureDelete; 2227 sqlite3BtreeLeave(p); 2228 return b; 2229 } 2230 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */ 2231 2232 /* 2233 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 2234 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 2235 ** is disabled. The default value for the auto-vacuum property is 2236 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 
2237 */ 2238 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 2239 #ifdef SQLITE_OMIT_AUTOVACUUM 2240 return SQLITE_READONLY; 2241 #else 2242 BtShared *pBt = p->pBt; 2243 int rc = SQLITE_OK; 2244 u8 av = (u8)autoVacuum; 2245 2246 sqlite3BtreeEnter(p); 2247 if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){ 2248 rc = SQLITE_READONLY; 2249 }else{ 2250 pBt->autoVacuum = av ?1:0; 2251 pBt->incrVacuum = av==2 ?1:0; 2252 } 2253 sqlite3BtreeLeave(p); 2254 return rc; 2255 #endif 2256 } 2257 2258 /* 2259 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 2260 ** enabled 1 is returned. Otherwise 0. 2261 */ 2262 int sqlite3BtreeGetAutoVacuum(Btree *p){ 2263 #ifdef SQLITE_OMIT_AUTOVACUUM 2264 return BTREE_AUTOVACUUM_NONE; 2265 #else 2266 int rc; 2267 sqlite3BtreeEnter(p); 2268 rc = ( 2269 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 2270 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 2271 BTREE_AUTOVACUUM_INCR 2272 ); 2273 sqlite3BtreeLeave(p); 2274 return rc; 2275 #endif 2276 } 2277 2278 2279 /* 2280 ** Get a reference to pPage1 of the database file. This will 2281 ** also acquire a readlock on that file. 2282 ** 2283 ** SQLITE_OK is returned on success. If the file is not a 2284 ** well-formed database file, then SQLITE_CORRUPT is returned. 2285 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 2286 ** is returned if we run out of memory. 
2287 */ 2288 static int lockBtree(BtShared *pBt){ 2289 int rc; /* Result code from subfunctions */ 2290 MemPage *pPage1; /* Page 1 of the database file */ 2291 int nPage; /* Number of pages in the database */ 2292 int nPageFile = 0; /* Number of pages in the database file */ 2293 int nPageHeader; /* Number of pages in the database according to hdr */ 2294 2295 assert( sqlite3_mutex_held(pBt->mutex) ); 2296 assert( pBt->pPage1==0 ); 2297 rc = sqlite3PagerSharedLock(pBt->pPager); 2298 if( rc!=SQLITE_OK ) return rc; 2299 rc = btreeGetPage(pBt, 1, &pPage1, 0); 2300 if( rc!=SQLITE_OK ) return rc; 2301 2302 /* Do some checking to help insure the file we opened really is 2303 ** a valid database file. 2304 */ 2305 nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData); 2306 sqlite3PagerPagecount(pBt->pPager, &nPageFile); 2307 if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){ 2308 nPage = nPageFile; 2309 } 2310 if( nPage>0 ){ 2311 u32 pageSize; 2312 u32 usableSize; 2313 u8 *page1 = pPage1->aData; 2314 rc = SQLITE_NOTADB; 2315 if( memcmp(page1, zMagicHeader, 16)!=0 ){ 2316 goto page1_init_failed; 2317 } 2318 2319 #ifdef SQLITE_OMIT_WAL 2320 if( page1[18]>1 ){ 2321 pBt->readOnly = 1; 2322 } 2323 if( page1[19]>1 ){ 2324 goto page1_init_failed; 2325 } 2326 #else 2327 if( page1[18]>2 ){ 2328 pBt->readOnly = 1; 2329 } 2330 if( page1[19]>2 ){ 2331 goto page1_init_failed; 2332 } 2333 2334 /* If the write version is set to 2, this database should be accessed 2335 ** in WAL mode. If the log is not already open, open it now. Then 2336 ** return SQLITE_OK and return without populating BtShared.pPage1. 2337 ** The caller detects this and calls this function again. This is 2338 ** required as the version of page 1 currently in the page1 buffer 2339 ** may not be the latest version - there may be a newer one in the log 2340 ** file. 
2341 */ 2342 if( page1[19]==2 && pBt->doNotUseWAL==0 ){ 2343 int isOpen = 0; 2344 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen); 2345 if( rc!=SQLITE_OK ){ 2346 goto page1_init_failed; 2347 }else if( isOpen==0 ){ 2348 releasePage(pPage1); 2349 return SQLITE_OK; 2350 } 2351 rc = SQLITE_NOTADB; 2352 } 2353 #endif 2354 2355 /* The maximum embedded fraction must be exactly 25%. And the minimum 2356 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data. 2357 ** The original design allowed these amounts to vary, but as of 2358 ** version 3.6.0, we require them to be fixed. 2359 */ 2360 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ 2361 goto page1_init_failed; 2362 } 2363 pageSize = (page1[16]<<8) | (page1[17]<<16); 2364 if( ((pageSize-1)&pageSize)!=0 2365 || pageSize>SQLITE_MAX_PAGE_SIZE 2366 || pageSize<=256 2367 ){ 2368 goto page1_init_failed; 2369 } 2370 assert( (pageSize & 7)==0 ); 2371 usableSize = pageSize - page1[20]; 2372 if( (u32)pageSize!=pBt->pageSize ){ 2373 /* After reading the first page of the database assuming a page size 2374 ** of BtShared.pageSize, we have discovered that the page-size is 2375 ** actually pageSize. Unlock the database, leave pBt->pPage1 at 2376 ** zero and return SQLITE_OK. The caller will call this function 2377 ** again with the correct page-size. 
2378 */ 2379 releasePage(pPage1); 2380 pBt->usableSize = usableSize; 2381 pBt->pageSize = pageSize; 2382 freeTempSpace(pBt); 2383 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, 2384 pageSize-usableSize); 2385 return rc; 2386 } 2387 if( nPageHeader>nPageFile ){ 2388 rc = SQLITE_CORRUPT_BKPT; 2389 goto page1_init_failed; 2390 } 2391 if( usableSize<480 ){ 2392 goto page1_init_failed; 2393 } 2394 pBt->pageSize = pageSize; 2395 pBt->usableSize = usableSize; 2396 #ifndef SQLITE_OMIT_AUTOVACUUM 2397 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); 2398 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); 2399 #endif 2400 } 2401 2402 /* maxLocal is the maximum amount of payload to store locally for 2403 ** a cell. Make sure it is small enough so that at least minFanout 2404 ** cells can will fit on one page. We assume a 10-byte page header. 2405 ** Besides the payload, the cell must store: 2406 ** 2-byte pointer to the cell 2407 ** 4-byte child pointer 2408 ** 9-byte nKey value 2409 ** 4-byte nData value 2410 ** 4-byte overflow page pointer 2411 ** So a cell consists of a 2-byte pointer, a header which is as much as 2412 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow 2413 ** page pointer. 2414 */ 2415 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23); 2416 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23); 2417 pBt->maxLeaf = (u16)(pBt->usableSize - 35); 2418 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23); 2419 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); 2420 pBt->pPage1 = pPage1; 2421 pBt->nPage = nPage; 2422 return SQLITE_OK; 2423 2424 page1_init_failed: 2425 releasePage(pPage1); 2426 pBt->pPage1 = 0; 2427 return rc; 2428 } 2429 2430 /* 2431 ** If there are no outstanding cursors and we are not in the middle 2432 ** of a transaction but there is a read lock on the database, then 2433 ** this routine unrefs the first page of the database file which 2434 ** has the effect of releasing the read lock. 
2435 ** 2436 ** If there is a transaction in progress, this routine is a no-op. 2437 */ 2438 static void unlockBtreeIfUnused(BtShared *pBt){ 2439 assert( sqlite3_mutex_held(pBt->mutex) ); 2440 assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE ); 2441 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){ 2442 assert( pBt->pPage1->aData ); 2443 assert( sqlite3PagerRefcount(pBt->pPager)==1 ); 2444 assert( pBt->pPage1->aData ); 2445 releasePage(pBt->pPage1); 2446 pBt->pPage1 = 0; 2447 } 2448 } 2449 2450 /* 2451 ** If pBt points to an empty file then convert that empty file 2452 ** into a new empty database by initializing the first page of 2453 ** the database. 2454 */ 2455 static int newDatabase(BtShared *pBt){ 2456 MemPage *pP1; 2457 unsigned char *data; 2458 int rc; 2459 2460 assert( sqlite3_mutex_held(pBt->mutex) ); 2461 if( pBt->nPage>0 ){ 2462 return SQLITE_OK; 2463 } 2464 pP1 = pBt->pPage1; 2465 assert( pP1!=0 ); 2466 data = pP1->aData; 2467 rc = sqlite3PagerWrite(pP1->pDbPage); 2468 if( rc ) return rc; 2469 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 2470 assert( sizeof(zMagicHeader)==16 ); 2471 data[16] = (u8)((pBt->pageSize>>8)&0xff); 2472 data[17] = (u8)((pBt->pageSize>>16)&0xff); 2473 data[18] = 1; 2474 data[19] = 1; 2475 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize); 2476 data[20] = (u8)(pBt->pageSize - pBt->usableSize); 2477 data[21] = 64; 2478 data[22] = 32; 2479 data[23] = 32; 2480 memset(&data[24], 0, 100-24); 2481 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 2482 pBt->pageSizeFixed = 1; 2483 #ifndef SQLITE_OMIT_AUTOVACUUM 2484 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 2485 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 2486 put4byte(&data[36 + 4*4], pBt->autoVacuum); 2487 put4byte(&data[36 + 7*4], pBt->incrVacuum); 2488 #endif 2489 pBt->nPage = 1; 2490 data[31] = 1; 2491 return SQLITE_OK; 2492 } 2493 2494 /* 2495 ** Attempt to start a new transaction. 
** A write-transaction
** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any
** changes to the database.  None of the following routines
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
2528 */ 2529 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ 2530 sqlite3 *pBlock = 0; 2531 BtShared *pBt = p->pBt; 2532 int rc = SQLITE_OK; 2533 2534 sqlite3BtreeEnter(p); 2535 btreeIntegrity(p); 2536 2537 /* If the btree is already in a write-transaction, or it 2538 ** is already in a read-transaction and a read-transaction 2539 ** is requested, this is a no-op. 2540 */ 2541 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 2542 goto trans_begun; 2543 } 2544 2545 /* Write transactions are not possible on a read-only database */ 2546 if( pBt->readOnly && wrflag ){ 2547 rc = SQLITE_READONLY; 2548 goto trans_begun; 2549 } 2550 2551 #ifndef SQLITE_OMIT_SHARED_CACHE 2552 /* If another database handle has already opened a write transaction 2553 ** on this shared-btree structure and a second write transaction is 2554 ** requested, return SQLITE_LOCKED. 2555 */ 2556 if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){ 2557 pBlock = pBt->pWriter->db; 2558 }else if( wrflag>1 ){ 2559 BtLock *pIter; 2560 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 2561 if( pIter->pBtree!=p ){ 2562 pBlock = pIter->pBtree->db; 2563 break; 2564 } 2565 } 2566 } 2567 if( pBlock ){ 2568 sqlite3ConnectionBlocked(p->db, pBlock); 2569 rc = SQLITE_LOCKED_SHAREDCACHE; 2570 goto trans_begun; 2571 } 2572 #endif 2573 2574 /* Any read-only or read-write transaction implies a read-lock on 2575 ** page 1. So if some other shared-cache client already has a write-lock 2576 ** on page 1, the transaction cannot be opened. */ 2577 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); 2578 if( SQLITE_OK!=rc ) goto trans_begun; 2579 2580 pBt->initiallyEmpty = (u8)(pBt->nPage==0); 2581 do { 2582 /* Call lockBtree() until either pBt->pPage1 is populated or 2583 ** lockBtree() returns something other than SQLITE_OK. 
lockBtree() 2584 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after 2585 ** reading page 1 it discovers that the page-size of the database 2586 ** file is not pBt->pageSize. In this case lockBtree() will update 2587 ** pBt->pageSize to the page-size of the file on disk. 2588 */ 2589 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) ); 2590 2591 if( rc==SQLITE_OK && wrflag ){ 2592 if( pBt->readOnly ){ 2593 rc = SQLITE_READONLY; 2594 }else{ 2595 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db)); 2596 if( rc==SQLITE_OK ){ 2597 rc = newDatabase(pBt); 2598 } 2599 } 2600 } 2601 2602 if( rc!=SQLITE_OK ){ 2603 unlockBtreeIfUnused(pBt); 2604 } 2605 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 2606 btreeInvokeBusyHandler(pBt) ); 2607 2608 if( rc==SQLITE_OK ){ 2609 if( p->inTrans==TRANS_NONE ){ 2610 pBt->nTransaction++; 2611 #ifndef SQLITE_OMIT_SHARED_CACHE 2612 if( p->sharable ){ 2613 assert( p->lock.pBtree==p && p->lock.iTable==1 ); 2614 p->lock.eLock = READ_LOCK; 2615 p->lock.pNext = pBt->pLock; 2616 pBt->pLock = &p->lock; 2617 } 2618 #endif 2619 } 2620 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 2621 if( p->inTrans>pBt->inTransaction ){ 2622 pBt->inTransaction = p->inTrans; 2623 } 2624 if( wrflag ){ 2625 MemPage *pPage1 = pBt->pPage1; 2626 #ifndef SQLITE_OMIT_SHARED_CACHE 2627 assert( !pBt->pWriter ); 2628 pBt->pWriter = p; 2629 pBt->isExclusive = (u8)(wrflag>1); 2630 #endif 2631 2632 /* If the db-size header field is incorrect (as it may be if an old 2633 ** client has been writing the database file), update it now. Doing 2634 ** this sooner rather than later means the database size can safely 2635 ** re-read the database size from page 1 if a savepoint or transaction 2636 ** rollback occurs within the transaction. 
2637 */ 2638 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){ 2639 rc = sqlite3PagerWrite(pPage1->pDbPage); 2640 if( rc==SQLITE_OK ){ 2641 put4byte(&pPage1->aData[28], pBt->nPage); 2642 } 2643 } 2644 } 2645 } 2646 2647 2648 trans_begun: 2649 if( rc==SQLITE_OK && wrflag ){ 2650 /* This call makes sure that the pager has the correct number of 2651 ** open savepoints. If the second parameter is greater than 0 and 2652 ** the sub-journal is not already open, then it will be opened here. 2653 */ 2654 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); 2655 } 2656 2657 btreeIntegrity(p); 2658 sqlite3BtreeLeave(p); 2659 return rc; 2660 } 2661 2662 #ifndef SQLITE_OMIT_AUTOVACUUM 2663 2664 /* 2665 ** Set the pointer-map entries for all children of page pPage. Also, if 2666 ** pPage contains cells that point to overflow pages, set the pointer 2667 ** map entries for the overflow pages as well. 2668 */ 2669 static int setChildPtrmaps(MemPage *pPage){ 2670 int i; /* Counter variable */ 2671 int nCell; /* Number of cells in page pPage */ 2672 int rc; /* Return code */ 2673 BtShared *pBt = pPage->pBt; 2674 u8 isInitOrig = pPage->isInit; 2675 Pgno pgno = pPage->pgno; 2676 2677 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 2678 rc = btreeInitPage(pPage); 2679 if( rc!=SQLITE_OK ){ 2680 goto set_child_ptrmaps_out; 2681 } 2682 nCell = pPage->nCell; 2683 2684 for(i=0; i<nCell; i++){ 2685 u8 *pCell = findCell(pPage, i); 2686 2687 ptrmapPutOvflPtr(pPage, pCell, &rc); 2688 2689 if( !pPage->leaf ){ 2690 Pgno childPgno = get4byte(pCell); 2691 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 2692 } 2693 } 2694 2695 if( !pPage->leaf ){ 2696 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 2697 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 2698 } 2699 2700 set_child_ptrmaps_out: 2701 pPage->isInit = isInitOrig; 2702 return rc; 2703 } 2704 2705 /* 2706 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so 2707 ** that it points to iTo. 
Parameter eType describes the type of pointer to 2708 ** be modified, as follows: 2709 ** 2710 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 2711 ** page of pPage. 2712 ** 2713 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 2714 ** page pointed to by one of the cells on pPage. 2715 ** 2716 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 2717 ** overflow page in the list. 2718 */ 2719 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 2720 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 2721 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 2722 if( eType==PTRMAP_OVERFLOW2 ){ 2723 /* The pointer is always the first 4 bytes of the page in this case. */ 2724 if( get4byte(pPage->aData)!=iFrom ){ 2725 return SQLITE_CORRUPT_BKPT; 2726 } 2727 put4byte(pPage->aData, iTo); 2728 }else{ 2729 u8 isInitOrig = pPage->isInit; 2730 int i; 2731 int nCell; 2732 2733 btreeInitPage(pPage); 2734 nCell = pPage->nCell; 2735 2736 for(i=0; i<nCell; i++){ 2737 u8 *pCell = findCell(pPage, i); 2738 if( eType==PTRMAP_OVERFLOW1 ){ 2739 CellInfo info; 2740 btreeParseCellPtr(pPage, pCell, &info); 2741 if( info.iOverflow ){ 2742 if( iFrom==get4byte(&pCell[info.iOverflow]) ){ 2743 put4byte(&pCell[info.iOverflow], iTo); 2744 break; 2745 } 2746 } 2747 }else{ 2748 if( get4byte(pCell)==iFrom ){ 2749 put4byte(pCell, iTo); 2750 break; 2751 } 2752 } 2753 } 2754 2755 if( i==nCell ){ 2756 if( eType!=PTRMAP_BTREE || 2757 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 2758 return SQLITE_CORRUPT_BKPT; 2759 } 2760 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 2761 } 2762 2763 pPage->isInit = isInitOrig; 2764 } 2765 return SQLITE_OK; 2766 } 2767 2768 2769 /* 2770 ** Move the open database page pDbPage to location iFreePage in the 2771 ** database. The pDbPage reference remains valid. 
2772 ** 2773 ** The isCommit flag indicates that there is no need to remember that 2774 ** the journal needs to be sync()ed before database page pDbPage->pgno 2775 ** can be written to. The caller has already promised not to write to that 2776 ** page. 2777 */ 2778 static int relocatePage( 2779 BtShared *pBt, /* Btree */ 2780 MemPage *pDbPage, /* Open page to move */ 2781 u8 eType, /* Pointer map 'type' entry for pDbPage */ 2782 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 2783 Pgno iFreePage, /* The location to move pDbPage to */ 2784 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */ 2785 ){ 2786 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 2787 Pgno iDbPage = pDbPage->pgno; 2788 Pager *pPager = pBt->pPager; 2789 int rc; 2790 2791 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 2792 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 2793 assert( sqlite3_mutex_held(pBt->mutex) ); 2794 assert( pDbPage->pBt==pBt ); 2795 2796 /* Move page iDbPage from its current location to page number iFreePage */ 2797 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 2798 iDbPage, iFreePage, iPtrPage, eType)); 2799 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); 2800 if( rc!=SQLITE_OK ){ 2801 return rc; 2802 } 2803 pDbPage->pgno = iFreePage; 2804 2805 /* If pDbPage was a btree-page, then it may have child pages and/or cells 2806 ** that point to overflow pages. The pointer map entries for all these 2807 ** pages need to be changed. 2808 ** 2809 ** If pDbPage is an overflow page, then the first 4 bytes may store a 2810 ** pointer to a subsequent overflow page. If this is the case, then 2811 ** the pointer map needs to be updated for the subsequent overflow page. 
2812 */ 2813 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 2814 rc = setChildPtrmaps(pDbPage); 2815 if( rc!=SQLITE_OK ){ 2816 return rc; 2817 } 2818 }else{ 2819 Pgno nextOvfl = get4byte(pDbPage->aData); 2820 if( nextOvfl!=0 ){ 2821 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc); 2822 if( rc!=SQLITE_OK ){ 2823 return rc; 2824 } 2825 } 2826 } 2827 2828 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 2829 ** that it points at iFreePage. Also fix the pointer map entry for 2830 ** iPtrPage. 2831 */ 2832 if( eType!=PTRMAP_ROOTPAGE ){ 2833 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 2834 if( rc!=SQLITE_OK ){ 2835 return rc; 2836 } 2837 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 2838 if( rc!=SQLITE_OK ){ 2839 releasePage(pPtrPage); 2840 return rc; 2841 } 2842 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 2843 releasePage(pPtrPage); 2844 if( rc==SQLITE_OK ){ 2845 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc); 2846 } 2847 } 2848 return rc; 2849 } 2850 2851 /* Forward declaration required by incrVacuumStep(). */ 2852 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 2853 2854 /* 2855 ** Perform a single step of an incremental-vacuum. If successful, 2856 ** return SQLITE_OK. If there is no work to do (and therefore no 2857 ** point in calling this function again), return SQLITE_DONE. 2858 ** 2859 ** More specificly, this function attempts to re-organize the 2860 ** database so that the last page of the file currently in use 2861 ** is no longer in use. 2862 ** 2863 ** If the nFin parameter is non-zero, this function assumes 2864 ** that the caller will keep calling incrVacuumStep() until 2865 ** it returns SQLITE_DONE or an error, and that nFin is the 2866 ** number of pages the database file will contain after this 2867 ** process is complete. If nFin is zero, it is assumed that 2868 ** incrVacuumStep() will be called a finite amount of times 2869 ** which may or may not empty the freelist. 
A full autovacuum 2870 ** has nFin>0. A "PRAGMA incremental_vacuum" has nFin==0. 2871 */ 2872 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){ 2873 Pgno nFreeList; /* Number of pages still on the free-list */ 2874 int rc; 2875 2876 assert( sqlite3_mutex_held(pBt->mutex) ); 2877 assert( iLastPg>nFin ); 2878 2879 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ 2880 u8 eType; 2881 Pgno iPtrPage; 2882 2883 nFreeList = get4byte(&pBt->pPage1->aData[36]); 2884 if( nFreeList==0 ){ 2885 return SQLITE_DONE; 2886 } 2887 2888 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); 2889 if( rc!=SQLITE_OK ){ 2890 return rc; 2891 } 2892 if( eType==PTRMAP_ROOTPAGE ){ 2893 return SQLITE_CORRUPT_BKPT; 2894 } 2895 2896 if( eType==PTRMAP_FREEPAGE ){ 2897 if( nFin==0 ){ 2898 /* Remove the page from the files free-list. This is not required 2899 ** if nFin is non-zero. In that case, the free-list will be 2900 ** truncated to zero after this function returns, so it doesn't 2901 ** matter if it still contains some garbage entries. 2902 */ 2903 Pgno iFreePg; 2904 MemPage *pFreePg; 2905 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1); 2906 if( rc!=SQLITE_OK ){ 2907 return rc; 2908 } 2909 assert( iFreePg==iLastPg ); 2910 releasePage(pFreePg); 2911 } 2912 } else { 2913 Pgno iFreePg; /* Index of free page to move pLastPg to */ 2914 MemPage *pLastPg; 2915 2916 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0); 2917 if( rc!=SQLITE_OK ){ 2918 return rc; 2919 } 2920 2921 /* If nFin is zero, this loop runs exactly once and page pLastPg 2922 ** is swapped with the first free page pulled off the free list. 2923 ** 2924 ** On the other hand, if nFin is greater than zero, then keep 2925 ** looping until a free-page located within the first nFin pages 2926 ** of the file is found. 
2927 */ 2928 do { 2929 MemPage *pFreePg; 2930 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0); 2931 if( rc!=SQLITE_OK ){ 2932 releasePage(pLastPg); 2933 return rc; 2934 } 2935 releasePage(pFreePg); 2936 }while( nFin!=0 && iFreePg>nFin ); 2937 assert( iFreePg<iLastPg ); 2938 2939 rc = sqlite3PagerWrite(pLastPg->pDbPage); 2940 if( rc==SQLITE_OK ){ 2941 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0); 2942 } 2943 releasePage(pLastPg); 2944 if( rc!=SQLITE_OK ){ 2945 return rc; 2946 } 2947 } 2948 } 2949 2950 if( nFin==0 ){ 2951 iLastPg--; 2952 while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){ 2953 if( PTRMAP_ISPAGE(pBt, iLastPg) ){ 2954 MemPage *pPg; 2955 rc = btreeGetPage(pBt, iLastPg, &pPg, 0); 2956 if( rc!=SQLITE_OK ){ 2957 return rc; 2958 } 2959 rc = sqlite3PagerWrite(pPg->pDbPage); 2960 releasePage(pPg); 2961 if( rc!=SQLITE_OK ){ 2962 return rc; 2963 } 2964 } 2965 iLastPg--; 2966 } 2967 sqlite3PagerTruncateImage(pBt->pPager, iLastPg); 2968 pBt->nPage = iLastPg; 2969 } 2970 return SQLITE_OK; 2971 } 2972 2973 /* 2974 ** A write-transaction must be opened before calling this function. 2975 ** It performs a single unit of work towards an incremental vacuum. 2976 ** 2977 ** If the incremental vacuum is finished after this function has run, 2978 ** SQLITE_DONE is returned. If it is not finished, but no error occurred, 2979 ** SQLITE_OK is returned. Otherwise an SQLite error code. 
2980 */ 2981 int sqlite3BtreeIncrVacuum(Btree *p){ 2982 int rc; 2983 BtShared *pBt = p->pBt; 2984 2985 sqlite3BtreeEnter(p); 2986 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); 2987 if( !pBt->autoVacuum ){ 2988 rc = SQLITE_DONE; 2989 }else{ 2990 invalidateAllOverflowCache(pBt); 2991 rc = incrVacuumStep(pBt, 0, btreePagecount(pBt)); 2992 if( rc==SQLITE_OK ){ 2993 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 2994 put4byte(&pBt->pPage1->aData[28], pBt->nPage); 2995 } 2996 } 2997 sqlite3BtreeLeave(p); 2998 return rc; 2999 } 3000 3001 /* 3002 ** This routine is called prior to sqlite3PagerCommit when a transaction 3003 ** is commited for an auto-vacuum database. 3004 ** 3005 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages 3006 ** the database file should be truncated to during the commit process. 3007 ** i.e. the database has been reorganized so that only the first *pnTrunc 3008 ** pages are in use. 3009 */ 3010 static int autoVacuumCommit(BtShared *pBt){ 3011 int rc = SQLITE_OK; 3012 Pager *pPager = pBt->pPager; 3013 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) ); 3014 3015 assert( sqlite3_mutex_held(pBt->mutex) ); 3016 invalidateAllOverflowCache(pBt); 3017 assert(pBt->autoVacuum); 3018 if( !pBt->incrVacuum ){ 3019 Pgno nFin; /* Number of pages in database after autovacuuming */ 3020 Pgno nFree; /* Number of pages on the freelist initially */ 3021 Pgno nPtrmap; /* Number of PtrMap pages to be freed */ 3022 Pgno iFree; /* The next page to be freed */ 3023 int nEntry; /* Number of entries on one ptrmap page */ 3024 Pgno nOrig; /* Database size before freeing */ 3025 3026 nOrig = btreePagecount(pBt); 3027 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){ 3028 /* It is not possible to create a database for which the final page 3029 ** is either a pointer-map page or the pending-byte page. If one 3030 ** is encountered, this indicates corruption. 
3031 */ 3032 return SQLITE_CORRUPT_BKPT; 3033 } 3034 3035 nFree = get4byte(&pBt->pPage1->aData[36]); 3036 nEntry = pBt->usableSize/5; 3037 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry; 3038 nFin = nOrig - nFree - nPtrmap; 3039 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){ 3040 nFin--; 3041 } 3042 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 3043 nFin--; 3044 } 3045 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT; 3046 3047 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ 3048 rc = incrVacuumStep(pBt, nFin, iFree); 3049 } 3050 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ 3051 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 3052 put4byte(&pBt->pPage1->aData[32], 0); 3053 put4byte(&pBt->pPage1->aData[36], 0); 3054 put4byte(&pBt->pPage1->aData[28], nFin); 3055 sqlite3PagerTruncateImage(pBt->pPager, nFin); 3056 pBt->nPage = nFin; 3057 } 3058 if( rc!=SQLITE_OK ){ 3059 sqlite3PagerRollback(pPager); 3060 } 3061 } 3062 3063 assert( nRef==sqlite3PagerRefcount(pPager) ); 3064 return rc; 3065 } 3066 3067 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */ 3068 # define setChildPtrmaps(x) SQLITE_OK 3069 #endif 3070 3071 /* 3072 ** This routine does the first phase of a two-phase commit. This routine 3073 ** causes a rollback journal to be created (if it does not already exist) 3074 ** and populated with enough information so that if a power loss occurs 3075 ** the database can be restored to its original state by playing back 3076 ** the journal. Then the contents of the journal are flushed out to 3077 ** the disk. After the journal is safely on oxide, the changes to the 3078 ** database are written into the database file and flushed to oxide. 3079 ** At the end of this call, the rollback journal still exists on the 3080 ** disk and we are still holding all locks, so the transaction has not 3081 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the 3082 ** commit process. 
3083 ** 3084 ** This call is a no-op if no write-transaction is currently active on pBt. 3085 ** 3086 ** Otherwise, sync the database file for the btree pBt. zMaster points to 3087 ** the name of a master journal file that should be written into the 3088 ** individual journal file, or is NULL, indicating no master journal file 3089 ** (single database transaction). 3090 ** 3091 ** When this is called, the master journal should already have been 3092 ** created, populated with this journal pointer and synced to disk. 3093 ** 3094 ** Once this is routine has returned, the only thing required to commit 3095 ** the write-transaction for this database file is to delete the journal. 3096 */ 3097 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){ 3098 int rc = SQLITE_OK; 3099 if( p->inTrans==TRANS_WRITE ){ 3100 BtShared *pBt = p->pBt; 3101 sqlite3BtreeEnter(p); 3102 #ifndef SQLITE_OMIT_AUTOVACUUM 3103 if( pBt->autoVacuum ){ 3104 rc = autoVacuumCommit(pBt); 3105 if( rc!=SQLITE_OK ){ 3106 sqlite3BtreeLeave(p); 3107 return rc; 3108 } 3109 } 3110 #endif 3111 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0); 3112 sqlite3BtreeLeave(p); 3113 } 3114 return rc; 3115 } 3116 3117 /* 3118 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback() 3119 ** at the conclusion of a transaction. 3120 */ 3121 static void btreeEndTransaction(Btree *p){ 3122 BtShared *pBt = p->pBt; 3123 assert( sqlite3BtreeHoldsMutex(p) ); 3124 3125 btreeClearHasContent(pBt); 3126 if( p->inTrans>TRANS_NONE && p->db->activeVdbeCnt>1 ){ 3127 /* If there are other active statements that belong to this database 3128 ** handle, downgrade to a read-only transaction. The other statements 3129 ** may still be reading from the database. */ 3130 downgradeAllSharedCacheTableLocks(p); 3131 p->inTrans = TRANS_READ; 3132 }else{ 3133 /* If the handle had any kind of transaction open, decrement the 3134 ** transaction count of the shared btree. 
If the transaction count 3135 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused() 3136 ** call below will unlock the pager. */ 3137 if( p->inTrans!=TRANS_NONE ){ 3138 clearAllSharedCacheTableLocks(p); 3139 pBt->nTransaction--; 3140 if( 0==pBt->nTransaction ){ 3141 pBt->inTransaction = TRANS_NONE; 3142 } 3143 } 3144 3145 /* Set the current transaction state to TRANS_NONE and unlock the 3146 ** pager if this call closed the only read or write transaction. */ 3147 p->inTrans = TRANS_NONE; 3148 unlockBtreeIfUnused(pBt); 3149 } 3150 3151 btreeIntegrity(p); 3152 } 3153 3154 /* 3155 ** Commit the transaction currently in progress. 3156 ** 3157 ** This routine implements the second phase of a 2-phase commit. The 3158 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should 3159 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne() 3160 ** routine did all the work of writing information out to disk and flushing the 3161 ** contents so that they are written onto the disk platter. All this 3162 ** routine has to do is delete or truncate or zero the header in the 3163 ** the rollback journal (which causes the transaction to commit) and 3164 ** drop locks. 3165 ** 3166 ** This will release the write lock on the database file. If there 3167 ** are no active cursors, it also releases the read lock. 3168 */ 3169 int sqlite3BtreeCommitPhaseTwo(Btree *p){ 3170 BtShared *pBt = p->pBt; 3171 3172 sqlite3BtreeEnter(p); 3173 btreeIntegrity(p); 3174 3175 /* If the handle has a write-transaction open, commit the shared-btrees 3176 ** transaction and set the shared state to TRANS_READ. 
3177 */ 3178 if( p->inTrans==TRANS_WRITE ){ 3179 int rc; 3180 assert( pBt->inTransaction==TRANS_WRITE ); 3181 assert( pBt->nTransaction>0 ); 3182 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 3183 if( rc!=SQLITE_OK ){ 3184 sqlite3BtreeLeave(p); 3185 return rc; 3186 } 3187 pBt->inTransaction = TRANS_READ; 3188 } 3189 3190 btreeEndTransaction(p); 3191 sqlite3BtreeLeave(p); 3192 return SQLITE_OK; 3193 } 3194 3195 /* 3196 ** Do both phases of a commit. 3197 */ 3198 int sqlite3BtreeCommit(Btree *p){ 3199 int rc; 3200 sqlite3BtreeEnter(p); 3201 rc = sqlite3BtreeCommitPhaseOne(p, 0); 3202 if( rc==SQLITE_OK ){ 3203 rc = sqlite3BtreeCommitPhaseTwo(p); 3204 } 3205 sqlite3BtreeLeave(p); 3206 return rc; 3207 } 3208 3209 #ifndef NDEBUG 3210 /* 3211 ** Return the number of write-cursors open on this handle. This is for use 3212 ** in assert() expressions, so it is only compiled if NDEBUG is not 3213 ** defined. 3214 ** 3215 ** For the purposes of this routine, a write-cursor is any cursor that 3216 ** is capable of writing to the databse. That means the cursor was 3217 ** originally opened for writing and the cursor has not be disabled 3218 ** by having its state changed to CURSOR_FAULT. 3219 */ 3220 static int countWriteCursors(BtShared *pBt){ 3221 BtCursor *pCur; 3222 int r = 0; 3223 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 3224 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; 3225 } 3226 return r; 3227 } 3228 #endif 3229 3230 /* 3231 ** This routine sets the state to CURSOR_FAULT and the error 3232 ** code to errCode for every cursor on BtShared that pBtree 3233 ** references. 3234 ** 3235 ** Every cursor is tripped, including cursors that belong 3236 ** to other database connections that happen to be sharing 3237 ** the cache with pBtree. 3238 ** 3239 ** This routine gets called when a rollback occurs. 3240 ** All cursors using the same cache must be tripped 3241 ** to prevent them from trying to use the btree after 3242 ** the rollback. 
The rollback may have deleted tables 3243 ** or moved root pages, so it is not sufficient to 3244 ** save the state of the cursor. The cursor must be 3245 ** invalidated. 3246 */ 3247 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){ 3248 BtCursor *p; 3249 sqlite3BtreeEnter(pBtree); 3250 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 3251 int i; 3252 sqlite3BtreeClearCursor(p); 3253 p->eState = CURSOR_FAULT; 3254 p->skipNext = errCode; 3255 for(i=0; i<=p->iPage; i++){ 3256 releasePage(p->apPage[i]); 3257 p->apPage[i] = 0; 3258 } 3259 } 3260 sqlite3BtreeLeave(pBtree); 3261 } 3262 3263 /* 3264 ** Rollback the transaction in progress. All cursors will be 3265 ** invalided by this operation. Any attempt to use a cursor 3266 ** that was open at the beginning of this operation will result 3267 ** in an error. 3268 ** 3269 ** This will release the write lock on the database file. If there 3270 ** are no active cursors, it also releases the read lock. 3271 */ 3272 int sqlite3BtreeRollback(Btree *p){ 3273 int rc; 3274 BtShared *pBt = p->pBt; 3275 MemPage *pPage1; 3276 3277 sqlite3BtreeEnter(p); 3278 rc = saveAllCursors(pBt, 0, 0); 3279 #ifndef SQLITE_OMIT_SHARED_CACHE 3280 if( rc!=SQLITE_OK ){ 3281 /* This is a horrible situation. An IO or malloc() error occurred whilst 3282 ** trying to save cursor positions. If this is an automatic rollback (as 3283 ** the result of a constraint, malloc() failure or IO error) then 3284 ** the cache may be internally inconsistent (not contain valid trees) so 3285 ** we cannot simply return the error to the caller. Instead, abort 3286 ** all queries that may be using any of the cursors that failed to save. 
3287 */ 3288 sqlite3BtreeTripAllCursors(p, rc); 3289 } 3290 #endif 3291 btreeIntegrity(p); 3292 3293 if( p->inTrans==TRANS_WRITE ){ 3294 int rc2; 3295 3296 assert( TRANS_WRITE==pBt->inTransaction ); 3297 rc2 = sqlite3PagerRollback(pBt->pPager); 3298 if( rc2!=SQLITE_OK ){ 3299 rc = rc2; 3300 } 3301 3302 /* The rollback may have destroyed the pPage1->aData value. So 3303 ** call btreeGetPage() on page 1 again to make 3304 ** sure pPage1->aData is set correctly. */ 3305 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 3306 int nPage = get4byte(28+(u8*)pPage1->aData); 3307 testcase( nPage==0 ); 3308 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage); 3309 testcase( pBt->nPage!=nPage ); 3310 pBt->nPage = nPage; 3311 releasePage(pPage1); 3312 } 3313 assert( countWriteCursors(pBt)==0 ); 3314 pBt->inTransaction = TRANS_READ; 3315 } 3316 3317 btreeEndTransaction(p); 3318 sqlite3BtreeLeave(p); 3319 return rc; 3320 } 3321 3322 /* 3323 ** Start a statement subtransaction. The subtransaction can can be rolled 3324 ** back independently of the main transaction. You must start a transaction 3325 ** before starting a subtransaction. The subtransaction is ended automatically 3326 ** if the main transaction commits or rolls back. 3327 ** 3328 ** Statement subtransactions are used around individual SQL statements 3329 ** that are contained within a BEGIN...COMMIT block. If a constraint 3330 ** error occurs within the statement, the effect of that one statement 3331 ** can be rolled back without having to rollback the entire transaction. 3332 ** 3333 ** A statement sub-transaction is implemented as an anonymous savepoint. The 3334 ** value passed as the second parameter is the total number of savepoints, 3335 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there 3336 ** are no active savepoints and no other statement-transactions open, 3337 ** iStatement is 1. 
This anonymous savepoint can be released or rolled back 3338 ** using the sqlite3BtreeSavepoint() function. 3339 */ 3340 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ 3341 int rc; 3342 BtShared *pBt = p->pBt; 3343 sqlite3BtreeEnter(p); 3344 assert( p->inTrans==TRANS_WRITE ); 3345 assert( pBt->readOnly==0 ); 3346 assert( iStatement>0 ); 3347 assert( iStatement>p->db->nSavepoint ); 3348 assert( pBt->inTransaction==TRANS_WRITE ); 3349 /* At the pager level, a statement transaction is a savepoint with 3350 ** an index greater than all savepoints created explicitly using 3351 ** SQL statements. It is illegal to open, release or rollback any 3352 ** such savepoints while the statement transaction savepoint is active. 3353 */ 3354 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement); 3355 sqlite3BtreeLeave(p); 3356 return rc; 3357 } 3358 3359 /* 3360 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK 3361 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the 3362 ** savepoint identified by parameter iSavepoint, depending on the value 3363 ** of op. 3364 ** 3365 ** Normally, iSavepoint is greater than or equal to zero. However, if op is 3366 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 3367 ** contents of the entire transaction are rolled back. This is different 3368 ** from a normal transaction rollback, as no locks are released and the 3369 ** transaction remains open. 
3370 */ 3371 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ 3372 int rc = SQLITE_OK; 3373 if( p && p->inTrans==TRANS_WRITE ){ 3374 BtShared *pBt = p->pBt; 3375 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); 3376 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) ); 3377 sqlite3BtreeEnter(p); 3378 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint); 3379 if( rc==SQLITE_OK ){ 3380 if( iSavepoint<0 && pBt->initiallyEmpty ) pBt->nPage = 0; 3381 rc = newDatabase(pBt); 3382 pBt->nPage = get4byte(28 + pBt->pPage1->aData); 3383 3384 /* The database size was written into the offset 28 of the header 3385 ** when the transaction started, so we know that the value at offset 3386 ** 28 is nonzero. */ 3387 assert( pBt->nPage>0 ); 3388 } 3389 sqlite3BtreeLeave(p); 3390 } 3391 return rc; 3392 } 3393 3394 /* 3395 ** Create a new cursor for the BTree whose root is on the page 3396 ** iTable. If a read-only cursor is requested, it is assumed that 3397 ** the caller already has at least a read-only transaction open 3398 ** on the database already. If a write-cursor is requested, then 3399 ** the caller is assumed to have an open write transaction. 3400 ** 3401 ** If wrFlag==0, then the cursor can only be used for reading. 3402 ** If wrFlag==1, then the cursor can be used for reading or for 3403 ** writing if other conditions for writing are also met. These 3404 ** are the conditions that must be met in order for writing to 3405 ** be allowed: 3406 ** 3407 ** 1: The cursor must have been opened with wrFlag==1 3408 ** 3409 ** 2: Other database connections that share the same pager cache 3410 ** but which are not in the READ_UNCOMMITTED state may not have 3411 ** cursors open with wrFlag==0 on the same table. Otherwise 3412 ** the changes made by this write cursor would be visible to 3413 ** the read cursors in the other database connection. 
3414 ** 3415 ** 3: The database must be writable (not on read-only media) 3416 ** 3417 ** 4: There must be an active transaction. 3418 ** 3419 ** No checking is done to make sure that page iTable really is the 3420 ** root page of a b-tree. If it is not, then the cursor acquired 3421 ** will not work correctly. 3422 ** 3423 ** It is assumed that the sqlite3BtreeCursorZero() has been called 3424 ** on pCur to initialize the memory space prior to invoking this routine. 3425 */ 3426 static int btreeCursor( 3427 Btree *p, /* The btree */ 3428 int iTable, /* Root page of table to open */ 3429 int wrFlag, /* 1 to write. 0 read-only */ 3430 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 3431 BtCursor *pCur /* Space for new cursor */ 3432 ){ 3433 BtShared *pBt = p->pBt; /* Shared b-tree handle */ 3434 3435 assert( sqlite3BtreeHoldsMutex(p) ); 3436 assert( wrFlag==0 || wrFlag==1 ); 3437 3438 /* The following assert statements verify that if this is a sharable 3439 ** b-tree database, the connection is holding the required table locks, 3440 ** and that no other connection has any open cursor that conflicts with 3441 ** this lock. */ 3442 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) ); 3443 assert( wrFlag==0 || !hasReadConflicts(p, iTable) ); 3444 3445 /* Assert that the caller has opened the required transaction. */ 3446 assert( p->inTrans>TRANS_NONE ); 3447 assert( wrFlag==0 || p->inTrans==TRANS_WRITE ); 3448 assert( pBt->pPage1 && pBt->pPage1->aData ); 3449 3450 if( NEVER(wrFlag && pBt->readOnly) ){ 3451 return SQLITE_READONLY; 3452 } 3453 if( iTable==1 && btreePagecount(pBt)==0 ){ 3454 return SQLITE_EMPTY; 3455 } 3456 3457 /* Now that no other errors can occur, finish filling in the BtCursor 3458 ** variables and link the cursor into the BtShared list. 
*/ 3459 pCur->pgnoRoot = (Pgno)iTable; 3460 pCur->iPage = -1; 3461 pCur->pKeyInfo = pKeyInfo; 3462 pCur->pBtree = p; 3463 pCur->pBt = pBt; 3464 pCur->wrFlag = (u8)wrFlag; 3465 pCur->pNext = pBt->pCursor; 3466 if( pCur->pNext ){ 3467 pCur->pNext->pPrev = pCur; 3468 } 3469 pBt->pCursor = pCur; 3470 pCur->eState = CURSOR_INVALID; 3471 pCur->cachedRowid = 0; 3472 return SQLITE_OK; 3473 } 3474 int sqlite3BtreeCursor( 3475 Btree *p, /* The btree */ 3476 int iTable, /* Root page of table to open */ 3477 int wrFlag, /* 1 to write. 0 read-only */ 3478 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */ 3479 BtCursor *pCur /* Write new cursor here */ 3480 ){ 3481 int rc; 3482 sqlite3BtreeEnter(p); 3483 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 3484 sqlite3BtreeLeave(p); 3485 return rc; 3486 } 3487 3488 /* 3489 ** Return the size of a BtCursor object in bytes. 3490 ** 3491 ** This interfaces is needed so that users of cursors can preallocate 3492 ** sufficient storage to hold a cursor. The BtCursor object is opaque 3493 ** to users so they cannot do the sizeof() themselves - they must call 3494 ** this routine. 3495 */ 3496 int sqlite3BtreeCursorSize(void){ 3497 return ROUND8(sizeof(BtCursor)); 3498 } 3499 3500 /* 3501 ** Initialize memory that will be converted into a BtCursor object. 3502 ** 3503 ** The simple approach here would be to memset() the entire object 3504 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays 3505 ** do not need to be zeroed and they are large, so we can save a lot 3506 ** of run-time by skipping the initialization of those elements. 3507 */ 3508 void sqlite3BtreeCursorZero(BtCursor *p){ 3509 memset(p, 0, offsetof(BtCursor, iPage)); 3510 } 3511 3512 /* 3513 ** Set the cached rowid value of every cursor in the same database file 3514 ** as pCur and having the same root page number as pCur. The value is 3515 ** set to iRowid. 3516 ** 3517 ** Only positive rowid values are considered valid for this cache. 
3518 ** The cache is initialized to zero, indicating an invalid cache. 3519 ** A btree will work fine with zero or negative rowids. We just cannot 3520 ** cache zero or negative rowids, which means tables that use zero or 3521 ** negative rowids might run a little slower. But in practice, zero 3522 ** or negative rowids are very uncommon so this should not be a problem. 3523 */ 3524 void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){ 3525 BtCursor *p; 3526 for(p=pCur->pBt->pCursor; p; p=p->pNext){ 3527 if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid; 3528 } 3529 assert( pCur->cachedRowid==iRowid ); 3530 } 3531 3532 /* 3533 ** Return the cached rowid for the given cursor. A negative or zero 3534 ** return value indicates that the rowid cache is invalid and should be 3535 ** ignored. If the rowid cache has never before been set, then a 3536 ** zero is returned. 3537 */ 3538 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){ 3539 return pCur->cachedRowid; 3540 } 3541 3542 /* 3543 ** Close a cursor. The read lock on the database file is released 3544 ** when the last cursor is closed. 3545 */ 3546 int sqlite3BtreeCloseCursor(BtCursor *pCur){ 3547 Btree *pBtree = pCur->pBtree; 3548 if( pBtree ){ 3549 int i; 3550 BtShared *pBt = pCur->pBt; 3551 sqlite3BtreeEnter(pBtree); 3552 sqlite3BtreeClearCursor(pCur); 3553 if( pCur->pPrev ){ 3554 pCur->pPrev->pNext = pCur->pNext; 3555 }else{ 3556 pBt->pCursor = pCur->pNext; 3557 } 3558 if( pCur->pNext ){ 3559 pCur->pNext->pPrev = pCur->pPrev; 3560 } 3561 for(i=0; i<=pCur->iPage; i++){ 3562 releasePage(pCur->apPage[i]); 3563 } 3564 unlockBtreeIfUnused(pBt); 3565 invalidateOverflowCache(pCur); 3566 /* sqlite3_free(pCur); */ 3567 sqlite3BtreeLeave(pBtree); 3568 } 3569 return SQLITE_OK; 3570 } 3571 3572 /* 3573 ** Make sure the BtCursor* given in the argument has a valid 3574 ** BtCursor.info structure. If it is not already valid, call 3575 ** btreeParseCell() to fill it in. 
3576 ** 3577 ** BtCursor.info is a cache of the information in the current cell. 3578 ** Using this cache reduces the number of calls to btreeParseCell(). 3579 ** 3580 ** 2007-06-25: There is a bug in some versions of MSVC that cause the 3581 ** compiler to crash when getCellInfo() is implemented as a macro. 3582 ** But there is a measureable speed advantage to using the macro on gcc 3583 ** (when less compiler optimizations like -Os or -O0 are used and the 3584 ** compiler is not doing agressive inlining.) So we use a real function 3585 ** for MSVC and a macro for everything else. Ticket #2457. 3586 */ 3587 #ifndef NDEBUG 3588 static void assertCellInfo(BtCursor *pCur){ 3589 CellInfo info; 3590 int iPage = pCur->iPage; 3591 memset(&info, 0, sizeof(info)); 3592 btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info); 3593 assert( memcmp(&info, &pCur->info, sizeof(info))==0 ); 3594 } 3595 #else 3596 #define assertCellInfo(x) 3597 #endif 3598 #ifdef _MSC_VER 3599 /* Use a real function in MSVC to work around bugs in that compiler. */ 3600 static void getCellInfo(BtCursor *pCur){ 3601 if( pCur->info.nSize==0 ){ 3602 int iPage = pCur->iPage; 3603 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); 3604 pCur->validNKey = 1; 3605 }else{ 3606 assertCellInfo(pCur); 3607 } 3608 } 3609 #else /* if not _MSC_VER */ 3610 /* Use a macro in all other compilers so that the function is inlined */ 3611 #define getCellInfo(pCur) \ 3612 if( pCur->info.nSize==0 ){ \ 3613 int iPage = pCur->iPage; \ 3614 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \ 3615 pCur->validNKey = 1; \ 3616 }else{ \ 3617 assertCellInfo(pCur); \ 3618 } 3619 #endif /* _MSC_VER */ 3620 3621 #ifndef NDEBUG /* The next routine used only within assert() statements */ 3622 /* 3623 ** Return true if the given BtCursor is valid. A valid cursor is one 3624 ** that is currently pointing to a row in a (non-empty) table. 
3625 ** This is a verification routine is used only within assert() statements. 3626 */ 3627 int sqlite3BtreeCursorIsValid(BtCursor *pCur){ 3628 return pCur && pCur->eState==CURSOR_VALID; 3629 } 3630 #endif /* NDEBUG */ 3631 3632 /* 3633 ** Set *pSize to the size of the buffer needed to hold the value of 3634 ** the key for the current entry. If the cursor is not pointing 3635 ** to a valid entry, *pSize is set to 0. 3636 ** 3637 ** For a table with the INTKEY flag set, this routine returns the key 3638 ** itself, not the number of bytes in the key. 3639 ** 3640 ** The caller must position the cursor prior to invoking this routine. 3641 ** 3642 ** This routine cannot fail. It always returns SQLITE_OK. 3643 */ 3644 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ 3645 assert( cursorHoldsMutex(pCur) ); 3646 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); 3647 if( pCur->eState!=CURSOR_VALID ){ 3648 *pSize = 0; 3649 }else{ 3650 getCellInfo(pCur); 3651 *pSize = pCur->info.nKey; 3652 } 3653 return SQLITE_OK; 3654 } 3655 3656 /* 3657 ** Set *pSize to the number of bytes of data in the entry the 3658 ** cursor currently points to. 3659 ** 3660 ** The caller must guarantee that the cursor is pointing to a non-NULL 3661 ** valid entry. In other words, the calling procedure must guarantee 3662 ** that the cursor has Cursor.eState==CURSOR_VALID. 3663 ** 3664 ** Failure is not possible. This function always returns SQLITE_OK. 3665 ** It might just as well be a procedure (returning void) but we continue 3666 ** to return an integer result code for historical reasons. 
3667 */ 3668 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ 3669 assert( cursorHoldsMutex(pCur) ); 3670 assert( pCur->eState==CURSOR_VALID ); 3671 getCellInfo(pCur); 3672 *pSize = pCur->info.nData; 3673 return SQLITE_OK; 3674 } 3675 3676 /* 3677 ** Given the page number of an overflow page in the database (parameter 3678 ** ovfl), this function finds the page number of the next page in the 3679 ** linked list of overflow pages. If possible, it uses the auto-vacuum 3680 ** pointer-map data instead of reading the content of page ovfl to do so. 3681 ** 3682 ** If an error occurs an SQLite error code is returned. Otherwise: 3683 ** 3684 ** The page number of the next overflow page in the linked list is 3685 ** written to *pPgnoNext. If page ovfl is the last page in its linked 3686 ** list, *pPgnoNext is set to zero. 3687 ** 3688 ** If ppPage is not NULL, and a reference to the MemPage object corresponding 3689 ** to page number pOvfl was obtained, then *ppPage is set to point to that 3690 ** reference. It is the responsibility of the caller to call releasePage() 3691 ** on *ppPage to free the reference. In no reference was obtained (because 3692 ** the pointer-map was used to obtain the value for *pPgnoNext), then 3693 ** *ppPage is set to zero. 3694 */ 3695 static int getOverflowPage( 3696 BtShared *pBt, /* The database file */ 3697 Pgno ovfl, /* Current overflow page number */ 3698 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */ 3699 Pgno *pPgnoNext /* OUT: Next overflow page number */ 3700 ){ 3701 Pgno next = 0; 3702 MemPage *pPage = 0; 3703 int rc = SQLITE_OK; 3704 3705 assert( sqlite3_mutex_held(pBt->mutex) ); 3706 assert(pPgnoNext); 3707 3708 #ifndef SQLITE_OMIT_AUTOVACUUM 3709 /* Try to find the next page in the overflow list using the 3710 ** autovacuum pointer-map pages. Guess that the next page in 3711 ** the overflow list is page number (ovfl+1). 
If that guess turns 3712 ** out to be wrong, fall back to loading the data of page 3713 ** number ovfl to determine the next page number. 3714 */ 3715 if( pBt->autoVacuum ){ 3716 Pgno pgno; 3717 Pgno iGuess = ovfl+1; 3718 u8 eType; 3719 3720 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 3721 iGuess++; 3722 } 3723 3724 if( iGuess<=btreePagecount(pBt) ){ 3725 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 3726 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 3727 next = iGuess; 3728 rc = SQLITE_DONE; 3729 } 3730 } 3731 } 3732 #endif 3733 3734 assert( next==0 || rc==SQLITE_DONE ); 3735 if( rc==SQLITE_OK ){ 3736 rc = btreeGetPage(pBt, ovfl, &pPage, 0); 3737 assert( rc==SQLITE_OK || pPage==0 ); 3738 if( rc==SQLITE_OK ){ 3739 next = get4byte(pPage->aData); 3740 } 3741 } 3742 3743 *pPgnoNext = next; 3744 if( ppPage ){ 3745 *ppPage = pPage; 3746 }else{ 3747 releasePage(pPage); 3748 } 3749 return (rc==SQLITE_DONE ? SQLITE_OK : rc); 3750 } 3751 3752 /* 3753 ** Copy data from a buffer to a page, or from a page to a buffer. 3754 ** 3755 ** pPayload is a pointer to data stored on database page pDbPage. 3756 ** If argument eOp is false, then nByte bytes of data are copied 3757 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, 3758 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 3759 ** of data are copied from the buffer pBuf to pPayload. 3760 ** 3761 ** SQLITE_OK is returned on success, otherwise an error code. 
3762 */ 3763 static int copyPayload( 3764 void *pPayload, /* Pointer to page data */ 3765 void *pBuf, /* Pointer to buffer */ 3766 int nByte, /* Number of bytes to copy */ 3767 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 3768 DbPage *pDbPage /* Page containing pPayload */ 3769 ){ 3770 if( eOp ){ 3771 /* Copy data from buffer to page (a write operation) */ 3772 int rc = sqlite3PagerWrite(pDbPage); 3773 if( rc!=SQLITE_OK ){ 3774 return rc; 3775 } 3776 memcpy(pPayload, pBuf, nByte); 3777 }else{ 3778 /* Copy data from page to buffer (a read operation) */ 3779 memcpy(pBuf, pPayload, nByte); 3780 } 3781 return SQLITE_OK; 3782 } 3783 3784 /* 3785 ** This function is used to read or overwrite payload information 3786 ** for the entry that the pCur cursor is pointing to. If the eOp 3787 ** parameter is 0, this is a read operation (data copied into 3788 ** buffer pBuf). If it is non-zero, a write (data copied from 3789 ** buffer pBuf). 3790 ** 3791 ** A total of "amt" bytes are read or written beginning at "offset". 3792 ** Data is read to or from the buffer pBuf. 3793 ** 3794 ** The content being read or written might appear on the main page 3795 ** or be scattered out on multiple overflow pages. 3796 ** 3797 ** If the BtCursor.isIncrblobHandle flag is set, and the current 3798 ** cursor entry uses one or more overflow pages, this function 3799 ** allocates space for and lazily popluates the overflow page-list 3800 ** cache array (BtCursor.aOverflow). Subsequent calls use this 3801 ** cache to make seeking to the supplied offset more efficient. 3802 ** 3803 ** Once an overflow page-list cache has been allocated, it may be 3804 ** invalidated if some other cursor writes to the same table, or if 3805 ** the cursor is moved to a different row. Additionally, in auto-vacuum 3806 ** mode, the following events may invalidate an overflow page-list cache. 
3807 ** 3808 ** * An incremental vacuum, 3809 ** * A commit in auto_vacuum="full" mode, 3810 ** * Creating a table (may require moving an overflow page). 3811 */ 3812 static int accessPayload( 3813 BtCursor *pCur, /* Cursor pointing to entry to read from */ 3814 u32 offset, /* Begin reading this far into payload */ 3815 u32 amt, /* Read this many bytes */ 3816 unsigned char *pBuf, /* Write the bytes into this buffer */ 3817 int eOp /* zero to read. non-zero to write. */ 3818 ){ 3819 unsigned char *aPayload; 3820 int rc = SQLITE_OK; 3821 u32 nKey; 3822 int iIdx = 0; 3823 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */ 3824 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */ 3825 3826 assert( pPage ); 3827 assert( pCur->eState==CURSOR_VALID ); 3828 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 3829 assert( cursorHoldsMutex(pCur) ); 3830 3831 getCellInfo(pCur); 3832 aPayload = pCur->info.pCell + pCur->info.nHeader; 3833 nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey); 3834 3835 if( NEVER(offset+amt > nKey+pCur->info.nData) 3836 || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] 3837 ){ 3838 /* Trying to read or write past the end of the data is an error */ 3839 return SQLITE_CORRUPT_BKPT; 3840 } 3841 3842 /* Check if data must be read/written to/from the btree page itself. 
*/ 3843 if( offset<pCur->info.nLocal ){ 3844 int a = amt; 3845 if( a+offset>pCur->info.nLocal ){ 3846 a = pCur->info.nLocal - offset; 3847 } 3848 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); 3849 offset = 0; 3850 pBuf += a; 3851 amt -= a; 3852 }else{ 3853 offset -= pCur->info.nLocal; 3854 } 3855 3856 if( rc==SQLITE_OK && amt>0 ){ 3857 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ 3858 Pgno nextPage; 3859 3860 nextPage = get4byte(&aPayload[pCur->info.nLocal]); 3861 3862 #ifndef SQLITE_OMIT_INCRBLOB 3863 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[] 3864 ** has not been allocated, allocate it now. The array is sized at 3865 ** one entry for each overflow page in the overflow chain. The 3866 ** page number of the first overflow page is stored in aOverflow[0], 3867 ** etc. A value of 0 in the aOverflow[] array means "not yet known" 3868 ** (the cache is lazily populated). 3869 */ 3870 if( pCur->isIncrblobHandle && !pCur->aOverflow ){ 3871 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; 3872 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl); 3873 /* nOvfl is always positive. If it were zero, fetchPayload would have 3874 ** been used instead of this routine. */ 3875 if( ALWAYS(nOvfl) && !pCur->aOverflow ){ 3876 rc = SQLITE_NOMEM; 3877 } 3878 } 3879 3880 /* If the overflow page-list cache has been allocated and the 3881 ** entry for the first required overflow page is valid, skip 3882 ** directly to it. 3883 */ 3884 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){ 3885 iIdx = (offset/ovflSize); 3886 nextPage = pCur->aOverflow[iIdx]; 3887 offset = (offset%ovflSize); 3888 } 3889 #endif 3890 3891 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){ 3892 3893 #ifndef SQLITE_OMIT_INCRBLOB 3894 /* If required, populate the overflow page-list cache. 
*/ 3895 if( pCur->aOverflow ){ 3896 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage); 3897 pCur->aOverflow[iIdx] = nextPage; 3898 } 3899 #endif 3900 3901 if( offset>=ovflSize ){ 3902 /* The only reason to read this page is to obtain the page 3903 ** number for the next page in the overflow chain. The page 3904 ** data is not required. So first try to lookup the overflow 3905 ** page-list cache, if any, then fall back to the getOverflowPage() 3906 ** function. 3907 */ 3908 #ifndef SQLITE_OMIT_INCRBLOB 3909 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){ 3910 nextPage = pCur->aOverflow[iIdx+1]; 3911 } else 3912 #endif 3913 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); 3914 offset -= ovflSize; 3915 }else{ 3916 /* Need to read this page properly. It contains some of the 3917 ** range of data that is being read (eOp==0) or written (eOp!=0). 3918 */ 3919 DbPage *pDbPage; 3920 int a = amt; 3921 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage); 3922 if( rc==SQLITE_OK ){ 3923 aPayload = sqlite3PagerGetData(pDbPage); 3924 nextPage = get4byte(aPayload); 3925 if( a + offset > ovflSize ){ 3926 a = ovflSize - offset; 3927 } 3928 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); 3929 sqlite3PagerUnref(pDbPage); 3930 offset = 0; 3931 amt -= a; 3932 pBuf += a; 3933 } 3934 } 3935 } 3936 } 3937 3938 if( rc==SQLITE_OK && amt>0 ){ 3939 return SQLITE_CORRUPT_BKPT; 3940 } 3941 return rc; 3942 } 3943 3944 /* 3945 ** Read part of the key associated with cursor pCur. Exactly 3946 ** "amt" bytes will be transfered into pBuf[]. The transfer 3947 ** begins at "offset". 3948 ** 3949 ** The caller must ensure that pCur is pointing to a valid row 3950 ** in the table. 3951 ** 3952 ** Return SQLITE_OK on success or an error code if anything goes 3953 ** wrong. An error is returned if "offset+amt" is larger than 3954 ** the available payload. 
3955 */ 3956 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3957 assert( cursorHoldsMutex(pCur) ); 3958 assert( pCur->eState==CURSOR_VALID ); 3959 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); 3960 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 3961 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); 3962 } 3963 3964 /* 3965 ** Read part of the data associated with cursor pCur. Exactly 3966 ** "amt" bytes will be transfered into pBuf[]. The transfer 3967 ** begins at "offset". 3968 ** 3969 ** Return SQLITE_OK on success or an error code if anything goes 3970 ** wrong. An error is returned if "offset+amt" is larger than 3971 ** the available payload. 3972 */ 3973 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3974 int rc; 3975 3976 #ifndef SQLITE_OMIT_INCRBLOB 3977 if ( pCur->eState==CURSOR_INVALID ){ 3978 return SQLITE_ABORT; 3979 } 3980 #endif 3981 3982 assert( cursorHoldsMutex(pCur) ); 3983 rc = restoreCursorPosition(pCur); 3984 if( rc==SQLITE_OK ){ 3985 assert( pCur->eState==CURSOR_VALID ); 3986 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); 3987 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 3988 rc = accessPayload(pCur, offset, amt, pBuf, 0); 3989 } 3990 return rc; 3991 } 3992 3993 /* 3994 ** Return a pointer to payload information from the entry that the 3995 ** pCur cursor is pointing to. The pointer is to the beginning of 3996 ** the key if skipKey==0 and it points to the beginning of data if 3997 ** skipKey==1. The number of bytes of available key/data is written 3998 ** into *pAmt. If *pAmt==0, then the value returned will not be 3999 ** a valid pointer. 4000 ** 4001 ** This routine is an optimization. It is common for the entire key 4002 ** and data to fit on the local page and for there to be no overflow 4003 ** pages. When that is so, this routine can be used to access the 4004 ** key and data without making a copy. 
If the key and/or data spills 4005 ** onto overflow pages, then accessPayload() must be used to reassemble 4006 ** the key/data and copy it into a preallocated buffer. 4007 ** 4008 ** The pointer returned by this routine looks directly into the cached 4009 ** page of the database. The data might change or move the next time 4010 ** any btree routine is called. 4011 */ 4012 static const unsigned char *fetchPayload( 4013 BtCursor *pCur, /* Cursor pointing to entry to read from */ 4014 int *pAmt, /* Write the number of available bytes here */ 4015 int skipKey /* read beginning at data if this is true */ 4016 ){ 4017 unsigned char *aPayload; 4018 MemPage *pPage; 4019 u32 nKey; 4020 u32 nLocal; 4021 4022 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]); 4023 assert( pCur->eState==CURSOR_VALID ); 4024 assert( cursorHoldsMutex(pCur) ); 4025 pPage = pCur->apPage[pCur->iPage]; 4026 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 4027 if( NEVER(pCur->info.nSize==0) ){ 4028 btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage], 4029 &pCur->info); 4030 } 4031 aPayload = pCur->info.pCell; 4032 aPayload += pCur->info.nHeader; 4033 if( pPage->intKey ){ 4034 nKey = 0; 4035 }else{ 4036 nKey = (int)pCur->info.nKey; 4037 } 4038 if( skipKey ){ 4039 aPayload += nKey; 4040 nLocal = pCur->info.nLocal - nKey; 4041 }else{ 4042 nLocal = pCur->info.nLocal; 4043 assert( nLocal<=nKey ); 4044 } 4045 *pAmt = nLocal; 4046 return aPayload; 4047 } 4048 4049 4050 /* 4051 ** For the entry that cursor pCur is point to, return as 4052 ** many bytes of the key or data as are available on the local 4053 ** b-tree page. Write the number of available bytes into *pAmt. 4054 ** 4055 ** The pointer returned is ephemeral. The key/data may move 4056 ** or be destroyed on the next call to any Btree routine, 4057 ** including calls from other threads against the same cache. 4058 ** Hence, a mutex on the BtShared should be held prior to calling 4059 ** this routine. 
4060 ** 4061 ** These routines is used to get quick access to key and data 4062 ** in the common case where no overflow pages are used. 4063 */ 4064 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){ 4065 const void *p = 0; 4066 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4067 assert( cursorHoldsMutex(pCur) ); 4068 if( ALWAYS(pCur->eState==CURSOR_VALID) ){ 4069 p = (const void*)fetchPayload(pCur, pAmt, 0); 4070 } 4071 return p; 4072 } 4073 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){ 4074 const void *p = 0; 4075 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4076 assert( cursorHoldsMutex(pCur) ); 4077 if( ALWAYS(pCur->eState==CURSOR_VALID) ){ 4078 p = (const void*)fetchPayload(pCur, pAmt, 1); 4079 } 4080 return p; 4081 } 4082 4083 4084 /* 4085 ** Move the cursor down to a new child page. The newPgno argument is the 4086 ** page number of the child page to move to. 4087 ** 4088 ** This function returns SQLITE_CORRUPT if the page-header flags field of 4089 ** the new child page does not match the flags field of the parent (i.e. 4090 ** if an intkey page appears to be the parent of a non-intkey page, or 4091 ** vice-versa). 
4092 */ 4093 static int moveToChild(BtCursor *pCur, u32 newPgno){ 4094 int rc; 4095 int i = pCur->iPage; 4096 MemPage *pNewPage; 4097 BtShared *pBt = pCur->pBt; 4098 4099 assert( cursorHoldsMutex(pCur) ); 4100 assert( pCur->eState==CURSOR_VALID ); 4101 assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); 4102 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 4103 return SQLITE_CORRUPT_BKPT; 4104 } 4105 rc = getAndInitPage(pBt, newPgno, &pNewPage); 4106 if( rc ) return rc; 4107 pCur->apPage[i+1] = pNewPage; 4108 pCur->aiIdx[i+1] = 0; 4109 pCur->iPage++; 4110 4111 pCur->info.nSize = 0; 4112 pCur->validNKey = 0; 4113 if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){ 4114 return SQLITE_CORRUPT_BKPT; 4115 } 4116 return SQLITE_OK; 4117 } 4118 4119 #ifndef NDEBUG 4120 /* 4121 ** Page pParent is an internal (non-leaf) tree page. This function 4122 ** asserts that page number iChild is the left-child if the iIdx'th 4123 ** cell in page pParent. Or, if iIdx is equal to the total number of 4124 ** cells in pParent, that page number iChild is the right-child of 4125 ** the page. 4126 */ 4127 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ 4128 assert( iIdx<=pParent->nCell ); 4129 if( iIdx==pParent->nCell ){ 4130 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); 4131 }else{ 4132 assert( get4byte(findCell(pParent, iIdx))==iChild ); 4133 } 4134 } 4135 #else 4136 # define assertParentIndex(x,y,z) 4137 #endif 4138 4139 /* 4140 ** Move the cursor up to the parent page. 4141 ** 4142 ** pCur->idx is set to the cell index that contains the pointer 4143 ** to the page we are coming from. If we are coming from the 4144 ** right-most child page then pCur->idx is set to one more than 4145 ** the largest cell index. 
4146 */ 4147 static void moveToParent(BtCursor *pCur){ 4148 assert( cursorHoldsMutex(pCur) ); 4149 assert( pCur->eState==CURSOR_VALID ); 4150 assert( pCur->iPage>0 ); 4151 assert( pCur->apPage[pCur->iPage] ); 4152 assertParentIndex( 4153 pCur->apPage[pCur->iPage-1], 4154 pCur->aiIdx[pCur->iPage-1], 4155 pCur->apPage[pCur->iPage]->pgno 4156 ); 4157 releasePage(pCur->apPage[pCur->iPage]); 4158 pCur->iPage--; 4159 pCur->info.nSize = 0; 4160 pCur->validNKey = 0; 4161 } 4162 4163 /* 4164 ** Move the cursor to point to the root page of its b-tree structure. 4165 ** 4166 ** If the table has a virtual root page, then the cursor is moved to point 4167 ** to the virtual root page instead of the actual root page. A table has a 4168 ** virtual root page when the actual root page contains no cells and a 4169 ** single child page. This can only happen with the table rooted at page 1. 4170 ** 4171 ** If the b-tree structure is empty, the cursor state is set to 4172 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first 4173 ** cell located on the root (or virtual root) page and the cursor state 4174 ** is set to CURSOR_VALID. 4175 ** 4176 ** If this function returns successfully, it may be assumed that the 4177 ** page-header flags indicate that the [virtual] root-page is the expected 4178 ** kind of b-tree page (i.e. if when opening the cursor the caller did not 4179 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, 4180 ** indicating a table b-tree, or if the caller did specify a KeyInfo 4181 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index 4182 ** b-tree). 
4183 */ 4184 static int moveToRoot(BtCursor *pCur){ 4185 MemPage *pRoot; 4186 int rc = SQLITE_OK; 4187 Btree *p = pCur->pBtree; 4188 BtShared *pBt = p->pBt; 4189 4190 assert( cursorHoldsMutex(pCur) ); 4191 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 4192 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 4193 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 4194 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 4195 if( pCur->eState==CURSOR_FAULT ){ 4196 assert( pCur->skipNext!=SQLITE_OK ); 4197 return pCur->skipNext; 4198 } 4199 sqlite3BtreeClearCursor(pCur); 4200 } 4201 4202 if( pCur->iPage>=0 ){ 4203 int i; 4204 for(i=1; i<=pCur->iPage; i++){ 4205 releasePage(pCur->apPage[i]); 4206 } 4207 pCur->iPage = 0; 4208 }else{ 4209 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]); 4210 if( rc!=SQLITE_OK ){ 4211 pCur->eState = CURSOR_INVALID; 4212 return rc; 4213 } 4214 pCur->iPage = 0; 4215 4216 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor 4217 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is 4218 ** NULL, the caller expects a table b-tree. If this is not the case, 4219 ** return an SQLITE_CORRUPT error. */ 4220 assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 ); 4221 if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){ 4222 return SQLITE_CORRUPT_BKPT; 4223 } 4224 } 4225 4226 /* Assert that the root page is of the correct type. This must be the 4227 ** case as the call to this function that loaded the root-page (either 4228 ** this call or a previous invocation) would have detected corruption 4229 ** if the assumption were not true, and it is not possible for the flags 4230 ** byte to have been modified while this cursor is holding a reference 4231 ** to the page. 
*/ 4232 pRoot = pCur->apPage[0]; 4233 assert( pRoot->pgno==pCur->pgnoRoot ); 4234 assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey ); 4235 4236 pCur->aiIdx[0] = 0; 4237 pCur->info.nSize = 0; 4238 pCur->atLast = 0; 4239 pCur->validNKey = 0; 4240 4241 if( pRoot->nCell==0 && !pRoot->leaf ){ 4242 Pgno subpage; 4243 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT; 4244 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 4245 pCur->eState = CURSOR_VALID; 4246 rc = moveToChild(pCur, subpage); 4247 }else{ 4248 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID); 4249 } 4250 return rc; 4251 } 4252 4253 /* 4254 ** Move the cursor down to the left-most leaf entry beneath the 4255 ** entry to which it is currently pointing. 4256 ** 4257 ** The left-most leaf is the one with the smallest key - the first 4258 ** in ascending order. 4259 */ 4260 static int moveToLeftmost(BtCursor *pCur){ 4261 Pgno pgno; 4262 int rc = SQLITE_OK; 4263 MemPage *pPage; 4264 4265 assert( cursorHoldsMutex(pCur) ); 4266 assert( pCur->eState==CURSOR_VALID ); 4267 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ 4268 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 4269 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage])); 4270 rc = moveToChild(pCur, pgno); 4271 } 4272 return rc; 4273 } 4274 4275 /* 4276 ** Move the cursor down to the right-most leaf entry beneath the 4277 ** page to which it is currently pointing. Notice the difference 4278 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 4279 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 4280 ** finds the right-most entry beneath the *page*. 4281 ** 4282 ** The right-most entry is the one with the largest key - the last 4283 ** key in ascending order. 
4284 */ 4285 static int moveToRightmost(BtCursor *pCur){ 4286 Pgno pgno; 4287 int rc = SQLITE_OK; 4288 MemPage *pPage = 0; 4289 4290 assert( cursorHoldsMutex(pCur) ); 4291 assert( pCur->eState==CURSOR_VALID ); 4292 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ 4293 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 4294 pCur->aiIdx[pCur->iPage] = pPage->nCell; 4295 rc = moveToChild(pCur, pgno); 4296 } 4297 if( rc==SQLITE_OK ){ 4298 pCur->aiIdx[pCur->iPage] = pPage->nCell-1; 4299 pCur->info.nSize = 0; 4300 pCur->validNKey = 0; 4301 } 4302 return rc; 4303 } 4304 4305 /* Move the cursor to the first entry in the table. Return SQLITE_OK 4306 ** on success. Set *pRes to 0 if the cursor actually points to something 4307 ** or set *pRes to 1 if the table is empty. 4308 */ 4309 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 4310 int rc; 4311 4312 assert( cursorHoldsMutex(pCur) ); 4313 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4314 rc = moveToRoot(pCur); 4315 if( rc==SQLITE_OK ){ 4316 if( pCur->eState==CURSOR_INVALID ){ 4317 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 4318 *pRes = 1; 4319 }else{ 4320 assert( pCur->apPage[pCur->iPage]->nCell>0 ); 4321 *pRes = 0; 4322 rc = moveToLeftmost(pCur); 4323 } 4324 } 4325 return rc; 4326 } 4327 4328 /* Move the cursor to the last entry in the table. Return SQLITE_OK 4329 ** on success. Set *pRes to 0 if the cursor actually points to something 4330 ** or set *pRes to 1 if the table is empty. 4331 */ 4332 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 4333 int rc; 4334 4335 assert( cursorHoldsMutex(pCur) ); 4336 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4337 4338 /* If the cursor already points to the last entry, this is a no-op. */ 4339 if( CURSOR_VALID==pCur->eState && pCur->atLast ){ 4340 #ifdef SQLITE_DEBUG 4341 /* This block serves to assert() that the cursor really does point 4342 ** to the last entry in the b-tree. 
*/ 4343 int ii; 4344 for(ii=0; ii<pCur->iPage; ii++){ 4345 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell ); 4346 } 4347 assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 ); 4348 assert( pCur->apPage[pCur->iPage]->leaf ); 4349 #endif 4350 return SQLITE_OK; 4351 } 4352 4353 rc = moveToRoot(pCur); 4354 if( rc==SQLITE_OK ){ 4355 if( CURSOR_INVALID==pCur->eState ){ 4356 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 4357 *pRes = 1; 4358 }else{ 4359 assert( pCur->eState==CURSOR_VALID ); 4360 *pRes = 0; 4361 rc = moveToRightmost(pCur); 4362 pCur->atLast = rc==SQLITE_OK ?1:0; 4363 } 4364 } 4365 return rc; 4366 } 4367 4368 /* Move the cursor so that it points to an entry near the key 4369 ** specified by pIdxKey or intKey. Return a success code. 4370 ** 4371 ** For INTKEY tables, the intKey parameter is used. pIdxKey 4372 ** must be NULL. For index tables, pIdxKey is used and intKey 4373 ** is ignored. 4374 ** 4375 ** If an exact match is not found, then the cursor is always 4376 ** left pointing at a leaf page which would hold the entry if it 4377 ** were present. The cursor might point to an entry that comes 4378 ** before or after the key. 4379 ** 4380 ** An integer is written into *pRes which is the result of 4381 ** comparing the key with the entry to which the cursor is 4382 ** pointing. The meaning of the integer written into 4383 ** *pRes is as follows: 4384 ** 4385 ** *pRes<0 The cursor is left pointing at an entry that 4386 ** is smaller than intKey/pIdxKey or if the table is empty 4387 ** and the cursor is therefore left point to nothing. 4388 ** 4389 ** *pRes==0 The cursor is left pointing at an entry that 4390 ** exactly matches intKey/pIdxKey. 4391 ** 4392 ** *pRes>0 The cursor is left pointing at an entry that 4393 ** is larger than intKey/pIdxKey. 
4394 ** 4395 */ 4396 int sqlite3BtreeMovetoUnpacked( 4397 BtCursor *pCur, /* The cursor to be moved */ 4398 UnpackedRecord *pIdxKey, /* Unpacked index key */ 4399 i64 intKey, /* The table key */ 4400 int biasRight, /* If true, bias the search to the high end */ 4401 int *pRes /* Write search results here */ 4402 ){ 4403 int rc; 4404 4405 assert( cursorHoldsMutex(pCur) ); 4406 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4407 assert( pRes ); 4408 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) ); 4409 4410 /* If the cursor is already positioned at the point we are trying 4411 ** to move to, then just return without doing any work */ 4412 if( pCur->eState==CURSOR_VALID && pCur->validNKey 4413 && pCur->apPage[0]->intKey 4414 ){ 4415 if( pCur->info.nKey==intKey ){ 4416 *pRes = 0; 4417 return SQLITE_OK; 4418 } 4419 if( pCur->atLast && pCur->info.nKey<intKey ){ 4420 *pRes = -1; 4421 return SQLITE_OK; 4422 } 4423 } 4424 4425 rc = moveToRoot(pCur); 4426 if( rc ){ 4427 return rc; 4428 } 4429 assert( pCur->apPage[pCur->iPage] ); 4430 assert( pCur->apPage[pCur->iPage]->isInit ); 4431 assert( pCur->apPage[pCur->iPage]->nCell>0 || pCur->eState==CURSOR_INVALID ); 4432 if( pCur->eState==CURSOR_INVALID ){ 4433 *pRes = -1; 4434 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 4435 return SQLITE_OK; 4436 } 4437 assert( pCur->apPage[0]->intKey || pIdxKey ); 4438 for(;;){ 4439 int lwr, upr; 4440 Pgno chldPg; 4441 MemPage *pPage = pCur->apPage[pCur->iPage]; 4442 int c; 4443 4444 /* pPage->nCell must be greater than zero. If this is the root-page 4445 ** the cursor would have been INVALID above and this for(;;) loop 4446 ** not run. If this is not the root-page, then the moveToChild() routine 4447 ** would have already detected db corruption. Similarly, pPage must 4448 ** be the right kind (index or table) of b-tree page. Otherwise 4449 ** a moveToChild() or moveToRoot() call would have detected corruption. 
*/ 4450 assert( pPage->nCell>0 ); 4451 assert( pPage->intKey==(pIdxKey==0) ); 4452 lwr = 0; 4453 upr = pPage->nCell-1; 4454 if( biasRight ){ 4455 pCur->aiIdx[pCur->iPage] = (u16)upr; 4456 }else{ 4457 pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2); 4458 } 4459 for(;;){ 4460 int idx = pCur->aiIdx[pCur->iPage]; /* Index of current cell in pPage */ 4461 u8 *pCell; /* Pointer to current cell in pPage */ 4462 4463 pCur->info.nSize = 0; 4464 pCell = findCell(pPage, idx) + pPage->childPtrSize; 4465 if( pPage->intKey ){ 4466 i64 nCellKey; 4467 if( pPage->hasData ){ 4468 u32 dummy; 4469 pCell += getVarint32(pCell, dummy); 4470 } 4471 getVarint(pCell, (u64*)&nCellKey); 4472 if( nCellKey==intKey ){ 4473 c = 0; 4474 }else if( nCellKey<intKey ){ 4475 c = -1; 4476 }else{ 4477 assert( nCellKey>intKey ); 4478 c = +1; 4479 } 4480 pCur->validNKey = 1; 4481 pCur->info.nKey = nCellKey; 4482 }else{ 4483 /* The maximum supported page-size is 65536 bytes. This means that 4484 ** the maximum number of record bytes stored on an index B-Tree 4485 ** page is less than 16384 bytes and may be stored as a 2-byte 4486 ** varint. This information is used to attempt to avoid parsing 4487 ** the entire cell by checking for the cases where the record is 4488 ** stored entirely within the b-tree page by inspecting the first 4489 ** 2 bytes of the cell. 4490 */ 4491 int nCell = pCell[0]; 4492 if( !(nCell & 0x80) && nCell<=pPage->maxLocal ){ 4493 /* This branch runs if the record-size field of the cell is a 4494 ** single byte varint and the record fits entirely on the main 4495 ** b-tree page. */ 4496 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 4497 }else if( !(pCell[1] & 0x80) 4498 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 4499 ){ 4500 /* The record-size field is a 2 byte varint and the record 4501 ** fits entirely on the main b-tree page. 
*/ 4502 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 4503 }else{ 4504 /* The record flows over onto one or more overflow pages. In 4505 ** this case the whole cell needs to be parsed, a buffer allocated 4506 ** and accessPayload() used to retrieve the record into the 4507 ** buffer before VdbeRecordCompare() can be called. */ 4508 void *pCellKey; 4509 u8 * const pCellBody = pCell - pPage->childPtrSize; 4510 btreeParseCellPtr(pPage, pCellBody, &pCur->info); 4511 nCell = (int)pCur->info.nKey; 4512 pCellKey = sqlite3Malloc( nCell ); 4513 if( pCellKey==0 ){ 4514 rc = SQLITE_NOMEM; 4515 goto moveto_finish; 4516 } 4517 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); 4518 if( rc ){ 4519 sqlite3_free(pCellKey); 4520 goto moveto_finish; 4521 } 4522 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); 4523 sqlite3_free(pCellKey); 4524 } 4525 } 4526 if( c==0 ){ 4527 if( pPage->intKey && !pPage->leaf ){ 4528 lwr = idx; 4529 upr = lwr - 1; 4530 break; 4531 }else{ 4532 *pRes = 0; 4533 rc = SQLITE_OK; 4534 goto moveto_finish; 4535 } 4536 } 4537 if( c<0 ){ 4538 lwr = idx+1; 4539 }else{ 4540 upr = idx-1; 4541 } 4542 if( lwr>upr ){ 4543 break; 4544 } 4545 pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2); 4546 } 4547 assert( lwr==upr+1 ); 4548 assert( pPage->isInit ); 4549 if( pPage->leaf ){ 4550 chldPg = 0; 4551 }else if( lwr>=pPage->nCell ){ 4552 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 4553 }else{ 4554 chldPg = get4byte(findCell(pPage, lwr)); 4555 } 4556 if( chldPg==0 ){ 4557 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 4558 *pRes = c; 4559 rc = SQLITE_OK; 4560 goto moveto_finish; 4561 } 4562 pCur->aiIdx[pCur->iPage] = (u16)lwr; 4563 pCur->info.nSize = 0; 4564 pCur->validNKey = 0; 4565 rc = moveToChild(pCur, chldPg); 4566 if( rc ) goto moveto_finish; 4567 } 4568 moveto_finish: 4569 return rc; 4570 } 4571 4572 4573 /* 4574 ** Return TRUE if the cursor is not pointing at an entry of the table. 
4575 ** 4576 ** TRUE will be returned after a call to sqlite3BtreeNext() moves 4577 ** past the last entry in the table or sqlite3BtreePrev() moves past 4578 ** the first entry. TRUE is also returned if the table is empty. 4579 */ 4580 int sqlite3BtreeEof(BtCursor *pCur){ 4581 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 4582 ** have been deleted? This API will need to change to return an error code 4583 ** as well as the boolean result value. 4584 */ 4585 return (CURSOR_VALID!=pCur->eState); 4586 } 4587 4588 /* 4589 ** Advance the cursor to the next entry in the database. If 4590 ** successful then set *pRes=0. If the cursor 4591 ** was already pointing to the last entry in the database before 4592 ** this routine was called, then set *pRes=1. 4593 */ 4594 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ 4595 int rc; 4596 int idx; 4597 MemPage *pPage; 4598 4599 assert( cursorHoldsMutex(pCur) ); 4600 rc = restoreCursorPosition(pCur); 4601 if( rc!=SQLITE_OK ){ 4602 return rc; 4603 } 4604 assert( pRes!=0 ); 4605 if( CURSOR_INVALID==pCur->eState ){ 4606 *pRes = 1; 4607 return SQLITE_OK; 4608 } 4609 if( pCur->skipNext>0 ){ 4610 pCur->skipNext = 0; 4611 *pRes = 0; 4612 return SQLITE_OK; 4613 } 4614 pCur->skipNext = 0; 4615 4616 pPage = pCur->apPage[pCur->iPage]; 4617 idx = ++pCur->aiIdx[pCur->iPage]; 4618 assert( pPage->isInit ); 4619 assert( idx<=pPage->nCell ); 4620 4621 pCur->info.nSize = 0; 4622 pCur->validNKey = 0; 4623 if( idx>=pPage->nCell ){ 4624 if( !pPage->leaf ){ 4625 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 4626 if( rc ) return rc; 4627 rc = moveToLeftmost(pCur); 4628 *pRes = 0; 4629 return rc; 4630 } 4631 do{ 4632 if( pCur->iPage==0 ){ 4633 *pRes = 1; 4634 pCur->eState = CURSOR_INVALID; 4635 return SQLITE_OK; 4636 } 4637 moveToParent(pCur); 4638 pPage = pCur->apPage[pCur->iPage]; 4639 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell ); 4640 *pRes = 0; 4641 if( pPage->intKey ){ 4642 rc = 
sqlite3BtreeNext(pCur, pRes); 4643 }else{ 4644 rc = SQLITE_OK; 4645 } 4646 return rc; 4647 } 4648 *pRes = 0; 4649 if( pPage->leaf ){ 4650 return SQLITE_OK; 4651 } 4652 rc = moveToLeftmost(pCur); 4653 return rc; 4654 } 4655 4656 4657 /* 4658 ** Step the cursor to the back to the previous entry in the database. If 4659 ** successful then set *pRes=0. If the cursor 4660 ** was already pointing to the first entry in the database before 4661 ** this routine was called, then set *pRes=1. 4662 */ 4663 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ 4664 int rc; 4665 MemPage *pPage; 4666 4667 assert( cursorHoldsMutex(pCur) ); 4668 rc = restoreCursorPosition(pCur); 4669 if( rc!=SQLITE_OK ){ 4670 return rc; 4671 } 4672 pCur->atLast = 0; 4673 if( CURSOR_INVALID==pCur->eState ){ 4674 *pRes = 1; 4675 return SQLITE_OK; 4676 } 4677 if( pCur->skipNext<0 ){ 4678 pCur->skipNext = 0; 4679 *pRes = 0; 4680 return SQLITE_OK; 4681 } 4682 pCur->skipNext = 0; 4683 4684 pPage = pCur->apPage[pCur->iPage]; 4685 assert( pPage->isInit ); 4686 if( !pPage->leaf ){ 4687 int idx = pCur->aiIdx[pCur->iPage]; 4688 rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); 4689 if( rc ){ 4690 return rc; 4691 } 4692 rc = moveToRightmost(pCur); 4693 }else{ 4694 while( pCur->aiIdx[pCur->iPage]==0 ){ 4695 if( pCur->iPage==0 ){ 4696 pCur->eState = CURSOR_INVALID; 4697 *pRes = 1; 4698 return SQLITE_OK; 4699 } 4700 moveToParent(pCur); 4701 } 4702 pCur->info.nSize = 0; 4703 pCur->validNKey = 0; 4704 4705 pCur->aiIdx[pCur->iPage]--; 4706 pPage = pCur->apPage[pCur->iPage]; 4707 if( pPage->intKey && !pPage->leaf ){ 4708 rc = sqlite3BtreePrevious(pCur, pRes); 4709 }else{ 4710 rc = SQLITE_OK; 4711 } 4712 } 4713 *pRes = 0; 4714 return rc; 4715 } 4716 4717 /* 4718 ** Allocate a new page from the database file. 4719 ** 4720 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 4721 ** has already been called on the new page.) 
The new page has also 4722 ** been referenced and the calling routine is responsible for calling 4723 ** sqlite3PagerUnref() on the new page when it is done. 4724 ** 4725 ** SQLITE_OK is returned on success. Any other return value indicates 4726 ** an error. *ppPage and *pPgno are undefined in the event of an error. 4727 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned. 4728 ** 4729 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 4730 ** locate a page close to the page number "nearby". This can be used in an 4731 ** attempt to keep related pages close to each other in the database file, 4732 ** which in turn can make database access faster. 4733 ** 4734 ** If the "exact" parameter is not 0, and the page-number nearby exists 4735 ** anywhere on the free-list, then it is guarenteed to be returned. This 4736 ** is only used by auto-vacuum databases when allocating a new table. 4737 */ 4738 static int allocateBtreePage( 4739 BtShared *pBt, 4740 MemPage **ppPage, 4741 Pgno *pPgno, 4742 Pgno nearby, 4743 u8 exact 4744 ){ 4745 MemPage *pPage1; 4746 int rc; 4747 u32 n; /* Number of pages on the freelist */ 4748 u32 k; /* Number of leaves on the trunk of the freelist */ 4749 MemPage *pTrunk = 0; 4750 MemPage *pPrevTrunk = 0; 4751 Pgno mxPage; /* Total size of the database file */ 4752 4753 assert( sqlite3_mutex_held(pBt->mutex) ); 4754 pPage1 = pBt->pPage1; 4755 mxPage = btreePagecount(pBt); 4756 n = get4byte(&pPage1->aData[36]); 4757 testcase( n==mxPage-1 ); 4758 if( n>=mxPage ){ 4759 return SQLITE_CORRUPT_BKPT; 4760 } 4761 if( n>0 ){ 4762 /* There are pages on the freelist. Reuse one of those pages. */ 4763 Pgno iTrunk; 4764 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 4765 4766 /* If the 'exact' parameter was true and a query of the pointer-map 4767 ** shows that the page 'nearby' is somewhere on the free-list, then 4768 ** the entire-list will be searched for that page. 
4769 */ 4770 #ifndef SQLITE_OMIT_AUTOVACUUM 4771 if( exact && nearby<=mxPage ){ 4772 u8 eType; 4773 assert( nearby>0 ); 4774 assert( pBt->autoVacuum ); 4775 rc = ptrmapGet(pBt, nearby, &eType, 0); 4776 if( rc ) return rc; 4777 if( eType==PTRMAP_FREEPAGE ){ 4778 searchList = 1; 4779 } 4780 *pPgno = nearby; 4781 } 4782 #endif 4783 4784 /* Decrement the free-list count by 1. Set iTrunk to the index of the 4785 ** first free-list trunk page. iPrevTrunk is initially 1. 4786 */ 4787 rc = sqlite3PagerWrite(pPage1->pDbPage); 4788 if( rc ) return rc; 4789 put4byte(&pPage1->aData[36], n-1); 4790 4791 /* The code within this loop is run only once if the 'searchList' variable 4792 ** is not true. Otherwise, it runs once for each trunk-page on the 4793 ** free-list until the page 'nearby' is located. 4794 */ 4795 do { 4796 pPrevTrunk = pTrunk; 4797 if( pPrevTrunk ){ 4798 iTrunk = get4byte(&pPrevTrunk->aData[0]); 4799 }else{ 4800 iTrunk = get4byte(&pPage1->aData[32]); 4801 } 4802 testcase( iTrunk==mxPage ); 4803 if( iTrunk>mxPage ){ 4804 rc = SQLITE_CORRUPT_BKPT; 4805 }else{ 4806 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 4807 } 4808 if( rc ){ 4809 pTrunk = 0; 4810 goto end_allocate_page; 4811 } 4812 4813 k = get4byte(&pTrunk->aData[4]); 4814 if( k==0 && !searchList ){ 4815 /* The trunk has no leaves and the list is not being searched. 4816 ** So extract the trunk page itself and use it as the newly 4817 ** allocated page */ 4818 assert( pPrevTrunk==0 ); 4819 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4820 if( rc ){ 4821 goto end_allocate_page; 4822 } 4823 *pPgno = iTrunk; 4824 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 4825 *ppPage = pTrunk; 4826 pTrunk = 0; 4827 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 4828 }else if( k>(u32)(pBt->usableSize/4 - 2) ){ 4829 /* Value of k is out of range. 
Database corruption */ 4830 rc = SQLITE_CORRUPT_BKPT; 4831 goto end_allocate_page; 4832 #ifndef SQLITE_OMIT_AUTOVACUUM 4833 }else if( searchList && nearby==iTrunk ){ 4834 /* The list is being searched and this trunk page is the page 4835 ** to allocate, regardless of whether it has leaves. 4836 */ 4837 assert( *pPgno==iTrunk ); 4838 *ppPage = pTrunk; 4839 searchList = 0; 4840 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4841 if( rc ){ 4842 goto end_allocate_page; 4843 } 4844 if( k==0 ){ 4845 if( !pPrevTrunk ){ 4846 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 4847 }else{ 4848 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 4849 if( rc!=SQLITE_OK ){ 4850 goto end_allocate_page; 4851 } 4852 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 4853 } 4854 }else{ 4855 /* The trunk page is required by the caller but it contains 4856 ** pointers to free-list leaves. The first leaf becomes a trunk 4857 ** page in this case. 4858 */ 4859 MemPage *pNewTrunk; 4860 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 4861 if( iNewTrunk>mxPage ){ 4862 rc = SQLITE_CORRUPT_BKPT; 4863 goto end_allocate_page; 4864 } 4865 testcase( iNewTrunk==mxPage ); 4866 rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); 4867 if( rc!=SQLITE_OK ){ 4868 goto end_allocate_page; 4869 } 4870 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 4871 if( rc!=SQLITE_OK ){ 4872 releasePage(pNewTrunk); 4873 goto end_allocate_page; 4874 } 4875 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 4876 put4byte(&pNewTrunk->aData[4], k-1); 4877 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 4878 releasePage(pNewTrunk); 4879 if( !pPrevTrunk ){ 4880 assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); 4881 put4byte(&pPage1->aData[32], iNewTrunk); 4882 }else{ 4883 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 4884 if( rc ){ 4885 goto end_allocate_page; 4886 } 4887 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 4888 } 4889 } 4890 pTrunk = 0; 4891 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 4892 #endif 
4893 }else if( k>0 ){ 4894 /* Extract a leaf from the trunk */ 4895 u32 closest; 4896 Pgno iPage; 4897 unsigned char *aData = pTrunk->aData; 4898 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4899 if( rc ){ 4900 goto end_allocate_page; 4901 } 4902 if( nearby>0 ){ 4903 u32 i; 4904 int dist; 4905 closest = 0; 4906 dist = get4byte(&aData[8]) - nearby; 4907 if( dist<0 ) dist = -dist; 4908 for(i=1; i<k; i++){ 4909 int d2 = get4byte(&aData[8+i*4]) - nearby; 4910 if( d2<0 ) d2 = -d2; 4911 if( d2<dist ){ 4912 closest = i; 4913 dist = d2; 4914 } 4915 } 4916 }else{ 4917 closest = 0; 4918 } 4919 4920 iPage = get4byte(&aData[8+closest*4]); 4921 testcase( iPage==mxPage ); 4922 if( iPage>mxPage ){ 4923 rc = SQLITE_CORRUPT_BKPT; 4924 goto end_allocate_page; 4925 } 4926 testcase( iPage==mxPage ); 4927 if( !searchList || iPage==nearby ){ 4928 int noContent; 4929 *pPgno = iPage; 4930 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" 4931 ": %d more free pages\n", 4932 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 4933 if( closest<k-1 ){ 4934 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 4935 } 4936 put4byte(&aData[4], k-1); 4937 assert( sqlite3PagerIswriteable(pTrunk->pDbPage) ); 4938 noContent = !btreeGetHasContent(pBt, *pPgno); 4939 rc = btreeGetPage(pBt, *pPgno, ppPage, noContent); 4940 if( rc==SQLITE_OK ){ 4941 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4942 if( rc!=SQLITE_OK ){ 4943 releasePage(*ppPage); 4944 } 4945 } 4946 searchList = 0; 4947 } 4948 } 4949 releasePage(pPrevTrunk); 4950 pPrevTrunk = 0; 4951 }while( searchList ); 4952 }else{ 4953 /* There are no pages on the freelist, so create a new page at the 4954 ** end of the file */ 4955 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 4956 if( rc ) return rc; 4957 pBt->nPage++; 4958 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++; 4959 4960 #ifndef SQLITE_OMIT_AUTOVACUUM 4961 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){ 4962 /* If *pPgno refers to a pointer-map page, allocate two new pages 4963 ** at the end of 
the file instead of one. The first allocated page 4964 ** becomes a new pointer-map page, the second is used by the caller. 4965 */ 4966 MemPage *pPg = 0; 4967 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage)); 4968 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) ); 4969 rc = btreeGetPage(pBt, pBt->nPage, &pPg, 1); 4970 if( rc==SQLITE_OK ){ 4971 rc = sqlite3PagerWrite(pPg->pDbPage); 4972 releasePage(pPg); 4973 } 4974 if( rc ) return rc; 4975 pBt->nPage++; 4976 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; } 4977 } 4978 #endif 4979 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage); 4980 *pPgno = pBt->nPage; 4981 4982 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4983 rc = btreeGetPage(pBt, *pPgno, ppPage, 1); 4984 if( rc ) return rc; 4985 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4986 if( rc!=SQLITE_OK ){ 4987 releasePage(*ppPage); 4988 } 4989 TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); 4990 } 4991 4992 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4993 4994 end_allocate_page: 4995 releasePage(pTrunk); 4996 releasePage(pPrevTrunk); 4997 if( rc==SQLITE_OK ){ 4998 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ 4999 releasePage(*ppPage); 5000 return SQLITE_CORRUPT_BKPT; 5001 } 5002 (*ppPage)->isInit = 0; 5003 }else{ 5004 *ppPage = 0; 5005 } 5006 return rc; 5007 } 5008 5009 /* 5010 ** This function is used to add page iPage to the database file free-list. 5011 ** It is assumed that the page is not already a part of the free-list. 5012 ** 5013 ** The value passed as the second argument to this function is optional. 5014 ** If the caller happens to have a pointer to the MemPage object 5015 ** corresponding to page iPage handy, it may pass it as the second value. 5016 ** Otherwise, it may pass NULL. 5017 ** 5018 ** If a pointer to a MemPage object is passed as the second argument, 5019 ** its reference count is not altered by this function. 
5020 */ 5021 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ 5022 MemPage *pTrunk = 0; /* Free-list trunk page */ 5023 Pgno iTrunk = 0; /* Page number of free-list trunk page */ 5024 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ 5025 MemPage *pPage; /* Page being freed. May be NULL. */ 5026 int rc; /* Return Code */ 5027 int nFree; /* Initial number of pages on free-list */ 5028 5029 assert( sqlite3_mutex_held(pBt->mutex) ); 5030 assert( iPage>1 ); 5031 assert( !pMemPage || pMemPage->pgno==iPage ); 5032 5033 if( pMemPage ){ 5034 pPage = pMemPage; 5035 sqlite3PagerRef(pPage->pDbPage); 5036 }else{ 5037 pPage = btreePageLookup(pBt, iPage); 5038 } 5039 5040 /* Increment the free page count on pPage1 */ 5041 rc = sqlite3PagerWrite(pPage1->pDbPage); 5042 if( rc ) goto freepage_out; 5043 nFree = get4byte(&pPage1->aData[36]); 5044 put4byte(&pPage1->aData[36], nFree+1); 5045 5046 if( pBt->secureDelete ){ 5047 /* If the secure_delete option is enabled, then 5048 ** always fully overwrite deleted information with zeros. 5049 */ 5050 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) ) 5051 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0) 5052 ){ 5053 goto freepage_out; 5054 } 5055 memset(pPage->aData, 0, pPage->pBt->pageSize); 5056 } 5057 5058 /* If the database supports auto-vacuum, write an entry in the pointer-map 5059 ** to indicate that the page is free. 5060 */ 5061 if( ISAUTOVACUUM ){ 5062 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); 5063 if( rc ) goto freepage_out; 5064 } 5065 5066 /* Now manipulate the actual database free-list structure. There are two 5067 ** possibilities. If the free-list is currently empty, or if the first 5068 ** trunk page in the free-list is full, then this page will become a 5069 ** new free-list trunk page. Otherwise, it will become a leaf of the 5070 ** first trunk page in the current free-list. This block tests if it 5071 ** is possible to add the page as a new free-list leaf. 
5072 */ 5073 if( nFree!=0 ){ 5074 u32 nLeaf; /* Initial number of leaf cells on trunk page */ 5075 5076 iTrunk = get4byte(&pPage1->aData[32]); 5077 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 5078 if( rc!=SQLITE_OK ){ 5079 goto freepage_out; 5080 } 5081 5082 nLeaf = get4byte(&pTrunk->aData[4]); 5083 assert( pBt->usableSize>32 ); 5084 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){ 5085 rc = SQLITE_CORRUPT_BKPT; 5086 goto freepage_out; 5087 } 5088 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){ 5089 /* In this case there is room on the trunk page to insert the page 5090 ** being freed as a new leaf. 5091 ** 5092 ** Note that the trunk page is not really full until it contains 5093 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have 5094 ** coded. But due to a coding error in versions of SQLite prior to 5095 ** 3.6.0, databases with freelist trunk pages holding more than 5096 ** usableSize/4 - 8 entries will be reported as corrupt. In order 5097 ** to maintain backwards compatibility with older versions of SQLite, 5098 ** we will continue to restrict the number of entries to usableSize/4 - 8 5099 ** for now. At some point in the future (once everyone has upgraded 5100 ** to 3.6.0 or later) we should consider fixing the conditional above 5101 ** to read "usableSize/4-2" instead of "usableSize/4-8". 5102 */ 5103 rc = sqlite3PagerWrite(pTrunk->pDbPage); 5104 if( rc==SQLITE_OK ){ 5105 put4byte(&pTrunk->aData[4], nLeaf+1); 5106 put4byte(&pTrunk->aData[8+nLeaf*4], iPage); 5107 if( pPage && !pBt->secureDelete ){ 5108 sqlite3PagerDontWrite(pPage->pDbPage); 5109 } 5110 rc = btreeSetHasContent(pBt, iPage); 5111 } 5112 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); 5113 goto freepage_out; 5114 } 5115 } 5116 5117 /* If control flows to this point, then it was not possible to add the 5118 ** the page being freed as a leaf page of the first trunk in the free-list. 
5119 ** Possibly because the free-list is empty, or possibly because the 5120 ** first trunk in the free-list is full. Either way, the page being freed 5121 ** will become the new first trunk page in the free-list. 5122 */ 5123 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){ 5124 goto freepage_out; 5125 } 5126 rc = sqlite3PagerWrite(pPage->pDbPage); 5127 if( rc!=SQLITE_OK ){ 5128 goto freepage_out; 5129 } 5130 put4byte(pPage->aData, iTrunk); 5131 put4byte(&pPage->aData[4], 0); 5132 put4byte(&pPage1->aData[32], iPage); 5133 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk)); 5134 5135 freepage_out: 5136 if( pPage ){ 5137 pPage->isInit = 0; 5138 } 5139 releasePage(pPage); 5140 releasePage(pTrunk); 5141 return rc; 5142 } 5143 static void freePage(MemPage *pPage, int *pRC){ 5144 if( (*pRC)==SQLITE_OK ){ 5145 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno); 5146 } 5147 } 5148 5149 /* 5150 ** Free any overflow pages associated with the given Cell. 5151 */ 5152 static int clearCell(MemPage *pPage, unsigned char *pCell){ 5153 BtShared *pBt = pPage->pBt; 5154 CellInfo info; 5155 Pgno ovflPgno; 5156 int rc; 5157 int nOvfl; 5158 u32 ovflPageSize; 5159 5160 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5161 btreeParseCellPtr(pPage, pCell, &info); 5162 if( info.iOverflow==0 ){ 5163 return SQLITE_OK; /* No overflow pages. Return without doing anything */ 5164 } 5165 ovflPgno = get4byte(&pCell[info.iOverflow]); 5166 assert( pBt->usableSize > 4 ); 5167 ovflPageSize = pBt->usableSize - 4; 5168 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; 5169 assert( ovflPgno==0 || nOvfl>0 ); 5170 while( nOvfl-- ){ 5171 Pgno iNext = 0; 5172 MemPage *pOvfl = 0; 5173 if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){ 5174 /* 0 is not a legal page number and page 1 cannot be an 5175 ** overflow page. Therefore if ovflPgno<2 or past the end of the 5176 ** file the database must be corrupt. 
*/ 5177 return SQLITE_CORRUPT_BKPT; 5178 } 5179 if( nOvfl ){ 5180 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); 5181 if( rc ) return rc; 5182 } 5183 5184 if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) ) 5185 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1 5186 ){ 5187 /* There is no reason any cursor should have an outstanding reference 5188 ** to an overflow page belonging to a cell that is being deleted/updated. 5189 ** So if there exists more than one reference to this page, then it 5190 ** must not really be an overflow page and the database must be corrupt. 5191 ** It is helpful to detect this before calling freePage2(), as 5192 ** freePage2() may zero the page contents if secure-delete mode is 5193 ** enabled. If this 'overflow' page happens to be a page that the 5194 ** caller is iterating through or using in some other way, this 5195 ** can be problematic. 5196 */ 5197 rc = SQLITE_CORRUPT_BKPT; 5198 }else{ 5199 rc = freePage2(pBt, pOvfl, ovflPgno); 5200 } 5201 5202 if( pOvfl ){ 5203 sqlite3PagerUnref(pOvfl->pDbPage); 5204 } 5205 if( rc ) return rc; 5206 ovflPgno = iNext; 5207 } 5208 return SQLITE_OK; 5209 } 5210 5211 /* 5212 ** Create the byte sequence used to represent a cell on page pPage 5213 ** and write that byte sequence into pCell[]. Overflow pages are 5214 ** allocated and filled in as necessary. The calling procedure 5215 ** is responsible for making sure sufficient space has been allocated 5216 ** for pCell[]. 5217 ** 5218 ** Note that pCell does not necessary need to point to the pPage->aData 5219 ** area. pCell might point to some temporary storage. The cell will 5220 ** be constructed in this temporary area then copied into pPage->aData 5221 ** later. 
*/
static int fillInCell(
  MemPage *pPage,                /* The page that contains the cell */
  unsigned char *pCell,          /* Complete text of the cell */
  const void *pKey, i64 nKey,    /* The key */
  const void *pData,int nData,   /* The data */
  int nZero,                     /* Extra zero bytes to append to pData */
  int *pnSize                    /* Write cell size here */
){
  int nPayload;                  /* Payload bytes still to be written */
  const u8 *pSrc;                /* Next byte of key/data to copy */
  int nSrc, n, rc;               /* Bytes left at pSrc; chunk size; result */
  int spaceLeft;                 /* Bytes left in the current destination */
  MemPage *pOvfl = 0;            /* Most recently allocated overflow page */
  MemPage *pToRelease = 0;       /* Overflow page currently being filled */
  unsigned char *pPrior;         /* Where to write the next overflow pgno */
  unsigned char *pPayload;       /* Where to write the next payload byte */
  BtShared *pBt = pPage->pBt;
  Pgno pgnoOvfl = 0;             /* Page number of pOvfl */
  int nHeader;                   /* Size of the cell header in bytes */
  CellInfo info;

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  /* pPage is not necessarily writeable since pCell might be auxiliary
  ** buffer space that is separate from the pPage buffer area */
  assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

  /* Fill in the header.  On a non-leaf page the first 4 bytes are
  ** skipped here; they are not written by this function. */
  nHeader = 0;
  if( !pPage->leaf ){
    nHeader += 4;
  }
  if( pPage->hasData ){
    nHeader += putVarint(&pCell[nHeader], nData+nZero);
  }else{
    nData = nZero = 0;
  }
  nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
  btreeParseCellPtr(pPage, pCell, &info);
  assert( info.nHeader==nHeader );
  assert( info.nKey==nKey );
  assert( info.nData==(u32)(nData+nZero) );

  /* Fill in the payload.  For intKey pages only the data is stored as
  ** payload; otherwise the key bytes come first, followed by the data. */
  nPayload = nData + nZero;
  if( pPage->intKey ){
    pSrc = pData;
    nSrc = nData;
    nData = 0;
  }else{
    if( NEVER(nKey>0x7fffffff || pKey==0) ){
      return SQLITE_CORRUPT_BKPT;
    }
    nPayload += (int)nKey;
    pSrc = pKey;
    nSrc = (int)nKey;
  }
  *pnSize = info.nSize;
  spaceLeft = info.nLocal;
  pPayload = &pCell[nHeader];
  pPrior = &pCell[info.iOverflow];

  while( nPayload>0 ){
    if( spaceLeft==0 ){
      /* The current destination is full: allocate another overflow
      ** page and link it from pPrior. */
#ifndef SQLITE_OMIT_AUTOVACUUM
      Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
      if( pBt->autoVacuum ){
        /* Advance the allocation hint past pointer-map pages and the
        ** pending-byte page. */
        do{
          pgnoOvfl++;
        } while(
          PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
        );
      }
#endif
      rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the database supports auto-vacuum, and the second or subsequent
      ** overflow page is being allocated, add an entry to the pointer-map
      ** for that page now.
      **
      ** If this is the first overflow page, then write a partial entry
      ** to the pointer-map. If we write nothing to this pointer-map slot,
      ** then the optimistic overflow chain processing in clearCell()
      ** may misinterpret the uninitialised values and delete the
      ** wrong pages from the database.
      */
      if( pBt->autoVacuum && rc==SQLITE_OK ){
        u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
        ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
        if( rc ){
          releasePage(pOvfl);
        }
      }
#endif
      if( rc ){
        releasePage(pToRelease);
        return rc;
      }

      /* If pToRelease is not zero than pPrior points into the data area
      ** of pToRelease.  Make sure pToRelease is still writeable. */
      assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

      /* If pPrior is part of the data area of pPage, then make sure pPage
      ** is still writeable */
      assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

      /* Link the new page into the chain, then make it the current
      ** destination.  Its own next-page pointer starts out as 0. */
      put4byte(pPrior, pgnoOvfl);
      releasePage(pToRelease);
      pToRelease = pOvfl;
      pPrior = pOvfl->aData;
      put4byte(pPrior, 0);
      pPayload = &pOvfl->aData[4];
      spaceLeft = pBt->usableSize - 4;
    }
    n = nPayload;
    if( n>spaceLeft ) n = spaceLeft;

    /* If pToRelease is not zero than pPayload points into the data area
    ** of pToRelease.  Make sure pToRelease is still writeable. */
    assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

    /* If pPayload is part of the data area of pPage, then make sure pPage
    ** is still writeable */
    assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

    if( nSrc>0 ){
      if( n>nSrc ) n = nSrc;
      assert( pSrc );
      memcpy(pPayload, pSrc, n);
    }else{
      /* Source exhausted: the rest of the payload is zero padding */
      memset(pPayload, 0, n);
    }
    nPayload -= n;
    pPayload += n;
    pSrc += n;
    nSrc -= n;
    spaceLeft -= n;
    if( nSrc==0 ){
      /* Switch from the key to the data.  nData was zeroed above for
      ** intKey pages, so there this leaves nSrc at 0. */
      nSrc = nData;
      pSrc = pData;
    }
  }
  releasePage(pToRelease);
  return SQLITE_OK;
}

/*
** Remove the i-th cell from pPage.  This routine effects pPage only.
5375 ** The cell content is not freed or deallocated. It is assumed that 5376 ** the cell content has been copied someplace else. This routine just 5377 ** removes the reference to the cell from pPage. 5378 ** 5379 ** "sz" must be the number of bytes in the cell. 5380 */ 5381 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ 5382 int i; /* Loop counter */ 5383 u32 pc; /* Offset to cell content of cell being deleted */ 5384 u8 *data; /* pPage->aData */ 5385 u8 *ptr; /* Used to move bytes around within data[] */ 5386 int rc; /* The return code */ 5387 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */ 5388 5389 if( *pRC ) return; 5390 5391 assert( idx>=0 && idx<pPage->nCell ); 5392 assert( sz==cellSize(pPage, idx) ); 5393 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5394 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5395 data = pPage->aData; 5396 ptr = &data[pPage->cellOffset + 2*idx]; 5397 pc = get2byte(ptr); 5398 hdr = pPage->hdrOffset; 5399 testcase( pc==get2byte(&data[hdr+5]) ); 5400 testcase( pc+sz==pPage->pBt->usableSize ); 5401 if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){ 5402 *pRC = SQLITE_CORRUPT_BKPT; 5403 return; 5404 } 5405 rc = freeSpace(pPage, pc, sz); 5406 if( rc ){ 5407 *pRC = rc; 5408 return; 5409 } 5410 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){ 5411 ptr[0] = ptr[2]; 5412 ptr[1] = ptr[3]; 5413 } 5414 pPage->nCell--; 5415 put2byte(&data[hdr+3], pPage->nCell); 5416 pPage->nFree += 2; 5417 } 5418 5419 /* 5420 ** Insert a new cell on pPage at cell index "i". pCell points to the 5421 ** content of the cell. 5422 ** 5423 ** If the cell content will fit on the page, then put it there. If it 5424 ** will not fit, then make a copy of the cell content into pTemp if 5425 ** pTemp is not null. Regardless of pTemp, allocate a new entry 5426 ** in pPage->aOvfl[] and make it point to the cell content (either 5427 ** in pTemp or the original pCell) and also record its index. 
** Allocating a new entry in pPage->aOvfl[] implies that
** pPage->nOverflow is incremented.
**
** If nSkip is non-zero, then do not copy the first nSkip bytes of the
** cell. The caller will overwrite them after this function returns. If
** nSkip is non-zero, then pCell may not point to an invalid memory location
** (but pCell+nSkip is always valid).
**
** nSkip is not a parameter: it is 4 when iChild is non-zero and 0
** otherwise, and in the non-zero case the first 4 bytes of the stored
** cell are set to iChild.
**
** If *pRC is already non-zero on entry this routine does nothing.
** On failure an error code is written to *pRC.
*/
static void insertCell(
  MemPage *pPage,   /* Page into which we are copying */
  int i,            /* New cell becomes the i-th cell of the page */
  u8 *pCell,        /* Content of the new cell */
  int sz,           /* Bytes of content in pCell */
  u8 *pTemp,        /* Temp storage space for pCell, if needed */
  Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
  int *pRC          /* Read and write return code from here */
){
  int idx = 0;      /* Where to write new cell content in data[] */
  int j;            /* Loop counter */
  int end;          /* First byte past the last cell pointer in data[] */
  int ins;          /* Index in data[] where new cell pointer is inserted */
  int cellOffset;   /* Address of first cell pointer in data[] */
  u8 *data;         /* The content of the whole page */
  u8 *ptr;          /* Used for moving information around in data[] */

  int nSkip = (iChild ? 4 : 0);

  if( *pRC ) return;

  assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
  assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 );
  assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  /* The cell should normally be sized correctly.  However, when moving a
  ** malformed cell from a leaf page to an interior page, if the cell size
  ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
  ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
  ** the term after the || in the following assert(). */
  assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) );
  if( pPage->nOverflow || sz+2>pPage->nFree ){
    /* The cell does not fit (or the page already has overflow cells):
    ** record it in aOvfl[] instead of writing it into the page. */
    if( pTemp ){
      memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
      pCell = pTemp;
    }
    if( iChild ){
      put4byte(pCell, iChild);
    }
    j = pPage->nOverflow++;
    assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) );
    pPage->aOvfl[j].pCell = pCell;
    pPage->aOvfl[j].idx = (u16)i;
  }else{
    int rc = sqlite3PagerWrite(pPage->pDbPage);
    if( rc!=SQLITE_OK ){
      *pRC = rc;
      return;
    }
    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
    data = pPage->aData;
    cellOffset = pPage->cellOffset;
    end = cellOffset + 2*pPage->nCell;
    ins = cellOffset + 2*i;
    rc = allocateSpace(pPage, sz, &idx);
    if( rc ){ *pRC = rc; return; }
    /* The allocateSpace() routine guarantees the following two properties
    ** if it returns success */
    assert( idx >= end+2 );
    assert( idx+sz <= pPage->pBt->usableSize );
    pPage->nCell++;
    /* Account for the cell body plus its 2-byte cell pointer */
    pPage->nFree -= (u16)(2 + sz);
    memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
    if( iChild ){
      put4byte(&data[idx], iChild);
    }
    /* Shift cell pointers i..nCell-1 up one slot to open a gap at "ins" */
    for(j=end, ptr=&data[j]; j>ins; j-=2, ptr-=2){
      ptr[0] = ptr[-2];
      ptr[1] = ptr[-1];
    }
    put2byte(&data[ins], idx);
    put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pPage->pBt->autoVacuum ){
      /* The cell may contain a pointer to an overflow page. If so, write
      ** the entry for the overflow page into the pointer map.
      */
      ptrmapPutOvflPtr(pPage, pCell, pRC);
    }
#endif
  }
}

/*
** Add a list of cells to a page.  The page should be initially empty.
** The cells are guaranteed to fit on the page.
5522 */ 5523 static void assemblePage( 5524 MemPage *pPage, /* The page to be assemblied */ 5525 int nCell, /* The number of cells to add to this page */ 5526 u8 **apCell, /* Pointers to cell bodies */ 5527 u16 *aSize /* Sizes of the cells */ 5528 ){ 5529 int i; /* Loop counter */ 5530 u8 *pCellptr; /* Address of next cell pointer */ 5531 int cellbody; /* Address of next cell body */ 5532 u8 * const data = pPage->aData; /* Pointer to data for pPage */ 5533 const int hdr = pPage->hdrOffset; /* Offset of header on pPage */ 5534 const int nUsable = pPage->pBt->usableSize; /* Usable size of page */ 5535 5536 assert( pPage->nOverflow==0 ); 5537 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5538 assert( nCell>=0 && nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921); 5539 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5540 5541 /* Check that the page has just been zeroed by zeroPage() */ 5542 assert( pPage->nCell==0 ); 5543 assert( get2byteNotZero(&data[hdr+5])==nUsable ); 5544 5545 pCellptr = &data[pPage->cellOffset + nCell*2]; 5546 cellbody = nUsable; 5547 for(i=nCell-1; i>=0; i--){ 5548 pCellptr -= 2; 5549 cellbody -= aSize[i]; 5550 put2byte(pCellptr, cellbody); 5551 memcpy(&data[cellbody], apCell[i], aSize[i]); 5552 } 5553 put2byte(&data[hdr+3], nCell); 5554 put2byte(&data[hdr+5], cellbody); 5555 pPage->nFree -= (nCell*2 + nUsable - cellbody); 5556 pPage->nCell = (u16)nCell; 5557 } 5558 5559 /* 5560 ** The following parameters determine how many adjacent pages get involved 5561 ** in a balancing operation. NN is the number of neighbors on either side 5562 ** of the page that participate in the balancing operation. NB is the 5563 ** total number of pages that participate, including the target page and 5564 ** NN neighbors on either side. 5565 ** 5566 ** The minimum value of NN is 1 (of course). 
** Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The current value of NN (1) appears to give the best results overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (NN*2+1)      /* Total pages involved in the balance */


#ifndef SQLITE_OMIT_QUICKBALANCE
/*
** This version of balance() handles the common special case where
** a new entry is being inserted on the extreme right-end of the
** tree, in other words, when the new entry will become the largest
** entry in the tree.
**
** Instead of trying to balance the 3 right-most leaf pages, just add
** a new page to the right-hand side and put the one new entry in
** that page.  This leaves the right side of the tree somewhat
** unbalanced.  But odds are that we will be inserting new entries
** at the end soon afterwards so the nearly empty page will quickly
** fill up.  On average.
**
** pPage is the leaf page which is the right-most page in the tree.
** pParent is its parent.  pPage must have a single overflow entry
** which is also the right-most entry on the page.
**
** The pSpace buffer is used to store a temporary copy of the divider
** cell that will be inserted into pParent. Such a cell consists of a 4
** byte page number followed by a variable length integer. In other
** words, at most 13 bytes. Hence the pSpace buffer must be at
** least 13 bytes in size.
*/
static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
  BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
  MemPage *pNew;                       /* Newly allocated page */
  int rc;                              /* Return Code */
  Pgno pgnoNew;                        /* Page number of pNew */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  assert( pPage->nOverflow==1 );

  /* This error condition is now caught prior to reaching this function */
  if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT;

  /* Allocate a new page. This page will become the right-sibling of
  ** pPage. Make the parent page writable, so that the new divider cell
  ** may be inserted. If both these operations are successful, proceed.
  */
  rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);

  if( rc==SQLITE_OK ){

    u8 *pOut = &pSpace[4];                  /* Divider key goes after pgno */
    u8 *pCell = pPage->aOvfl[0].pCell;      /* The single overflow cell */
    u16 szCell = cellSizePtr(pPage, pCell); /* Size of that cell in bytes */
    u8 *pStop;                              /* Upper bound for varint scans */

    /* Initialize the new page as an intkey leaf holding only the
    ** overflow cell. */
    assert( sqlite3PagerIswriteable(pNew->pDbPage) );
    assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
    zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
    assemblePage(pNew, 1, &pCell, &szCell);

    /* If this is an auto-vacuum database, update the pointer map
    ** with entries for the new page, and any pointer from the
    ** cell on the page to an overflow page. If either of these
    ** operations fails, the return code is set, but the contents
    ** of the parent page are still manipulated by the code below.
    ** That is Ok, at this point the parent page is guaranteed to
    ** be marked as dirty. Returning an error code will cause a
    ** rollback, undoing any changes made to the parent page.
    */
    if( ISAUTOVACUUM ){
      ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
      if( szCell>pNew->minLocal ){
        ptrmapPutOvflPtr(pNew, pCell, &rc);
      }
    }

    /* Create a divider cell to insert into pParent. The divider cell
    ** consists of a 4-byte page number (the page number of pPage) and
    ** a variable length key value (which must be the same value as the
    ** largest key on pPage).
    **
    ** To find the largest key value on pPage, first find the right-most
    ** cell on pPage. The first two fields of this cell are the
    ** record-length (a variable length integer at most 32-bits in size)
    ** and the key value (a variable length integer, may have any value).
    ** The first of the while(...) loops below skips over the record-length
    ** field. The second while(...) loop copies the key value from the
    ** cell on pPage into the pSpace buffer.
    **
    ** Each loop stops after a byte whose 0x80 bit is clear, or after at
    ** most 9 bytes (the pStop bound).
    */
    pCell = findCell(pPage, pPage->nCell-1);
    pStop = &pCell[9];
    while( (*(pCell++)&0x80) && pCell<pStop );
    pStop = &pCell[9];
    while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );

    /* Insert the new divider cell into pParent.  Passing pPage->pgno as
    ** iChild makes insertCell() fill in the first 4 bytes of the cell. */
    insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
               0, pPage->pgno, &rc);

    /* Set the right-child pointer of pParent to point to the new page. */
    put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);

    /* Release the reference to the new page. */
    releasePage(pNew);
  }

  return rc;
}
#endif /* SQLITE_OMIT_QUICKBALANCE */

#if 0
/*
** This function does not contribute anything to the operation of SQLite.
** it is sometimes activated temporarily while debugging code responsible
** for setting pointer-map entries.
5685 */ 5686 static int ptrmapCheckPages(MemPage **apPage, int nPage){ 5687 int i, j; 5688 for(i=0; i<nPage; i++){ 5689 Pgno n; 5690 u8 e; 5691 MemPage *pPage = apPage[i]; 5692 BtShared *pBt = pPage->pBt; 5693 assert( pPage->isInit ); 5694 5695 for(j=0; j<pPage->nCell; j++){ 5696 CellInfo info; 5697 u8 *z; 5698 5699 z = findCell(pPage, j); 5700 btreeParseCellPtr(pPage, z, &info); 5701 if( info.iOverflow ){ 5702 Pgno ovfl = get4byte(&z[info.iOverflow]); 5703 ptrmapGet(pBt, ovfl, &e, &n); 5704 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); 5705 } 5706 if( !pPage->leaf ){ 5707 Pgno child = get4byte(z); 5708 ptrmapGet(pBt, child, &e, &n); 5709 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 5710 } 5711 } 5712 if( !pPage->leaf ){ 5713 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); 5714 ptrmapGet(pBt, child, &e, &n); 5715 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 5716 } 5717 } 5718 return 1; 5719 } 5720 #endif 5721 5722 /* 5723 ** This function is used to copy the contents of the b-tree node stored 5724 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then 5725 ** the pointer-map entries for each child page are updated so that the 5726 ** parent page stored in the pointer map is page pTo. If pFrom contained 5727 ** any cells with overflow page pointers, then the corresponding pointer 5728 ** map entries are also updated so that the parent page is page pTo. 5729 ** 5730 ** If pFrom is currently carrying any overflow cells (entries in the 5731 ** MemPage.aOvfl[] array), they are not copied to pTo. 5732 ** 5733 ** Before returning, page pTo is reinitialized using btreeInitPage(). 5734 ** 5735 ** The performance of this function is not critical. It is only used by 5736 ** the balance_shallower() and balance_deeper() procedures, neither of 5737 ** which are called often under normal circumstances. 
5738 */ 5739 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ 5740 if( (*pRC)==SQLITE_OK ){ 5741 BtShared * const pBt = pFrom->pBt; 5742 u8 * const aFrom = pFrom->aData; 5743 u8 * const aTo = pTo->aData; 5744 int const iFromHdr = pFrom->hdrOffset; 5745 int const iToHdr = ((pTo->pgno==1) ? 100 : 0); 5746 int rc; 5747 int iData; 5748 5749 5750 assert( pFrom->isInit ); 5751 assert( pFrom->nFree>=iToHdr ); 5752 assert( get2byte(&aFrom[iFromHdr+5])<=pBt->usableSize ); 5753 5754 /* Copy the b-tree node content from page pFrom to page pTo. */ 5755 iData = get2byte(&aFrom[iFromHdr+5]); 5756 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); 5757 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); 5758 5759 /* Reinitialize page pTo so that the contents of the MemPage structure 5760 ** match the new data. The initialization of pTo can actually fail under 5761 ** fairly obscure circumstances, even though it is a copy of initialized 5762 ** page pFrom. 5763 */ 5764 pTo->isInit = 0; 5765 rc = btreeInitPage(pTo); 5766 if( rc!=SQLITE_OK ){ 5767 *pRC = rc; 5768 return; 5769 } 5770 5771 /* If this is an auto-vacuum database, update the pointer-map entries 5772 ** for any b-tree or overflow pages that pTo now contains the pointers to. 5773 */ 5774 if( ISAUTOVACUUM ){ 5775 *pRC = setChildPtrmaps(pTo); 5776 } 5777 } 5778 } 5779 5780 /* 5781 ** This routine redistributes cells on the iParentIdx'th child of pParent 5782 ** (hereafter "the page") and up to 2 siblings so that all pages have about the 5783 ** same amount of free space. Usually a single sibling on either side of the 5784 ** page are used in the balancing, though both siblings might come from one 5785 ** side if the page is the first or last child of its parent. If the page 5786 ** has fewer than 2 siblings (something which can only happen if the page 5787 ** is a root page or a child of a root page) then all available siblings 5788 ** participate in the balancing. 
5789 ** 5790 ** The number of siblings of the page might be increased or decreased by 5791 ** one or two in an effort to keep pages nearly full but not over full. 5792 ** 5793 ** Note that when this routine is called, some of the cells on the page 5794 ** might not actually be stored in MemPage.aData[]. This can happen 5795 ** if the page is overfull. This routine ensures that all cells allocated 5796 ** to the page and its siblings fit into MemPage.aData[] before returning. 5797 ** 5798 ** In the course of balancing the page and its siblings, cells may be 5799 ** inserted into or removed from the parent page (pParent). Doing so 5800 ** may cause the parent page to become overfull or underfull. If this 5801 ** happens, it is the responsibility of the caller to invoke the correct 5802 ** balancing routine to fix this problem (see the balance() routine). 5803 ** 5804 ** If this routine fails for any reason, it might leave the database 5805 ** in a corrupted state. So if this routine fails, the database should 5806 ** be rolled back. 5807 ** 5808 ** The third argument to this function, aOvflSpace, is a pointer to a 5809 ** buffer big enough to hold one page. If while inserting cells into the parent 5810 ** page (pParent) the parent page becomes overfull, this buffer is 5811 ** used to store the parent's overflow cells. Because this function inserts 5812 ** a maximum of four divider cells into the parent page, and the maximum 5813 ** size of a cell stored within an internal node is always less than 1/4 5814 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large 5815 ** enough for all overflow cells. 5816 ** 5817 ** If aOvflSpace is set to a null pointer, this function returns 5818 ** SQLITE_NOMEM. 
5819 */ 5820 static int balance_nonroot( 5821 MemPage *pParent, /* Parent page of siblings being balanced */ 5822 int iParentIdx, /* Index of "the page" in pParent */ 5823 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ 5824 int isRoot /* True if pParent is a root-page */ 5825 ){ 5826 BtShared *pBt; /* The whole database */ 5827 int nCell = 0; /* Number of cells in apCell[] */ 5828 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 5829 int nNew = 0; /* Number of pages in apNew[] */ 5830 int nOld; /* Number of pages in apOld[] */ 5831 int i, j, k; /* Loop counters */ 5832 int nxDiv; /* Next divider slot in pParent->aCell[] */ 5833 int rc = SQLITE_OK; /* The return code */ 5834 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */ 5835 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 5836 int usableSpace; /* Bytes in pPage beyond the header */ 5837 int pageFlags; /* Value of pPage->aData[0] */ 5838 int subtotal; /* Subtotal of bytes in cells on one page */ 5839 int iSpace1 = 0; /* First unused byte of aSpace1[] */ 5840 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ 5841 int szScratch; /* Size of scratch memory requested */ 5842 MemPage *apOld[NB]; /* pPage and up to two siblings */ 5843 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ 5844 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 5845 u8 *pRight; /* Location in parent of right-sibling pointer */ 5846 u8 *apDiv[NB-1]; /* Divider cells in pParent */ 5847 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ 5848 int szNew[NB+2]; /* Combined size of cells place on i-th page */ 5849 u8 **apCell = 0; /* All cells begin balanced */ 5850 u16 *szCell; /* Local size of all cells in apCell[] */ 5851 u8 *aSpace1; /* Space for copies of dividers cells */ 5852 Pgno pgno; /* Temp var to store a page number in */ 5853 5854 pBt = pParent->pBt; 5855 assert( sqlite3_mutex_held(pBt->mutex) ); 5856 assert( 
sqlite3PagerIswriteable(pParent->pDbPage) ); 5857 5858 #if 0 5859 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); 5860 #endif 5861 5862 /* At this point pParent may have at most one overflow cell. And if 5863 ** this overflow cell is present, it must be the cell with 5864 ** index iParentIdx. This scenario comes about when this function 5865 ** is called (indirectly) from sqlite3BtreeDelete(). 5866 */ 5867 assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); 5868 assert( pParent->nOverflow==0 || pParent->aOvfl[0].idx==iParentIdx ); 5869 5870 if( !aOvflSpace ){ 5871 return SQLITE_NOMEM; 5872 } 5873 5874 /* Find the sibling pages to balance. Also locate the cells in pParent 5875 ** that divide the siblings. An attempt is made to find NN siblings on 5876 ** either side of pPage. More siblings are taken from one side, however, 5877 ** if there are fewer than NN siblings on the other side. If pParent 5878 ** has NB or fewer children then all children of pParent are taken. 5879 ** 5880 ** This loop also drops the divider cells from the parent page. This 5881 ** way, the remainder of the function does not have to deal with any 5882 ** overflow cells in the parent page, since if any existed they will 5883 ** have already been removed. 
5884 */ 5885 i = pParent->nOverflow + pParent->nCell; 5886 if( i<2 ){ 5887 nxDiv = 0; 5888 nOld = i+1; 5889 }else{ 5890 nOld = 3; 5891 if( iParentIdx==0 ){ 5892 nxDiv = 0; 5893 }else if( iParentIdx==i ){ 5894 nxDiv = i-2; 5895 }else{ 5896 nxDiv = iParentIdx-1; 5897 } 5898 i = 2; 5899 } 5900 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ 5901 pRight = &pParent->aData[pParent->hdrOffset+8]; 5902 }else{ 5903 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); 5904 } 5905 pgno = get4byte(pRight); 5906 while( 1 ){ 5907 rc = getAndInitPage(pBt, pgno, &apOld[i]); 5908 if( rc ){ 5909 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 5910 goto balance_cleanup; 5911 } 5912 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; 5913 if( (i--)==0 ) break; 5914 5915 if( i+nxDiv==pParent->aOvfl[0].idx && pParent->nOverflow ){ 5916 apDiv[i] = pParent->aOvfl[0].pCell; 5917 pgno = get4byte(apDiv[i]); 5918 szNew[i] = cellSizePtr(pParent, apDiv[i]); 5919 pParent->nOverflow = 0; 5920 }else{ 5921 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); 5922 pgno = get4byte(apDiv[i]); 5923 szNew[i] = cellSizePtr(pParent, apDiv[i]); 5924 5925 /* Drop the cell from the parent page. apDiv[i] still points to 5926 ** the cell within the parent, even though it has been dropped. 5927 ** This is safe because dropping a cell only overwrites the first 5928 ** four bytes of it, and this function does not need the first 5929 ** four bytes of the divider cell. So the pointer is safe to use 5930 ** later on. 5931 ** 5932 ** Unless SQLite is compiled in secure-delete mode. In this case, 5933 ** the dropCell() routine will overwrite the entire cell with zeroes. 5934 ** In this case, temporarily copy the cell into the aOvflSpace[] 5935 ** buffer. It will be copied out again as soon as the aSpace[] buffer 5936 ** is allocated. 
*/ 5937 if( pBt->secureDelete ){ 5938 int iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData); 5939 if( (iOff+szNew[i])>(int)pBt->usableSize ){ 5940 rc = SQLITE_CORRUPT_BKPT; 5941 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 5942 goto balance_cleanup; 5943 }else{ 5944 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]); 5945 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; 5946 } 5947 } 5948 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc); 5949 } 5950 } 5951 5952 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte 5953 ** alignment */ 5954 nMaxCells = (nMaxCells + 3)&~3; 5955 5956 /* 5957 ** Allocate space for memory structures 5958 */ 5959 k = pBt->pageSize + ROUND8(sizeof(MemPage)); 5960 szScratch = 5961 nMaxCells*sizeof(u8*) /* apCell */ 5962 + nMaxCells*sizeof(u16) /* szCell */ 5963 + pBt->pageSize /* aSpace1 */ 5964 + k*nOld; /* Page copies (apCopy) */ 5965 apCell = sqlite3ScratchMalloc( szScratch ); 5966 if( apCell==0 ){ 5967 rc = SQLITE_NOMEM; 5968 goto balance_cleanup; 5969 } 5970 szCell = (u16*)&apCell[nMaxCells]; 5971 aSpace1 = (u8*)&szCell[nMaxCells]; 5972 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); 5973 5974 /* 5975 ** Load pointers to all cells on sibling pages and the divider cells 5976 ** into the local apCell[] array. Make copies of the divider cells 5977 ** into space obtained from aSpace1[] and remove the the divider Cells 5978 ** from pParent. 5979 ** 5980 ** If the siblings are on leaf pages, then the child pointers of the 5981 ** divider cells are stripped from the cells before they are copied 5982 ** into aSpace1[]. In this way, all cells in apCell[] are without 5983 ** child pointers. If siblings are not leaves, then all cell in 5984 ** apCell[] include child pointers. Either way, all cells in apCell[] 5985 ** are alike. 5986 ** 5987 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 5988 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 
5989 */ 5990 leafCorrection = apOld[0]->leaf*4; 5991 leafData = apOld[0]->hasData; 5992 for(i=0; i<nOld; i++){ 5993 int limit; 5994 5995 /* Before doing anything else, take a copy of the i'th original sibling 5996 ** The rest of this function will use data from the copies rather 5997 ** that the original pages since the original pages will be in the 5998 ** process of being overwritten. */ 5999 MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i]; 6000 memcpy(pOld, apOld[i], sizeof(MemPage)); 6001 pOld->aData = (void*)&pOld[1]; 6002 memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize); 6003 6004 limit = pOld->nCell+pOld->nOverflow; 6005 for(j=0; j<limit; j++){ 6006 assert( nCell<nMaxCells ); 6007 apCell[nCell] = findOverflowCell(pOld, j); 6008 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); 6009 nCell++; 6010 } 6011 if( i<nOld-1 && !leafData){ 6012 u16 sz = (u16)szNew[i]; 6013 u8 *pTemp; 6014 assert( nCell<nMaxCells ); 6015 szCell[nCell] = sz; 6016 pTemp = &aSpace1[iSpace1]; 6017 iSpace1 += sz; 6018 assert( sz<=pBt->maxLocal+23 ); 6019 assert( iSpace1<=pBt->pageSize ); 6020 memcpy(pTemp, apDiv[i], sz); 6021 apCell[nCell] = pTemp+leafCorrection; 6022 assert( leafCorrection==0 || leafCorrection==4 ); 6023 szCell[nCell] = szCell[nCell] - leafCorrection; 6024 if( !pOld->leaf ){ 6025 assert( leafCorrection==0 ); 6026 assert( pOld->hdrOffset==0 ); 6027 /* The right pointer of the child page pOld becomes the left 6028 ** pointer of the divider cell */ 6029 memcpy(apCell[nCell], &pOld->aData[8], 4); 6030 }else{ 6031 assert( leafCorrection==4 ); 6032 if( szCell[nCell]<4 ){ 6033 /* Do not allow any cells smaller than 4 bytes. */ 6034 szCell[nCell] = 4; 6035 } 6036 } 6037 nCell++; 6038 } 6039 } 6040 6041 /* 6042 ** Figure out the number of pages needed to hold all nCell cells. 6043 ** Store this number in "k". 
Also compute szNew[] which is the total 6044 ** size of all cells on the i-th page and cntNew[] which is the index 6045 ** in apCell[] of the cell that divides page i from page i+1. 6046 ** cntNew[k] should equal nCell. 6047 ** 6048 ** Values computed by this block: 6049 ** 6050 ** k: The total number of sibling pages 6051 ** szNew[i]: Spaced used on the i-th sibling page. 6052 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to 6053 ** the right of the i-th sibling page. 6054 ** usableSpace: Number of bytes of space available on each sibling. 6055 ** 6056 */ 6057 usableSpace = pBt->usableSize - 12 + leafCorrection; 6058 for(subtotal=k=i=0; i<nCell; i++){ 6059 assert( i<nMaxCells ); 6060 subtotal += szCell[i] + 2; 6061 if( subtotal > usableSpace ){ 6062 szNew[k] = subtotal - szCell[i]; 6063 cntNew[k] = i; 6064 if( leafData ){ i--; } 6065 subtotal = 0; 6066 k++; 6067 if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } 6068 } 6069 } 6070 szNew[k] = subtotal; 6071 cntNew[k] = nCell; 6072 k++; 6073 6074 /* 6075 ** The packing computed by the previous block is biased toward the siblings 6076 ** on the left side. The left siblings are always nearly full, while the 6077 ** right-most sibling might be nearly empty. This block of code attempts 6078 ** to adjust the packing of siblings to get a better balance. 6079 ** 6080 ** This adjustment is more than an optimization. The packing above might 6081 ** be so out of balance as to be illegal. For example, the right-most 6082 ** sibling might be completely empty. This adjustment is not optional. 
6083 */ 6084 for(i=k-1; i>0; i--){ 6085 int szRight = szNew[i]; /* Size of sibling on the right */ 6086 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 6087 int r; /* Index of right-most cell in left sibling */ 6088 int d; /* Index of first cell to the left of right sibling */ 6089 6090 r = cntNew[i-1] - 1; 6091 d = r + 1 - leafData; 6092 assert( d<nMaxCells ); 6093 assert( r<nMaxCells ); 6094 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){ 6095 szRight += szCell[d] + 2; 6096 szLeft -= szCell[r] + 2; 6097 cntNew[i-1]--; 6098 r = cntNew[i-1] - 1; 6099 d = r + 1 - leafData; 6100 } 6101 szNew[i] = szRight; 6102 szNew[i-1] = szLeft; 6103 } 6104 6105 /* Either we found one or more cells (cntnew[0])>0) or pPage is 6106 ** a virtual root page. A virtual root page is when the real root 6107 ** page is page 1 and we are the only child of that page. 6108 */ 6109 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); 6110 6111 TRACE(("BALANCE: old: %d %d %d ", 6112 apOld[0]->pgno, 6113 nOld>=2 ? apOld[1]->pgno : 0, 6114 nOld>=3 ? apOld[2]->pgno : 0 6115 )); 6116 6117 /* 6118 ** Allocate k new pages. Reuse old pages where possible. 6119 */ 6120 if( apOld[0]->pgno<=1 ){ 6121 rc = SQLITE_CORRUPT_BKPT; 6122 goto balance_cleanup; 6123 } 6124 pageFlags = apOld[0]->aData[0]; 6125 for(i=0; i<k; i++){ 6126 MemPage *pNew; 6127 if( i<nOld ){ 6128 pNew = apNew[i] = apOld[i]; 6129 apOld[i] = 0; 6130 rc = sqlite3PagerWrite(pNew->pDbPage); 6131 nNew++; 6132 if( rc ) goto balance_cleanup; 6133 }else{ 6134 assert( i>0 ); 6135 rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0); 6136 if( rc ) goto balance_cleanup; 6137 apNew[i] = pNew; 6138 nNew++; 6139 6140 /* Set the pointer-map entry for the new sibling page. */ 6141 if( ISAUTOVACUUM ){ 6142 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); 6143 if( rc!=SQLITE_OK ){ 6144 goto balance_cleanup; 6145 } 6146 } 6147 } 6148 } 6149 6150 /* Free any old pages that were not reused as new pages. 
6151 */ 6152 while( i<nOld ){ 6153 freePage(apOld[i], &rc); 6154 if( rc ) goto balance_cleanup; 6155 releasePage(apOld[i]); 6156 apOld[i] = 0; 6157 i++; 6158 } 6159 6160 /* 6161 ** Put the new pages in accending order. This helps to 6162 ** keep entries in the disk file in order so that a scan 6163 ** of the table is a linear scan through the file. That 6164 ** in turn helps the operating system to deliver pages 6165 ** from the disk more rapidly. 6166 ** 6167 ** An O(n^2) insertion sort algorithm is used, but since 6168 ** n is never more than NB (a small constant), that should 6169 ** not be a problem. 6170 ** 6171 ** When NB==3, this one optimization makes the database 6172 ** about 25% faster for large insertions and deletions. 6173 */ 6174 for(i=0; i<k-1; i++){ 6175 int minV = apNew[i]->pgno; 6176 int minI = i; 6177 for(j=i+1; j<k; j++){ 6178 if( apNew[j]->pgno<(unsigned)minV ){ 6179 minI = j; 6180 minV = apNew[j]->pgno; 6181 } 6182 } 6183 if( minI>i ){ 6184 int t; 6185 MemPage *pT; 6186 t = apNew[i]->pgno; 6187 pT = apNew[i]; 6188 apNew[i] = apNew[minI]; 6189 apNew[minI] = pT; 6190 } 6191 } 6192 TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", 6193 apNew[0]->pgno, szNew[0], 6194 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, 6195 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, 6196 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, 6197 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0)); 6198 6199 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 6200 put4byte(pRight, apNew[nNew-1]->pgno); 6201 6202 /* 6203 ** Evenly distribute the data in apCell[] across the new pages. 6204 ** Insert divider cells into pParent as necessary. 6205 */ 6206 j = 0; 6207 for(i=0; i<nNew; i++){ 6208 /* Assemble the new sibling page. 
*/ 6209 MemPage *pNew = apNew[i]; 6210 assert( j<nMaxCells ); 6211 zeroPage(pNew, pageFlags); 6212 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); 6213 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); 6214 assert( pNew->nOverflow==0 ); 6215 6216 j = cntNew[i]; 6217 6218 /* If the sibling page assembled above was not the right-most sibling, 6219 ** insert a divider cell into the parent page. 6220 */ 6221 assert( i<nNew-1 || j==nCell ); 6222 if( j<nCell ){ 6223 u8 *pCell; 6224 u8 *pTemp; 6225 int sz; 6226 6227 assert( j<nMaxCells ); 6228 pCell = apCell[j]; 6229 sz = szCell[j] + leafCorrection; 6230 pTemp = &aOvflSpace[iOvflSpace]; 6231 if( !pNew->leaf ){ 6232 memcpy(&pNew->aData[8], pCell, 4); 6233 }else if( leafData ){ 6234 /* If the tree is a leaf-data tree, and the siblings are leaves, 6235 ** then there is no divider cell in apCell[]. Instead, the divider 6236 ** cell consists of the integer key for the right-most cell of 6237 ** the sibling-page assembled above only. 6238 */ 6239 CellInfo info; 6240 j--; 6241 btreeParseCellPtr(pNew, apCell[j], &info); 6242 pCell = pTemp; 6243 sz = 4 + putVarint(&pCell[4], info.nKey); 6244 pTemp = 0; 6245 }else{ 6246 pCell -= 4; 6247 /* Obscure case for non-leaf-data trees: If the cell at pCell was 6248 ** previously stored on a leaf node, and its reported size was 4 6249 ** bytes, then it may actually be smaller than this 6250 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of 6251 ** any cell). But it is important to pass the correct size to 6252 ** insertCell(), so reparse the cell now. 6253 ** 6254 ** Note that this can never happen in an SQLite data file, as all 6255 ** cells are at least 4 bytes. It only happens in b-trees used 6256 ** to evaluate "IN (SELECT ...)" and similar clauses. 
6257 */ 6258 if( szCell[j]==4 ){ 6259 assert(leafCorrection==4); 6260 sz = cellSizePtr(pParent, pCell); 6261 } 6262 } 6263 iOvflSpace += sz; 6264 assert( sz<=pBt->maxLocal+23 ); 6265 assert( iOvflSpace<=pBt->pageSize ); 6266 insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc); 6267 if( rc!=SQLITE_OK ) goto balance_cleanup; 6268 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 6269 6270 j++; 6271 nxDiv++; 6272 } 6273 } 6274 assert( j==nCell ); 6275 assert( nOld>0 ); 6276 assert( nNew>0 ); 6277 if( (pageFlags & PTF_LEAF)==0 ){ 6278 u8 *zChild = &apCopy[nOld-1]->aData[8]; 6279 memcpy(&apNew[nNew-1]->aData[8], zChild, 4); 6280 } 6281 6282 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ 6283 /* The root page of the b-tree now contains no cells. The only sibling 6284 ** page is the right-child of the parent. Copy the contents of the 6285 ** child page into the parent, decreasing the overall height of the 6286 ** b-tree structure by one. This is described as the "balance-shallower" 6287 ** sub-algorithm in some documentation. 6288 ** 6289 ** If this is an auto-vacuum database, the call to copyNodeContent() 6290 ** sets all pointer-map entries corresponding to database image pages 6291 ** for which the pointer is stored within the content being copied. 6292 ** 6293 ** The second assert below verifies that the child page is defragmented 6294 ** (it must be, as it was just reconstructed using assemblePage()). This 6295 ** is important if the parent page happens to be page 1 of the database 6296 ** image. */ 6297 assert( nNew==1 ); 6298 assert( apNew[0]->nFree == 6299 (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) 6300 ); 6301 copyNodeContent(apNew[0], pParent, &rc); 6302 freePage(apNew[0], &rc); 6303 }else if( ISAUTOVACUUM ){ 6304 /* Fix the pointer-map entries for all the cells that were shifted around. 
6305 ** There are several different types of pointer-map entries that need to 6306 ** be dealt with by this routine. Some of these have been set already, but 6307 ** many have not. The following is a summary: 6308 ** 6309 ** 1) The entries associated with new sibling pages that were not 6310 ** siblings when this function was called. These have already 6311 ** been set. We don't need to worry about old siblings that were 6312 ** moved to the free-list - the freePage() code has taken care 6313 ** of those. 6314 ** 6315 ** 2) The pointer-map entries associated with the first overflow 6316 ** page in any overflow chains used by new divider cells. These 6317 ** have also already been taken care of by the insertCell() code. 6318 ** 6319 ** 3) If the sibling pages are not leaves, then the child pages of 6320 ** cells stored on the sibling pages may need to be updated. 6321 ** 6322 ** 4) If the sibling pages are not internal intkey nodes, then any 6323 ** overflow pages used by these cells may need to be updated 6324 ** (internal intkey nodes never contain pointers to overflow pages). 6325 ** 6326 ** 5) If the sibling pages are not leaves, then the pointer-map 6327 ** entries for the right-child pages of each sibling may need 6328 ** to be updated. 6329 ** 6330 ** Cases 1 and 2 are dealt with above by other code. The next 6331 ** block deals with cases 3 and 4 and the one after that, case 5. Since 6332 ** setting a pointer map entry is a relatively expensive operation, this 6333 ** code only sets pointer map entries for child or overflow pages that have 6334 ** actually moved between pages. */ 6335 MemPage *pNew = apNew[0]; 6336 MemPage *pOld = apCopy[0]; 6337 int nOverflow = pOld->nOverflow; 6338 int iNextOld = pOld->nCell + nOverflow; 6339 int iOverflow = (nOverflow ? 
pOld->aOvfl[0].idx : -1); 6340 j = 0; /* Current 'old' sibling page */ 6341 k = 0; /* Current 'new' sibling page */ 6342 for(i=0; i<nCell; i++){ 6343 int isDivider = 0; 6344 while( i==iNextOld ){ 6345 /* Cell i is the cell immediately following the last cell on old 6346 ** sibling page j. If the siblings are not leaf pages of an 6347 ** intkey b-tree, then cell i was a divider cell. */ 6348 pOld = apCopy[++j]; 6349 iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow; 6350 if( pOld->nOverflow ){ 6351 nOverflow = pOld->nOverflow; 6352 iOverflow = i + !leafData + pOld->aOvfl[0].idx; 6353 } 6354 isDivider = !leafData; 6355 } 6356 6357 assert(nOverflow>0 || iOverflow<i ); 6358 assert(nOverflow<2 || pOld->aOvfl[0].idx==pOld->aOvfl[1].idx-1); 6359 assert(nOverflow<3 || pOld->aOvfl[1].idx==pOld->aOvfl[2].idx-1); 6360 if( i==iOverflow ){ 6361 isDivider = 1; 6362 if( (--nOverflow)>0 ){ 6363 iOverflow++; 6364 } 6365 } 6366 6367 if( i==cntNew[k] ){ 6368 /* Cell i is the cell immediately following the last cell on new 6369 ** sibling page k. If the siblings are not leaf pages of an 6370 ** intkey b-tree, then cell i is a divider cell. */ 6371 pNew = apNew[++k]; 6372 if( !leafData ) continue; 6373 } 6374 assert( j<nOld ); 6375 assert( k<nNew ); 6376 6377 /* If the cell was originally divider cell (and is not now) or 6378 ** an overflow cell, or if the cell was located on a different sibling 6379 ** page before the balancing, then the pointer map entries associated 6380 ** with any child or overflow pages need to be updated. 
*/ 6381 if( isDivider || pOld->pgno!=pNew->pgno ){ 6382 if( !leafCorrection ){ 6383 ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc); 6384 } 6385 if( szCell[i]>pNew->minLocal ){ 6386 ptrmapPutOvflPtr(pNew, apCell[i], &rc); 6387 } 6388 } 6389 } 6390 6391 if( !leafCorrection ){ 6392 for(i=0; i<nNew; i++){ 6393 u32 key = get4byte(&apNew[i]->aData[8]); 6394 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); 6395 } 6396 } 6397 6398 #if 0 6399 /* The ptrmapCheckPages() contains assert() statements that verify that 6400 ** all pointer map pages are set correctly. This is helpful while 6401 ** debugging. This is usually disabled because a corrupt database may 6402 ** cause an assert() statement to fail. */ 6403 ptrmapCheckPages(apNew, nNew); 6404 ptrmapCheckPages(&pParent, 1); 6405 #endif 6406 } 6407 6408 assert( pParent->isInit ); 6409 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", 6410 nOld, nNew, nCell)); 6411 6412 /* 6413 ** Cleanup before returning. 6414 */ 6415 balance_cleanup: 6416 sqlite3ScratchFree(apCell); 6417 for(i=0; i<nOld; i++){ 6418 releasePage(apOld[i]); 6419 } 6420 for(i=0; i<nNew; i++){ 6421 releasePage(apNew[i]); 6422 } 6423 6424 return rc; 6425 } 6426 6427 6428 /* 6429 ** This function is called when the root page of a b-tree structure is 6430 ** overfull (has one or more overflow pages). 6431 ** 6432 ** A new child page is allocated and the contents of the current root 6433 ** page, including overflow cells, are copied into the child. The root 6434 ** page is then overwritten to make it an empty page with the right-child 6435 ** pointer pointing to the new page. 6436 ** 6437 ** Before returning, all pointer-map entries corresponding to pages 6438 ** that the new child-page now contains pointers to are updated. The 6439 ** entry corresponding to the new right-child pointer of the root 6440 ** page is also updated. 
6441 ** 6442 ** If successful, *ppChild is set to contain a reference to the child 6443 ** page and SQLITE_OK is returned. In this case the caller is required 6444 ** to call releasePage() on *ppChild exactly once. If an error occurs, 6445 ** an error code is returned and *ppChild is set to 0. 6446 */ 6447 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ 6448 int rc; /* Return value from subprocedures */ 6449 MemPage *pChild = 0; /* Pointer to a new child page */ 6450 Pgno pgnoChild = 0; /* Page number of the new child page */ 6451 BtShared *pBt = pRoot->pBt; /* The BTree */ 6452 6453 assert( pRoot->nOverflow>0 ); 6454 assert( sqlite3_mutex_held(pBt->mutex) ); 6455 6456 /* Make pRoot, the root page of the b-tree, writable. Allocate a new 6457 ** page that will become the new right-child of pPage. Copy the contents 6458 ** of the node stored on pRoot into the new child page. 6459 */ 6460 rc = sqlite3PagerWrite(pRoot->pDbPage); 6461 if( rc==SQLITE_OK ){ 6462 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); 6463 copyNodeContent(pRoot, pChild, &rc); 6464 if( ISAUTOVACUUM ){ 6465 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); 6466 } 6467 } 6468 if( rc ){ 6469 *ppChild = 0; 6470 releasePage(pChild); 6471 return rc; 6472 } 6473 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 6474 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 6475 assert( pChild->nCell==pRoot->nCell ); 6476 6477 TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno)); 6478 6479 /* Copy the overflow cells from pRoot to pChild */ 6480 memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0])); 6481 pChild->nOverflow = pRoot->nOverflow; 6482 6483 /* Zero the contents of pRoot. Then install pChild as the right-child. 
*/ 6484 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); 6485 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); 6486 6487 *ppChild = pChild; 6488 return SQLITE_OK; 6489 } 6490 6491 /* 6492 ** The page that pCur currently points to has just been modified in 6493 ** some way. This function figures out if this modification means the 6494 ** tree needs to be balanced, and if so calls the appropriate balancing 6495 ** routine. Balancing routines are: 6496 ** 6497 ** balance_quick() 6498 ** balance_deeper() 6499 ** balance_nonroot() 6500 */ 6501 static int balance(BtCursor *pCur){ 6502 int rc = SQLITE_OK; 6503 const int nMin = pCur->pBt->usableSize * 2 / 3; 6504 u8 aBalanceQuickSpace[13]; 6505 u8 *pFree = 0; 6506 6507 TESTONLY( int balance_quick_called = 0 ); 6508 TESTONLY( int balance_deeper_called = 0 ); 6509 6510 do { 6511 int iPage = pCur->iPage; 6512 MemPage *pPage = pCur->apPage[iPage]; 6513 6514 if( iPage==0 ){ 6515 if( pPage->nOverflow ){ 6516 /* The root page of the b-tree is overfull. In this case call the 6517 ** balance_deeper() function to create a new child for the root-page 6518 ** and copy the current contents of the root-page to it. The 6519 ** next iteration of the do-loop will balance the child page. 
6520 */ 6521 assert( (balance_deeper_called++)==0 ); 6522 rc = balance_deeper(pPage, &pCur->apPage[1]); 6523 if( rc==SQLITE_OK ){ 6524 pCur->iPage = 1; 6525 pCur->aiIdx[0] = 0; 6526 pCur->aiIdx[1] = 0; 6527 assert( pCur->apPage[1]->nOverflow ); 6528 } 6529 }else{ 6530 break; 6531 } 6532 }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){ 6533 break; 6534 }else{ 6535 MemPage * const pParent = pCur->apPage[iPage-1]; 6536 int const iIdx = pCur->aiIdx[iPage-1]; 6537 6538 rc = sqlite3PagerWrite(pParent->pDbPage); 6539 if( rc==SQLITE_OK ){ 6540 #ifndef SQLITE_OMIT_QUICKBALANCE 6541 if( pPage->hasData 6542 && pPage->nOverflow==1 6543 && pPage->aOvfl[0].idx==pPage->nCell 6544 && pParent->pgno!=1 6545 && pParent->nCell==iIdx 6546 ){ 6547 /* Call balance_quick() to create a new sibling of pPage on which 6548 ** to store the overflow cell. balance_quick() inserts a new cell 6549 ** into pParent, which may cause pParent overflow. If this 6550 ** happens, the next interation of the do-loop will balance pParent 6551 ** use either balance_nonroot() or balance_deeper(). Until this 6552 ** happens, the overflow cell is stored in the aBalanceQuickSpace[] 6553 ** buffer. 6554 ** 6555 ** The purpose of the following assert() is to check that only a 6556 ** single call to balance_quick() is made for each call to this 6557 ** function. If this were not verified, a subtle bug involving reuse 6558 ** of the aBalanceQuickSpace[] might sneak in. 6559 */ 6560 assert( (balance_quick_called++)==0 ); 6561 rc = balance_quick(pParent, pPage, aBalanceQuickSpace); 6562 }else 6563 #endif 6564 { 6565 /* In this case, call balance_nonroot() to redistribute cells 6566 ** between pPage and up to 2 of its sibling pages. This involves 6567 ** modifying the contents of pParent, which may cause pParent to 6568 ** become overfull or underfull. The next iteration of the do-loop 6569 ** will balance the parent page to correct this. 
6570 ** 6571 ** If the parent page becomes overfull, the overflow cell or cells 6572 ** are stored in the pSpace buffer allocated immediately below. 6573 ** A subsequent iteration of the do-loop will deal with this by 6574 ** calling balance_nonroot() (balance_deeper() may be called first, 6575 ** but it doesn't deal with overflow cells - just moves them to a 6576 ** different page). Once this subsequent call to balance_nonroot() 6577 ** has completed, it is safe to release the pSpace buffer used by 6578 ** the previous call, as the overflow cell data will have been 6579 ** copied either into the body of a database page or into the new 6580 ** pSpace buffer passed to the latter call to balance_nonroot(). 6581 */ 6582 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); 6583 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1); 6584 if( pFree ){ 6585 /* If pFree is not NULL, it points to the pSpace buffer used 6586 ** by a previous call to balance_nonroot(). Its contents are 6587 ** now stored either on real database pages or within the 6588 ** new pSpace buffer, so it may be safely freed here. */ 6589 sqlite3PageFree(pFree); 6590 } 6591 6592 /* The pSpace buffer will be freed after the next call to 6593 ** balance_nonroot(), or just before this function returns, whichever 6594 ** comes first. */ 6595 pFree = pSpace; 6596 } 6597 } 6598 6599 pPage->nOverflow = 0; 6600 6601 /* The next iteration of the do-loop balances the parent page. */ 6602 releasePage(pPage); 6603 pCur->iPage--; 6604 } 6605 }while( rc==SQLITE_OK ); 6606 6607 if( pFree ){ 6608 sqlite3PageFree(pFree); 6609 } 6610 return rc; 6611 } 6612 6613 6614 /* 6615 ** Insert a new record into the BTree. The key is given by (pKey,nKey) 6616 ** and the data is given by (pData,nData). The cursor is used only to 6617 ** define what table the record should be inserted into. The cursor 6618 ** is left pointing at a random location. 6619 ** 6620 ** For an INTKEY table, only the nKey value of the key is used. 
pKey is 6621 ** ignored. For a ZERODATA table, the pData and nData are both ignored. 6622 ** 6623 ** If the seekResult parameter is non-zero, then a successful call to 6624 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already 6625 ** been performed. seekResult is the search result returned (a negative 6626 ** number if pCur points at an entry that is smaller than (pKey, nKey), or 6627 ** a positive value if pCur points at an etry that is larger than 6628 ** (pKey, nKey)). 6629 ** 6630 ** If the seekResult parameter is non-zero, then the caller guarantees that 6631 ** cursor pCur is pointing at the existing copy of a row that is to be 6632 ** overwritten. If the seekResult parameter is 0, then cursor pCur may 6633 ** point to any entry or to no entry at all and so this function has to seek 6634 ** the cursor before the new key can be inserted. 6635 */ 6636 int sqlite3BtreeInsert( 6637 BtCursor *pCur, /* Insert data into the table of this cursor */ 6638 const void *pKey, i64 nKey, /* The key of the new record */ 6639 const void *pData, int nData, /* The data of the new record */ 6640 int nZero, /* Number of extra 0 bytes to append to data */ 6641 int appendBias, /* True if this is likely an append */ 6642 int seekResult /* Result of prior MovetoUnpacked() call */ 6643 ){ 6644 int rc; 6645 int loc = seekResult; /* -1: before desired location +1: after */ 6646 int szNew = 0; 6647 int idx; 6648 MemPage *pPage; 6649 Btree *p = pCur->pBtree; 6650 BtShared *pBt = p->pBt; 6651 unsigned char *oldCell; 6652 unsigned char *newCell = 0; 6653 6654 if( pCur->eState==CURSOR_FAULT ){ 6655 assert( pCur->skipNext!=SQLITE_OK ); 6656 return pCur->skipNext; 6657 } 6658 6659 assert( cursorHoldsMutex(pCur) ); 6660 assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE && !pBt->readOnly ); 6661 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 6662 6663 /* Assert that the caller has been consistent. 
If this cursor was opened 6664 ** expecting an index b-tree, then the caller should be inserting blob 6665 ** keys with no associated data. If the cursor was opened expecting an 6666 ** intkey table, the caller should be inserting integer keys with a 6667 ** blob of associated data. */ 6668 assert( (pKey==0)==(pCur->pKeyInfo==0) ); 6669 6670 /* If this is an insert into a table b-tree, invalidate any incrblob 6671 ** cursors open on the row being replaced (assuming this is a replace 6672 ** operation - if it is not, the following is a no-op). */ 6673 if( pCur->pKeyInfo==0 ){ 6674 invalidateIncrblobCursors(p, nKey, 0); 6675 } 6676 6677 /* Save the positions of any other cursors open on this table. 6678 ** 6679 ** In some cases, the call to btreeMoveto() below is a no-op. For 6680 ** example, when inserting data into a table with auto-generated integer 6681 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 6682 ** integer key to use. It then calls this function to actually insert the 6683 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes 6684 ** that the cursor is already where it needs to be and returns without 6685 ** doing any work. To avoid thwarting these optimizations, it is important 6686 ** not to clear the cursor here. 6687 */ 6688 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 6689 if( rc ) return rc; 6690 if( !loc ){ 6691 rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc); 6692 if( rc ) return rc; 6693 } 6694 assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) ); 6695 6696 pPage = pCur->apPage[pCur->iPage]; 6697 assert( pPage->intKey || nKey>=0 ); 6698 assert( pPage->leaf || !pPage->intKey ); 6699 6700 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", 6701 pCur->pgnoRoot, nKey, nData, pPage->pgno, 6702 loc==0 ? 
"overwrite" : "new entry")); 6703 assert( pPage->isInit ); 6704 allocateTempSpace(pBt); 6705 newCell = pBt->pTmpSpace; 6706 if( newCell==0 ) return SQLITE_NOMEM; 6707 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); 6708 if( rc ) goto end_insert; 6709 assert( szNew==cellSizePtr(pPage, newCell) ); 6710 assert( szNew<=MX_CELL_SIZE(pBt) ); 6711 idx = pCur->aiIdx[pCur->iPage]; 6712 if( loc==0 ){ 6713 u16 szOld; 6714 assert( idx<pPage->nCell ); 6715 rc = sqlite3PagerWrite(pPage->pDbPage); 6716 if( rc ){ 6717 goto end_insert; 6718 } 6719 oldCell = findCell(pPage, idx); 6720 if( !pPage->leaf ){ 6721 memcpy(newCell, oldCell, 4); 6722 } 6723 szOld = cellSizePtr(pPage, oldCell); 6724 rc = clearCell(pPage, oldCell); 6725 dropCell(pPage, idx, szOld, &rc); 6726 if( rc ) goto end_insert; 6727 }else if( loc<0 && pPage->nCell>0 ){ 6728 assert( pPage->leaf ); 6729 idx = ++pCur->aiIdx[pCur->iPage]; 6730 }else{ 6731 assert( pPage->leaf ); 6732 } 6733 insertCell(pPage, idx, newCell, szNew, 0, 0, &rc); 6734 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); 6735 6736 /* If no error has occured and pPage has an overflow cell, call balance() 6737 ** to redistribute the cells within the tree. Since balance() may move 6738 ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey 6739 ** variables. 6740 ** 6741 ** Previous versions of SQLite called moveToRoot() to move the cursor 6742 ** back to the root page as balance() used to invalidate the contents 6743 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that, 6744 ** set the cursor state to "invalid". This makes common insert operations 6745 ** slightly faster. 6746 ** 6747 ** There is a subtle but important optimization here too. When inserting 6748 ** multiple records into an intkey b-tree using a single cursor (as can 6749 ** happen while processing an "INSERT INTO ... 
SELECT" statement), it 6750 ** is advantageous to leave the cursor pointing to the last entry in 6751 ** the b-tree if possible. If the cursor is left pointing to the last 6752 ** entry in the table, and the next row inserted has an integer key 6753 ** larger than the largest existing key, it is possible to insert the 6754 ** row without seeking the cursor. This can be a big performance boost. 6755 */ 6756 pCur->info.nSize = 0; 6757 pCur->validNKey = 0; 6758 if( rc==SQLITE_OK && pPage->nOverflow ){ 6759 rc = balance(pCur); 6760 6761 /* Must make sure nOverflow is reset to zero even if the balance() 6762 ** fails. Internal data structure corruption will result otherwise. 6763 ** Also, set the cursor state to invalid. This stops saveCursorPosition() 6764 ** from trying to save the current position of the cursor. */ 6765 pCur->apPage[pCur->iPage]->nOverflow = 0; 6766 pCur->eState = CURSOR_INVALID; 6767 } 6768 assert( pCur->apPage[pCur->iPage]->nOverflow==0 ); 6769 6770 end_insert: 6771 return rc; 6772 } 6773 6774 /* 6775 ** Delete the entry that the cursor is pointing to. The cursor 6776 ** is left pointing at a arbitrary location. 6777 */ 6778 int sqlite3BtreeDelete(BtCursor *pCur){ 6779 Btree *p = pCur->pBtree; 6780 BtShared *pBt = p->pBt; 6781 int rc; /* Return code */ 6782 MemPage *pPage; /* Page to delete cell from */ 6783 unsigned char *pCell; /* Pointer to cell to delete */ 6784 int iCellIdx; /* Index of cell to delete */ 6785 int iCellDepth; /* Depth of node containing pCell */ 6786 6787 assert( cursorHoldsMutex(pCur) ); 6788 assert( pBt->inTransaction==TRANS_WRITE ); 6789 assert( !pBt->readOnly ); 6790 assert( pCur->wrFlag ); 6791 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 6792 assert( !hasReadConflicts(p, pCur->pgnoRoot) ); 6793 6794 if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell) 6795 || NEVER(pCur->eState!=CURSOR_VALID) 6796 ){ 6797 return SQLITE_ERROR; /* Something has gone awry. 
*/ 6798 } 6799 6800 /* If this is a delete operation to remove a row from a table b-tree, 6801 ** invalidate any incrblob cursors open on the row being deleted. */ 6802 if( pCur->pKeyInfo==0 ){ 6803 invalidateIncrblobCursors(p, pCur->info.nKey, 0); 6804 } 6805 6806 iCellDepth = pCur->iPage; 6807 iCellIdx = pCur->aiIdx[iCellDepth]; 6808 pPage = pCur->apPage[iCellDepth]; 6809 pCell = findCell(pPage, iCellIdx); 6810 6811 /* If the page containing the entry to delete is not a leaf page, move 6812 ** the cursor to the largest entry in the tree that is smaller than 6813 ** the entry being deleted. This cell will replace the cell being deleted 6814 ** from the internal node. The 'previous' entry is used for this instead 6815 ** of the 'next' entry, as the previous entry is always a part of the 6816 ** sub-tree headed by the child page of the cell being deleted. This makes 6817 ** balancing the tree following the delete operation easier. */ 6818 if( !pPage->leaf ){ 6819 int notUsed; 6820 rc = sqlite3BtreePrevious(pCur, ¬Used); 6821 if( rc ) return rc; 6822 } 6823 6824 /* Save the positions of any other cursors open on this table before 6825 ** making any modifications. Make the page containing the entry to be 6826 ** deleted writable. Then free any overflow pages associated with the 6827 ** entry and finally remove the cell itself from within the page. 6828 */ 6829 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 6830 if( rc ) return rc; 6831 rc = sqlite3PagerWrite(pPage->pDbPage); 6832 if( rc ) return rc; 6833 rc = clearCell(pPage, pCell); 6834 dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc); 6835 if( rc ) return rc; 6836 6837 /* If the cell deleted was not located on a leaf page, then the cursor 6838 ** is currently pointing to the largest entry in the sub-tree headed 6839 ** by the child-page of the cell that was just deleted from an internal 6840 ** node. The cell from the leaf node needs to be moved to the internal 6841 ** node to replace the deleted cell. 
*/ 6842 if( !pPage->leaf ){ 6843 MemPage *pLeaf = pCur->apPage[pCur->iPage]; 6844 int nCell; 6845 Pgno n = pCur->apPage[iCellDepth+1]->pgno; 6846 unsigned char *pTmp; 6847 6848 pCell = findCell(pLeaf, pLeaf->nCell-1); 6849 nCell = cellSizePtr(pLeaf, pCell); 6850 assert( MX_CELL_SIZE(pBt)>=nCell ); 6851 6852 allocateTempSpace(pBt); 6853 pTmp = pBt->pTmpSpace; 6854 6855 rc = sqlite3PagerWrite(pLeaf->pDbPage); 6856 insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc); 6857 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); 6858 if( rc ) return rc; 6859 } 6860 6861 /* Balance the tree. If the entry deleted was located on a leaf page, 6862 ** then the cursor still points to that page. In this case the first 6863 ** call to balance() repairs the tree, and the if(...) condition is 6864 ** never true. 6865 ** 6866 ** Otherwise, if the entry deleted was on an internal node page, then 6867 ** pCur is pointing to the leaf page from which a cell was removed to 6868 ** replace the cell deleted from the internal node. This is slightly 6869 ** tricky as the leaf node may be underfull, and the internal node may 6870 ** be either under or overfull. In this case run the balancing algorithm 6871 ** on the leaf node first. If the balance proceeds far enough up the 6872 ** tree that we can be sure that any problem in the internal node has 6873 ** been corrected, so be it. Otherwise, after balancing the leaf node, 6874 ** walk the cursor up the tree to the internal node and balance it as 6875 ** well. */ 6876 rc = balance(pCur); 6877 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ 6878 while( pCur->iPage>iCellDepth ){ 6879 releasePage(pCur->apPage[pCur->iPage--]); 6880 } 6881 rc = balance(pCur); 6882 } 6883 6884 if( rc==SQLITE_OK ){ 6885 moveToRoot(pCur); 6886 } 6887 return rc; 6888 } 6889 6890 /* 6891 ** Create a new BTree table. Write into *piTable the page 6892 ** number for the root page of the new table. 6893 ** 6894 ** The type of type is determined by the flags parameter. 
Only the 6895 ** following values of flags are currently in use. Other values for 6896 ** flags might not work: 6897 ** 6898 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 6899 ** BTREE_ZERODATA Used for SQL indices 6900 */ 6901 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){ 6902 BtShared *pBt = p->pBt; 6903 MemPage *pRoot; 6904 Pgno pgnoRoot; 6905 int rc; 6906 int ptfFlags; /* Page-type flage for the root page of new table */ 6907 6908 assert( sqlite3BtreeHoldsMutex(p) ); 6909 assert( pBt->inTransaction==TRANS_WRITE ); 6910 assert( !pBt->readOnly ); 6911 6912 #ifdef SQLITE_OMIT_AUTOVACUUM 6913 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 6914 if( rc ){ 6915 return rc; 6916 } 6917 #else 6918 if( pBt->autoVacuum ){ 6919 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 6920 MemPage *pPageMove; /* The page to move to. */ 6921 6922 /* Creating a new table may probably require moving an existing database 6923 ** to make room for the new tables root page. In case this page turns 6924 ** out to be an overflow page, delete all overflow page-map caches 6925 ** held by open cursors. 6926 */ 6927 invalidateAllOverflowCache(pBt); 6928 6929 /* Read the value of meta[3] from the database to determine where the 6930 ** root page of the new table should go. meta[3] is the largest root-page 6931 ** created so far, so the new root-page is (meta[3]+1). 6932 */ 6933 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); 6934 pgnoRoot++; 6935 6936 /* The new root-page may not be allocated on a pointer-map page, or the 6937 ** PENDING_BYTE page. 6938 */ 6939 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 6940 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 6941 pgnoRoot++; 6942 } 6943 assert( pgnoRoot>=3 ); 6944 6945 /* Allocate a page. The page that currently resides at pgnoRoot will 6946 ** be moved to the allocated page (unless the allocated page happens 6947 ** to reside at pgnoRoot). 
6948 */ 6949 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1); 6950 if( rc!=SQLITE_OK ){ 6951 return rc; 6952 } 6953 6954 if( pgnoMove!=pgnoRoot ){ 6955 /* pgnoRoot is the page that will be used for the root-page of 6956 ** the new table (assuming an error did not occur). But we were 6957 ** allocated pgnoMove. If required (i.e. if it was not allocated 6958 ** by extending the file), the current page at position pgnoMove 6959 ** is already journaled. 6960 */ 6961 u8 eType = 0; 6962 Pgno iPtrPage = 0; 6963 6964 releasePage(pPageMove); 6965 6966 /* Move the page currently at pgnoRoot to pgnoMove. */ 6967 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 6968 if( rc!=SQLITE_OK ){ 6969 return rc; 6970 } 6971 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 6972 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 6973 rc = SQLITE_CORRUPT_BKPT; 6974 } 6975 if( rc!=SQLITE_OK ){ 6976 releasePage(pRoot); 6977 return rc; 6978 } 6979 assert( eType!=PTRMAP_ROOTPAGE ); 6980 assert( eType!=PTRMAP_FREEPAGE ); 6981 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); 6982 releasePage(pRoot); 6983 6984 /* Obtain the page at pgnoRoot */ 6985 if( rc!=SQLITE_OK ){ 6986 return rc; 6987 } 6988 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 6989 if( rc!=SQLITE_OK ){ 6990 return rc; 6991 } 6992 rc = sqlite3PagerWrite(pRoot->pDbPage); 6993 if( rc!=SQLITE_OK ){ 6994 releasePage(pRoot); 6995 return rc; 6996 } 6997 }else{ 6998 pRoot = pPageMove; 6999 } 7000 7001 /* Update the pointer-map and meta-data with the new root-page number. */ 7002 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); 7003 if( rc ){ 7004 releasePage(pRoot); 7005 return rc; 7006 } 7007 7008 /* When the new root page was allocated, page 1 was made writable in 7009 ** order either to increase the database filesize, or to decrement the 7010 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail. 
7011 */ 7012 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) ); 7013 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 7014 if( NEVER(rc) ){ 7015 releasePage(pRoot); 7016 return rc; 7017 } 7018 7019 }else{ 7020 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 7021 if( rc ) return rc; 7022 } 7023 #endif 7024 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 7025 if( createTabFlags & BTREE_INTKEY ){ 7026 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF; 7027 }else{ 7028 ptfFlags = PTF_ZERODATA | PTF_LEAF; 7029 } 7030 zeroPage(pRoot, ptfFlags); 7031 sqlite3PagerUnref(pRoot->pDbPage); 7032 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 ); 7033 *piTable = (int)pgnoRoot; 7034 return SQLITE_OK; 7035 } 7036 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ 7037 int rc; 7038 sqlite3BtreeEnter(p); 7039 rc = btreeCreateTable(p, piTable, flags); 7040 sqlite3BtreeLeave(p); 7041 return rc; 7042 } 7043 7044 /* 7045 ** Erase the given database page and all its children. Return 7046 ** the page to the freelist. 
7047 */ 7048 static int clearDatabasePage( 7049 BtShared *pBt, /* The BTree that contains the table */ 7050 Pgno pgno, /* Page number to clear */ 7051 int freePageFlag, /* Deallocate page if true */ 7052 int *pnChange /* Add number of Cells freed to this counter */ 7053 ){ 7054 MemPage *pPage; 7055 int rc; 7056 unsigned char *pCell; 7057 int i; 7058 7059 assert( sqlite3_mutex_held(pBt->mutex) ); 7060 if( pgno>btreePagecount(pBt) ){ 7061 return SQLITE_CORRUPT_BKPT; 7062 } 7063 7064 rc = getAndInitPage(pBt, pgno, &pPage); 7065 if( rc ) return rc; 7066 for(i=0; i<pPage->nCell; i++){ 7067 pCell = findCell(pPage, i); 7068 if( !pPage->leaf ){ 7069 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); 7070 if( rc ) goto cleardatabasepage_out; 7071 } 7072 rc = clearCell(pPage, pCell); 7073 if( rc ) goto cleardatabasepage_out; 7074 } 7075 if( !pPage->leaf ){ 7076 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange); 7077 if( rc ) goto cleardatabasepage_out; 7078 }else if( pnChange ){ 7079 assert( pPage->intKey ); 7080 *pnChange += pPage->nCell; 7081 } 7082 if( freePageFlag ){ 7083 freePage(pPage, &rc); 7084 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 7085 zeroPage(pPage, pPage->aData[0] | PTF_LEAF); 7086 } 7087 7088 cleardatabasepage_out: 7089 releasePage(pPage); 7090 return rc; 7091 } 7092 7093 /* 7094 ** Delete all information from a single table in the database. iTable is 7095 ** the page number of the root of the table. After this routine returns, 7096 ** the root page is empty, but still exists. 7097 ** 7098 ** This routine will fail with SQLITE_LOCKED if there are any open 7099 ** read cursors on the table. Open write cursors are moved to the 7100 ** root of the table. 7101 ** 7102 ** If pnChange is not NULL, then table iTable must be an intkey table. The 7103 ** integer value pointed to by pnChange is incremented by the number of 7104 ** entries in the table. 
7105 */ 7106 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){ 7107 int rc; 7108 BtShared *pBt = p->pBt; 7109 sqlite3BtreeEnter(p); 7110 assert( p->inTrans==TRANS_WRITE ); 7111 7112 /* Invalidate all incrblob cursors open on table iTable (assuming iTable 7113 ** is the root of a table b-tree - if it is not, the following call is 7114 ** a no-op). */ 7115 invalidateIncrblobCursors(p, 0, 1); 7116 7117 rc = saveAllCursors(pBt, (Pgno)iTable, 0); 7118 if( SQLITE_OK==rc ){ 7119 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); 7120 } 7121 sqlite3BtreeLeave(p); 7122 return rc; 7123 } 7124 7125 /* 7126 ** Erase all information in a table and add the root of the table to 7127 ** the freelist. Except, the root of the principle table (the one on 7128 ** page 1) is never added to the freelist. 7129 ** 7130 ** This routine will fail with SQLITE_LOCKED if there are any open 7131 ** cursors on the table. 7132 ** 7133 ** If AUTOVACUUM is enabled and the page at iTable is not the last 7134 ** root page in the database file, then the last root page 7135 ** in the database file is moved into the slot formerly occupied by 7136 ** iTable and that last slot formerly occupied by the last root page 7137 ** is added to the freelist instead of iTable. In this say, all 7138 ** root pages are kept at the beginning of the database file, which 7139 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 7140 ** page number that used to be the last root page in the file before 7141 ** the move. If no page gets moved, *piMoved is set to 0. 7142 ** The last root page is recorded in meta[3] and the value of 7143 ** meta[3] is updated by this procedure. 
7144 */ 7145 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ 7146 int rc; 7147 MemPage *pPage = 0; 7148 BtShared *pBt = p->pBt; 7149 7150 assert( sqlite3BtreeHoldsMutex(p) ); 7151 assert( p->inTrans==TRANS_WRITE ); 7152 7153 /* It is illegal to drop a table if any cursors are open on the 7154 ** database. This is because in auto-vacuum mode the backend may 7155 ** need to move another root-page to fill a gap left by the deleted 7156 ** root page. If an open cursor was using this page a problem would 7157 ** occur. 7158 ** 7159 ** This error is caught long before control reaches this point. 7160 */ 7161 if( NEVER(pBt->pCursor) ){ 7162 sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db); 7163 return SQLITE_LOCKED_SHAREDCACHE; 7164 } 7165 7166 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 7167 if( rc ) return rc; 7168 rc = sqlite3BtreeClearTable(p, iTable, 0); 7169 if( rc ){ 7170 releasePage(pPage); 7171 return rc; 7172 } 7173 7174 *piMoved = 0; 7175 7176 if( iTable>1 ){ 7177 #ifdef SQLITE_OMIT_AUTOVACUUM 7178 freePage(pPage, &rc); 7179 releasePage(pPage); 7180 #else 7181 if( pBt->autoVacuum ){ 7182 Pgno maxRootPgno; 7183 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno); 7184 7185 if( iTable==maxRootPgno ){ 7186 /* If the table being dropped is the table with the largest root-page 7187 ** number in the database, put the root page on the free list. 7188 */ 7189 freePage(pPage, &rc); 7190 releasePage(pPage); 7191 if( rc!=SQLITE_OK ){ 7192 return rc; 7193 } 7194 }else{ 7195 /* The table being dropped does not have the largest root-page 7196 ** number in the database. So move the page that does into the 7197 ** gap left by the deleted root-page. 
7198 */ 7199 MemPage *pMove; 7200 releasePage(pPage); 7201 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 7202 if( rc!=SQLITE_OK ){ 7203 return rc; 7204 } 7205 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); 7206 releasePage(pMove); 7207 if( rc!=SQLITE_OK ){ 7208 return rc; 7209 } 7210 pMove = 0; 7211 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 7212 freePage(pMove, &rc); 7213 releasePage(pMove); 7214 if( rc!=SQLITE_OK ){ 7215 return rc; 7216 } 7217 *piMoved = maxRootPgno; 7218 } 7219 7220 /* Set the new 'max-root-page' value in the database header. This 7221 ** is the old value less one, less one more if that happens to 7222 ** be a root-page number, less one again if that is the 7223 ** PENDING_BYTE_PAGE. 7224 */ 7225 maxRootPgno--; 7226 while( maxRootPgno==PENDING_BYTE_PAGE(pBt) 7227 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){ 7228 maxRootPgno--; 7229 } 7230 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 7231 7232 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 7233 }else{ 7234 freePage(pPage, &rc); 7235 releasePage(pPage); 7236 } 7237 #endif 7238 }else{ 7239 /* If sqlite3BtreeDropTable was called on page 1. 7240 ** This really never should happen except in a corrupt 7241 ** database. 7242 */ 7243 zeroPage(pPage, PTF_INTKEY|PTF_LEAF ); 7244 releasePage(pPage); 7245 } 7246 return rc; 7247 } 7248 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 7249 int rc; 7250 sqlite3BtreeEnter(p); 7251 rc = btreeDropTable(p, iTable, piMoved); 7252 sqlite3BtreeLeave(p); 7253 return rc; 7254 } 7255 7256 7257 /* 7258 ** This function may only be called if the b-tree connection already 7259 ** has a read or write transaction open on the database. 7260 ** 7261 ** Read the meta-information out of a database file. Meta[0] 7262 ** is the number of free pages currently in the database. Meta[1] 7263 ** through meta[15] are available for use by higher layers. Meta[0] 7264 ** is read-only, the others are read/write. 
7265 ** 7266 ** The schema layer numbers meta values differently. At the schema 7267 ** layer (and the SetCookie and ReadCookie opcodes) the number of 7268 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 7269 */ 7270 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 7271 BtShared *pBt = p->pBt; 7272 7273 sqlite3BtreeEnter(p); 7274 assert( p->inTrans>TRANS_NONE ); 7275 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) ); 7276 assert( pBt->pPage1 ); 7277 assert( idx>=0 && idx<=15 ); 7278 7279 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); 7280 7281 /* If auto-vacuum is disabled in this build and this is an auto-vacuum 7282 ** database, mark the database as read-only. */ 7283 #ifdef SQLITE_OMIT_AUTOVACUUM 7284 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ) pBt->readOnly = 1; 7285 #endif 7286 7287 sqlite3BtreeLeave(p); 7288 } 7289 7290 /* 7291 ** Write meta-information back into the database. Meta[0] is 7292 ** read-only and may not be written. 7293 */ 7294 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 7295 BtShared *pBt = p->pBt; 7296 unsigned char *pP1; 7297 int rc; 7298 assert( idx>=1 && idx<=15 ); 7299 sqlite3BtreeEnter(p); 7300 assert( p->inTrans==TRANS_WRITE ); 7301 assert( pBt->pPage1!=0 ); 7302 pP1 = pBt->pPage1->aData; 7303 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 7304 if( rc==SQLITE_OK ){ 7305 put4byte(&pP1[36 + idx*4], iMeta); 7306 #ifndef SQLITE_OMIT_AUTOVACUUM 7307 if( idx==BTREE_INCR_VACUUM ){ 7308 assert( pBt->autoVacuum || iMeta==0 ); 7309 assert( iMeta==0 || iMeta==1 ); 7310 pBt->incrVacuum = (u8)iMeta; 7311 } 7312 #endif 7313 } 7314 sqlite3BtreeLeave(p); 7315 return rc; 7316 } 7317 7318 #ifndef SQLITE_OMIT_BTREECOUNT 7319 /* 7320 ** The first argument, pCur, is a cursor opened on some b-tree. Count the 7321 ** number of entries in the b-tree and write the result to *pnEntry. 7322 ** 7323 ** SQLITE_OK is returned if the operation is successfully executed. 
7324 ** Otherwise, if an error is encountered (i.e. an IO error or database 7325 ** corruption) an SQLite error code is returned. 7326 */ 7327 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){ 7328 i64 nEntry = 0; /* Value to return in *pnEntry */ 7329 int rc; /* Return code */ 7330 rc = moveToRoot(pCur); 7331 7332 /* Unless an error occurs, the following loop runs one iteration for each 7333 ** page in the B-Tree structure (not including overflow pages). 7334 */ 7335 while( rc==SQLITE_OK ){ 7336 int iIdx; /* Index of child node in parent */ 7337 MemPage *pPage; /* Current page of the b-tree */ 7338 7339 /* If this is a leaf page or the tree is not an int-key tree, then 7340 ** this page contains countable entries. Increment the entry counter 7341 ** accordingly. 7342 */ 7343 pPage = pCur->apPage[pCur->iPage]; 7344 if( pPage->leaf || !pPage->intKey ){ 7345 nEntry += pPage->nCell; 7346 } 7347 7348 /* pPage is a leaf node. This loop navigates the cursor so that it 7349 ** points to the first interior cell that it points to the parent of 7350 ** the next page in the tree that has not yet been visited. The 7351 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell 7352 ** of the page, or to the number of cells in the page if the next page 7353 ** to visit is the right-child of its parent. 7354 ** 7355 ** If all pages in the tree have been visited, return SQLITE_OK to the 7356 ** caller. 7357 */ 7358 if( pPage->leaf ){ 7359 do { 7360 if( pCur->iPage==0 ){ 7361 /* All pages of the b-tree have been visited. Return successfully. */ 7362 *pnEntry = nEntry; 7363 return SQLITE_OK; 7364 } 7365 moveToParent(pCur); 7366 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell ); 7367 7368 pCur->aiIdx[pCur->iPage]++; 7369 pPage = pCur->apPage[pCur->iPage]; 7370 } 7371 7372 /* Descend to the child node of the cell that the cursor currently 7373 ** points at. This is the right-child if (iIdx==pPage->nCell). 
7374 */ 7375 iIdx = pCur->aiIdx[pCur->iPage]; 7376 if( iIdx==pPage->nCell ){ 7377 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 7378 }else{ 7379 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); 7380 } 7381 } 7382 7383 /* An error has occurred. Return an error code. */ 7384 return rc; 7385 } 7386 #endif 7387 7388 /* 7389 ** Return the pager associated with a BTree. This routine is used for 7390 ** testing and debugging only. 7391 */ 7392 Pager *sqlite3BtreePager(Btree *p){ 7393 return p->pBt->pPager; 7394 } 7395 7396 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7397 /* 7398 ** Append a message to the error message string. 7399 */ 7400 static void checkAppendMsg( 7401 IntegrityCk *pCheck, 7402 char *zMsg1, 7403 const char *zFormat, 7404 ... 7405 ){ 7406 va_list ap; 7407 if( !pCheck->mxErr ) return; 7408 pCheck->mxErr--; 7409 pCheck->nErr++; 7410 va_start(ap, zFormat); 7411 if( pCheck->errMsg.nChar ){ 7412 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1); 7413 } 7414 if( zMsg1 ){ 7415 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1); 7416 } 7417 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap); 7418 va_end(ap); 7419 if( pCheck->errMsg.mallocFailed ){ 7420 pCheck->mallocFailed = 1; 7421 } 7422 } 7423 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7424 7425 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7426 /* 7427 ** Add 1 to the reference count for page iPage. If this is the second 7428 ** reference to the page, add an error message to pCheck->zErrMsg. 7429 ** Return 1 if there are 2 ore more references to the page and 0 if 7430 ** if this is the first reference to the page. 7431 ** 7432 ** Also check that the page number is in bounds. 
7433 */ 7434 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){ 7435 if( iPage==0 ) return 1; 7436 if( iPage>pCheck->nPage ){ 7437 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage); 7438 return 1; 7439 } 7440 if( pCheck->anRef[iPage]==1 ){ 7441 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage); 7442 return 1; 7443 } 7444 return (pCheck->anRef[iPage]++)>1; 7445 } 7446 7447 #ifndef SQLITE_OMIT_AUTOVACUUM 7448 /* 7449 ** Check that the entry in the pointer-map for page iChild maps to 7450 ** page iParent, pointer type ptrType. If not, append an error message 7451 ** to pCheck. 7452 */ 7453 static void checkPtrmap( 7454 IntegrityCk *pCheck, /* Integrity check context */ 7455 Pgno iChild, /* Child page number */ 7456 u8 eType, /* Expected pointer map type */ 7457 Pgno iParent, /* Expected pointer map parent page number */ 7458 char *zContext /* Context description (used for error msg) */ 7459 ){ 7460 int rc; 7461 u8 ePtrmapType; 7462 Pgno iPtrmapParent; 7463 7464 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 7465 if( rc!=SQLITE_OK ){ 7466 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1; 7467 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild); 7468 return; 7469 } 7470 7471 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 7472 checkAppendMsg(pCheck, zContext, 7473 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 7474 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 7475 } 7476 } 7477 #endif 7478 7479 /* 7480 ** Check the integrity of the freelist or of an overflow page list. 7481 ** Verify that the number of pages on the list is N. 7482 */ 7483 static void checkList( 7484 IntegrityCk *pCheck, /* Integrity checking context */ 7485 int isFreeList, /* True for a freelist. 
False for overflow page list */ 7486 int iPage, /* Page number for first page in the list */ 7487 int N, /* Expected number of pages in the list */ 7488 char *zContext /* Context for error messages */ 7489 ){ 7490 int i; 7491 int expected = N; 7492 int iFirst = iPage; 7493 while( N-- > 0 && pCheck->mxErr ){ 7494 DbPage *pOvflPage; 7495 unsigned char *pOvflData; 7496 if( iPage<1 ){ 7497 checkAppendMsg(pCheck, zContext, 7498 "%d of %d pages missing from overflow list starting at %d", 7499 N+1, expected, iFirst); 7500 break; 7501 } 7502 if( checkRef(pCheck, iPage, zContext) ) break; 7503 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){ 7504 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage); 7505 break; 7506 } 7507 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 7508 if( isFreeList ){ 7509 int n = get4byte(&pOvflData[4]); 7510 #ifndef SQLITE_OMIT_AUTOVACUUM 7511 if( pCheck->pBt->autoVacuum ){ 7512 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext); 7513 } 7514 #endif 7515 if( n>(int)pCheck->pBt->usableSize/4-2 ){ 7516 checkAppendMsg(pCheck, zContext, 7517 "freelist leaf count too big on page %d", iPage); 7518 N--; 7519 }else{ 7520 for(i=0; i<n; i++){ 7521 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 7522 #ifndef SQLITE_OMIT_AUTOVACUUM 7523 if( pCheck->pBt->autoVacuum ){ 7524 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext); 7525 } 7526 #endif 7527 checkRef(pCheck, iFreePage, zContext); 7528 } 7529 N -= n; 7530 } 7531 } 7532 #ifndef SQLITE_OMIT_AUTOVACUUM 7533 else{ 7534 /* If this database supports auto-vacuum and iPage is not the last 7535 ** page in this overflow list, check that the pointer-map entry for 7536 ** the following page matches iPage. 
7537 */ 7538 if( pCheck->pBt->autoVacuum && N>0 ){ 7539 i = get4byte(pOvflData); 7540 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext); 7541 } 7542 } 7543 #endif 7544 iPage = get4byte(pOvflData); 7545 sqlite3PagerUnref(pOvflPage); 7546 } 7547 } 7548 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7549 7550 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7551 /* 7552 ** Do various sanity checks on a single page of a tree. Return 7553 ** the tree depth. Root pages return 0. Parents of root pages 7554 ** return 1, and so forth. 7555 ** 7556 ** These checks are done: 7557 ** 7558 ** 1. Make sure that cells and freeblocks do not overlap 7559 ** but combine to completely cover the page. 7560 ** NO 2. Make sure cell keys are in order. 7561 ** NO 3. Make sure no key is less than or equal to zLowerBound. 7562 ** NO 4. Make sure no key is greater than or equal to zUpperBound. 7563 ** 5. Check the integrity of overflow pages. 7564 ** 6. Recursively call checkTreePage on all children. 7565 ** 7. Verify that the depth of all children is the same. 7566 ** 8. Make sure this page is at least 33% full or else it is 7567 ** the root of the tree. 
7568 */ 7569 static int checkTreePage( 7570 IntegrityCk *pCheck, /* Context for the sanity check */ 7571 int iPage, /* Page number of the page to check */ 7572 char *zParentContext, /* Parent context */ 7573 i64 *pnParentMinKey, 7574 i64 *pnParentMaxKey 7575 ){ 7576 MemPage *pPage; 7577 int i, rc, depth, d2, pgno, cnt; 7578 int hdr, cellStart; 7579 int nCell; 7580 u8 *data; 7581 BtShared *pBt; 7582 int usableSize; 7583 char zContext[100]; 7584 char *hit = 0; 7585 i64 nMinKey = 0; 7586 i64 nMaxKey = 0; 7587 7588 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage); 7589 7590 /* Check that the page exists 7591 */ 7592 pBt = pCheck->pBt; 7593 usableSize = pBt->usableSize; 7594 if( iPage==0 ) return 0; 7595 if( checkRef(pCheck, iPage, zParentContext) ) return 0; 7596 if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ 7597 checkAppendMsg(pCheck, zContext, 7598 "unable to get the page. error code=%d", rc); 7599 return 0; 7600 } 7601 7602 /* Clear MemPage.isInit to make sure the corruption detection code in 7603 ** btreeInitPage() is executed. */ 7604 pPage->isInit = 0; 7605 if( (rc = btreeInitPage(pPage))!=0 ){ 7606 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ 7607 checkAppendMsg(pCheck, zContext, 7608 "btreeInitPage() returns error code %d", rc); 7609 releasePage(pPage); 7610 return 0; 7611 } 7612 7613 /* Check out all the cells. 7614 */ 7615 depth = 0; 7616 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){ 7617 u8 *pCell; 7618 u32 sz; 7619 CellInfo info; 7620 7621 /* Check payload overflow pages 7622 */ 7623 sqlite3_snprintf(sizeof(zContext), zContext, 7624 "On tree page %d cell %d: ", iPage, i); 7625 pCell = findCell(pPage,i); 7626 btreeParseCellPtr(pPage, pCell, &info); 7627 sz = info.nData; 7628 if( !pPage->intKey ) sz += (int)info.nKey; 7629 /* For intKey pages, check that the keys are in order. 
7630 */ 7631 else if( i==0 ) nMinKey = nMaxKey = info.nKey; 7632 else{ 7633 if( info.nKey <= nMaxKey ){ 7634 checkAppendMsg(pCheck, zContext, 7635 "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey); 7636 } 7637 nMaxKey = info.nKey; 7638 } 7639 assert( sz==info.nPayload ); 7640 if( (sz>info.nLocal) 7641 && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize]) 7642 ){ 7643 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); 7644 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); 7645 #ifndef SQLITE_OMIT_AUTOVACUUM 7646 if( pBt->autoVacuum ){ 7647 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext); 7648 } 7649 #endif 7650 checkList(pCheck, 0, pgnoOvfl, nPage, zContext); 7651 } 7652 7653 /* Check sanity of left child page. 7654 */ 7655 if( !pPage->leaf ){ 7656 pgno = get4byte(pCell); 7657 #ifndef SQLITE_OMIT_AUTOVACUUM 7658 if( pBt->autoVacuum ){ 7659 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); 7660 } 7661 #endif 7662 d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey); 7663 if( i>0 && d2!=depth ){ 7664 checkAppendMsg(pCheck, zContext, "Child page depth differs"); 7665 } 7666 depth = d2; 7667 } 7668 } 7669 7670 if( !pPage->leaf ){ 7671 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 7672 sqlite3_snprintf(sizeof(zContext), zContext, 7673 "On page %d at right child: ", iPage); 7674 #ifndef SQLITE_OMIT_AUTOVACUUM 7675 if( pBt->autoVacuum ){ 7676 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); 7677 } 7678 #endif 7679 checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey); 7680 } 7681 7682 /* For intKey leaf pages, check that the min/max keys are in order 7683 ** with any left/parent/right pages. 
7684 */ 7685 if( pPage->leaf && pPage->intKey ){ 7686 /* if we are a left child page */ 7687 if( pnParentMinKey ){ 7688 /* if we are the left most child page */ 7689 if( !pnParentMaxKey ){ 7690 if( nMaxKey > *pnParentMinKey ){ 7691 checkAppendMsg(pCheck, zContext, 7692 "Rowid %lld out of order (max larger than parent min of %lld)", 7693 nMaxKey, *pnParentMinKey); 7694 } 7695 }else{ 7696 if( nMinKey <= *pnParentMinKey ){ 7697 checkAppendMsg(pCheck, zContext, 7698 "Rowid %lld out of order (min less than parent min of %lld)", 7699 nMinKey, *pnParentMinKey); 7700 } 7701 if( nMaxKey > *pnParentMaxKey ){ 7702 checkAppendMsg(pCheck, zContext, 7703 "Rowid %lld out of order (max larger than parent max of %lld)", 7704 nMaxKey, *pnParentMaxKey); 7705 } 7706 *pnParentMinKey = nMaxKey; 7707 } 7708 /* else if we're a right child page */ 7709 } else if( pnParentMaxKey ){ 7710 if( nMinKey <= *pnParentMaxKey ){ 7711 checkAppendMsg(pCheck, zContext, 7712 "Rowid %lld out of order (min less than parent max of %lld)", 7713 nMinKey, *pnParentMaxKey); 7714 } 7715 } 7716 } 7717 7718 /* Check for complete coverage of the page 7719 */ 7720 data = pPage->aData; 7721 hdr = pPage->hdrOffset; 7722 hit = sqlite3PageMalloc( pBt->pageSize ); 7723 if( hit==0 ){ 7724 pCheck->mallocFailed = 1; 7725 }else{ 7726 int contentOffset = get2byteNotZero(&data[hdr+5]); 7727 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ 7728 memset(hit+contentOffset, 0, usableSize-contentOffset); 7729 memset(hit, 1, contentOffset); 7730 nCell = get2byte(&data[hdr+3]); 7731 cellStart = hdr + 12 - 4*pPage->leaf; 7732 for(i=0; i<nCell; i++){ 7733 int pc = get2byte(&data[cellStart+i*2]); 7734 u32 size = 65536; 7735 int j; 7736 if( pc<=usableSize-4 ){ 7737 size = cellSizePtr(pPage, &data[pc]); 7738 } 7739 if( (int)(pc+size-1)>=usableSize ){ 7740 checkAppendMsg(pCheck, 0, 7741 "Corruption detected in cell %d on page %d",i,iPage); 7742 }else{ 7743 for(j=pc+size-1; j>=pc; j--) hit[j]++; 7744 } 7745 } 7746 i = 
get2byte(&data[hdr+1]); 7747 while( i>0 ){ 7748 int size, j; 7749 assert( i<=usableSize-4 ); /* Enforced by btreeInitPage() */ 7750 size = get2byte(&data[i+2]); 7751 assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */ 7752 for(j=i+size-1; j>=i; j--) hit[j]++; 7753 j = get2byte(&data[i]); 7754 assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */ 7755 assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */ 7756 i = j; 7757 } 7758 for(i=cnt=0; i<usableSize; i++){ 7759 if( hit[i]==0 ){ 7760 cnt++; 7761 }else if( hit[i]>1 ){ 7762 checkAppendMsg(pCheck, 0, 7763 "Multiple uses for byte %d of page %d", i, iPage); 7764 break; 7765 } 7766 } 7767 if( cnt!=data[hdr+7] ){ 7768 checkAppendMsg(pCheck, 0, 7769 "Fragmentation of %d bytes reported as %d on page %d", 7770 cnt, data[hdr+7], iPage); 7771 } 7772 } 7773 sqlite3PageFree(hit); 7774 releasePage(pPage); 7775 return depth+1; 7776 } 7777 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7778 7779 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7780 /* 7781 ** This routine does a complete check of the given BTree file. aRoot[] is 7782 ** an array of pages numbers were each page number is the root page of 7783 ** a table. nRoot is the number of entries in aRoot. 7784 ** 7785 ** A read-only or read-write transaction must be opened before calling 7786 ** this function. 7787 ** 7788 ** Write the number of error seen in *pnErr. Except for some memory 7789 ** allocation errors, an error message held in memory obtained from 7790 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is 7791 ** returned. If a memory allocation error occurs, NULL is returned. 
7792 */ 7793 char *sqlite3BtreeIntegrityCheck( 7794 Btree *p, /* The btree to be checked */ 7795 int *aRoot, /* An array of root pages numbers for individual trees */ 7796 int nRoot, /* Number of entries in aRoot[] */ 7797 int mxErr, /* Stop reporting errors after this many */ 7798 int *pnErr /* Write number of errors seen to this variable */ 7799 ){ 7800 Pgno i; 7801 int nRef; 7802 IntegrityCk sCheck; 7803 BtShared *pBt = p->pBt; 7804 char zErr[100]; 7805 7806 sqlite3BtreeEnter(p); 7807 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); 7808 nRef = sqlite3PagerRefcount(pBt->pPager); 7809 sCheck.pBt = pBt; 7810 sCheck.pPager = pBt->pPager; 7811 sCheck.nPage = btreePagecount(sCheck.pBt); 7812 sCheck.mxErr = mxErr; 7813 sCheck.nErr = 0; 7814 sCheck.mallocFailed = 0; 7815 *pnErr = 0; 7816 if( sCheck.nPage==0 ){ 7817 sqlite3BtreeLeave(p); 7818 return 0; 7819 } 7820 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); 7821 if( !sCheck.anRef ){ 7822 *pnErr = 1; 7823 sqlite3BtreeLeave(p); 7824 return 0; 7825 } 7826 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } 7827 i = PENDING_BYTE_PAGE(pBt); 7828 if( i<=sCheck.nPage ){ 7829 sCheck.anRef[i] = 1; 7830 } 7831 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000); 7832 sCheck.errMsg.useMalloc = 2; 7833 7834 /* Check the integrity of the freelist 7835 */ 7836 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 7837 get4byte(&pBt->pPage1->aData[36]), "Main freelist: "); 7838 7839 /* Check all the tables. 
7840 */ 7841 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ 7842 if( aRoot[i]==0 ) continue; 7843 #ifndef SQLITE_OMIT_AUTOVACUUM 7844 if( pBt->autoVacuum && aRoot[i]>1 ){ 7845 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0); 7846 } 7847 #endif 7848 checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL); 7849 } 7850 7851 /* Make sure every page in the file is referenced 7852 */ 7853 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ 7854 #ifdef SQLITE_OMIT_AUTOVACUUM 7855 if( sCheck.anRef[i]==0 ){ 7856 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 7857 } 7858 #else 7859 /* If the database supports auto-vacuum, make sure no tables contain 7860 ** references to pointer-map pages. 7861 */ 7862 if( sCheck.anRef[i]==0 && 7863 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 7864 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 7865 } 7866 if( sCheck.anRef[i]!=0 && 7867 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 7868 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i); 7869 } 7870 #endif 7871 } 7872 7873 /* Make sure this analysis did not leave any unref() pages. 7874 ** This is an internal consistency check; an integrity check 7875 ** of the integrity check. 7876 */ 7877 if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){ 7878 checkAppendMsg(&sCheck, 0, 7879 "Outstanding page count goes from %d to %d during this analysis", 7880 nRef, sqlite3PagerRefcount(pBt->pPager) 7881 ); 7882 } 7883 7884 /* Clean up and report errors. 7885 */ 7886 sqlite3BtreeLeave(p); 7887 sqlite3_free(sCheck.anRef); 7888 if( sCheck.mallocFailed ){ 7889 sqlite3StrAccumReset(&sCheck.errMsg); 7890 *pnErr = sCheck.nErr+1; 7891 return 0; 7892 } 7893 *pnErr = sCheck.nErr; 7894 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg); 7895 return sqlite3StrAccumFinish(&sCheck.errMsg); 7896 } 7897 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7898 7899 /* 7900 ** Return the full pathname of the underlying database file. 
7901 ** 7902 ** The pager filename is invariant as long as the pager is 7903 ** open so it is safe to access without the BtShared mutex. 7904 */ 7905 const char *sqlite3BtreeGetFilename(Btree *p){ 7906 assert( p->pBt->pPager!=0 ); 7907 return sqlite3PagerFilename(p->pBt->pPager); 7908 } 7909 7910 /* 7911 ** Return the pathname of the journal file for this database. The return 7912 ** value of this routine is the same regardless of whether the journal file 7913 ** has been created or not. 7914 ** 7915 ** The pager journal filename is invariant as long as the pager is 7916 ** open so it is safe to access without the BtShared mutex. 7917 */ 7918 const char *sqlite3BtreeGetJournalname(Btree *p){ 7919 assert( p->pBt->pPager!=0 ); 7920 return sqlite3PagerJournalname(p->pBt->pPager); 7921 } 7922 7923 /* 7924 ** Return non-zero if a transaction is active. 7925 */ 7926 int sqlite3BtreeIsInTrans(Btree *p){ 7927 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 7928 return (p && (p->inTrans==TRANS_WRITE)); 7929 } 7930 7931 #ifndef SQLITE_OMIT_WAL 7932 /* 7933 ** Run a checkpoint on the Btree passed as the first argument. 7934 ** 7935 ** Return SQLITE_LOCKED if this or any other connection has an open 7936 ** transaction on the shared-cache the argument Btree is connected to. 7937 */ 7938 int sqlite3BtreeCheckpoint(Btree *p){ 7939 int rc = SQLITE_OK; 7940 if( p ){ 7941 BtShared *pBt = p->pBt; 7942 sqlite3BtreeEnter(p); 7943 if( pBt->inTransaction!=TRANS_NONE ){ 7944 rc = SQLITE_LOCKED; 7945 }else{ 7946 rc = sqlite3PagerCheckpoint(pBt->pPager); 7947 } 7948 sqlite3BtreeLeave(p); 7949 } 7950 return rc; 7951 } 7952 #endif 7953 7954 /* 7955 ** Return non-zero if a read (or write) transaction is active. 
7956 */ 7957 int sqlite3BtreeIsInReadTrans(Btree *p){ 7958 assert( p ); 7959 assert( sqlite3_mutex_held(p->db->mutex) ); 7960 return p->inTrans!=TRANS_NONE; 7961 } 7962 7963 int sqlite3BtreeIsInBackup(Btree *p){ 7964 assert( p ); 7965 assert( sqlite3_mutex_held(p->db->mutex) ); 7966 return p->nBackup!=0; 7967 } 7968 7969 /* 7970 ** This function returns a pointer to a blob of memory associated with 7971 ** a single shared-btree. The memory is used by client code for its own 7972 ** purposes (for example, to store a high-level schema associated with 7973 ** the shared-btree). The btree layer manages reference counting issues. 7974 ** 7975 ** The first time this is called on a shared-btree, nBytes bytes of memory 7976 ** are allocated, zeroed, and returned to the caller. For each subsequent 7977 ** call the nBytes parameter is ignored and a pointer to the same blob 7978 ** of memory returned. 7979 ** 7980 ** If the nBytes parameter is 0 and the blob of memory has not yet been 7981 ** allocated, a null pointer is returned. If the blob has already been 7982 ** allocated, it is returned as normal. 7983 ** 7984 ** Just before the shared-btree is closed, the function passed as the 7985 ** xFree argument when the memory allocation was made is invoked on the 7986 ** blob of allocated memory. This function should not call sqlite3_free() 7987 ** on the memory, the btree layer does that. 7988 */ 7989 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 7990 BtShared *pBt = p->pBt; 7991 sqlite3BtreeEnter(p); 7992 if( !pBt->pSchema && nBytes ){ 7993 pBt->pSchema = sqlite3DbMallocZero(0, nBytes); 7994 pBt->xFreeSchema = xFree; 7995 } 7996 sqlite3BtreeLeave(p); 7997 return pBt->pSchema; 7998 } 7999 8000 /* 8001 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 8002 ** btree as the argument handle holds an exclusive lock on the 8003 ** sqlite_master table. Otherwise SQLITE_OK. 
8004 */ 8005 int sqlite3BtreeSchemaLocked(Btree *p){ 8006 int rc; 8007 assert( sqlite3_mutex_held(p->db->mutex) ); 8008 sqlite3BtreeEnter(p); 8009 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); 8010 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); 8011 sqlite3BtreeLeave(p); 8012 return rc; 8013 } 8014 8015 8016 #ifndef SQLITE_OMIT_SHARED_CACHE 8017 /* 8018 ** Obtain a lock on the table whose root page is iTab. The 8019 ** lock is a write lock if isWritelock is true or a read lock 8020 ** if it is false. 8021 */ 8022 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 8023 int rc = SQLITE_OK; 8024 assert( p->inTrans!=TRANS_NONE ); 8025 if( p->sharable ){ 8026 u8 lockType = READ_LOCK + isWriteLock; 8027 assert( READ_LOCK+1==WRITE_LOCK ); 8028 assert( isWriteLock==0 || isWriteLock==1 ); 8029 8030 sqlite3BtreeEnter(p); 8031 rc = querySharedCacheTableLock(p, iTab, lockType); 8032 if( rc==SQLITE_OK ){ 8033 rc = setSharedCacheTableLock(p, iTab, lockType); 8034 } 8035 sqlite3BtreeLeave(p); 8036 } 8037 return rc; 8038 } 8039 #endif 8040 8041 #ifndef SQLITE_OMIT_INCRBLOB 8042 /* 8043 ** Argument pCsr must be a cursor opened for writing on an 8044 ** INTKEY table currently pointing at a valid table entry. 8045 ** This function modifies the data stored as part of that entry. 8046 ** 8047 ** Only the data content may only be modified, it is not possible to 8048 ** change the length of the data stored. If this function is called with 8049 ** parameters that attempt to write past the end of the existing data, 8050 ** no modifications are made and SQLITE_CORRUPT is returned. 
8051 */ 8052 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 8053 int rc; 8054 assert( cursorHoldsMutex(pCsr) ); 8055 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 8056 assert( pCsr->isIncrblobHandle ); 8057 8058 rc = restoreCursorPosition(pCsr); 8059 if( rc!=SQLITE_OK ){ 8060 return rc; 8061 } 8062 assert( pCsr->eState!=CURSOR_REQUIRESEEK ); 8063 if( pCsr->eState!=CURSOR_VALID ){ 8064 return SQLITE_ABORT; 8065 } 8066 8067 /* Check some assumptions: 8068 ** (a) the cursor is open for writing, 8069 ** (b) there is a read/write transaction open, 8070 ** (c) the connection holds a write-lock on the table (if required), 8071 ** (d) there are no conflicting read-locks, and 8072 ** (e) the cursor points at a valid row of an intKey table. 8073 */ 8074 if( !pCsr->wrFlag ){ 8075 return SQLITE_READONLY; 8076 } 8077 assert( !pCsr->pBt->readOnly && pCsr->pBt->inTransaction==TRANS_WRITE ); 8078 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) ); 8079 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) ); 8080 assert( pCsr->apPage[pCsr->iPage]->intKey ); 8081 8082 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); 8083 } 8084 8085 /* 8086 ** Set a flag on this cursor to cache the locations of pages from the 8087 ** overflow list for the current row. This is used by cursors opened 8088 ** for incremental blob IO only. 8089 ** 8090 ** This function sets a flag only. The actual page location cache 8091 ** (stored in BtCursor.aOverflow[]) is allocated and used by function 8092 ** accessPayload() (the worker function for sqlite3BtreeData() and 8093 ** sqlite3BtreePutData()). 
8094 */ 8095 void sqlite3BtreeCacheOverflow(BtCursor *pCur){ 8096 assert( cursorHoldsMutex(pCur) ); 8097 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 8098 assert(!pCur->isIncrblobHandle); 8099 assert(!pCur->aOverflow); 8100 pCur->isIncrblobHandle = 1; 8101 } 8102 #endif 8103 8104 /* 8105 ** Set both the "read version" (single byte at byte offset 18) and 8106 ** "write version" (single byte at byte offset 19) fields in the database 8107 ** header to iVersion. 8108 */ 8109 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ 8110 BtShared *pBt = pBtree->pBt; 8111 int rc; /* Return code */ 8112 8113 assert( pBtree->inTrans==TRANS_NONE ); 8114 assert( iVersion==1 || iVersion==2 ); 8115 8116 /* If setting the version fields to 1, do not automatically open the 8117 ** WAL connection, even if the version fields are currently set to 2. 8118 */ 8119 pBt->doNotUseWAL = (u8)(iVersion==1); 8120 8121 rc = sqlite3BtreeBeginTrans(pBtree, 0); 8122 if( rc==SQLITE_OK ){ 8123 u8 *aData = pBt->pPage1->aData; 8124 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){ 8125 rc = sqlite3BtreeBeginTrans(pBtree, 2); 8126 if( rc==SQLITE_OK ){ 8127 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 8128 if( rc==SQLITE_OK ){ 8129 aData[18] = (u8)iVersion; 8130 aData[19] = (u8)iVersion; 8131 } 8132 } 8133 } 8134 } 8135 8136 pBt->doNotUseWAL = 0; 8137 return rc; 8138 } 8139