1 /* 2 ** 2004 April 6 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** This file implements a external (disk-based) database using BTrees. 13 ** See the header comment on "btreeInt.h" for additional information. 14 ** Including a description of file format and an overview of operation. 15 */ 16 #include "btreeInt.h" 17 18 /* 19 ** The header string that appears at the beginning of every 20 ** SQLite database. 21 */ 22 static const char zMagicHeader[] = SQLITE_FILE_HEADER; 23 24 /* 25 ** Set this global variable to 1 to enable tracing using the TRACE 26 ** macro. 27 */ 28 #if 0 29 int sqlite3BtreeTrace=1; /* True to enable tracing */ 30 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} 31 #else 32 # define TRACE(X) 33 #endif 34 35 /* 36 ** Extract a 2-byte big-endian integer from an array of unsigned bytes. 37 ** But if the value is zero, make it 65536. 38 ** 39 ** This routine is used to extract the "offset to cell content area" value 40 ** from the header of a btree page. If the page size is 65536 and the page 41 ** is empty, the offset should be 65536, but the 2-byte value stores zero. 42 ** This routine makes the necessary adjustment to 65536. 43 */ 44 #define get2byteNotZero(X) (((((int)get2byte(X))-1)&0xffff)+1) 45 46 #ifndef SQLITE_OMIT_SHARED_CACHE 47 /* 48 ** A list of BtShared objects that are eligible for participation 49 ** in shared cache. This variable has file scope during normal builds, 50 ** but the test harness needs to access it so we make it global for 51 ** test builds. 52 ** 53 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER. 54 */ 55 #ifdef SQLITE_TEST 56 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 57 #else 58 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 59 #endif 60 #endif /* SQLITE_OMIT_SHARED_CACHE */ 61 62 #ifndef SQLITE_OMIT_SHARED_CACHE 63 /* 64 ** Enable or disable the shared pager and schema features. 65 ** 66 ** This routine has no effect on existing database connections. 67 ** The shared cache setting effects only future calls to 68 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). 69 */ 70 int sqlite3_enable_shared_cache(int enable){ 71 sqlite3GlobalConfig.sharedCacheEnabled = enable; 72 return SQLITE_OK; 73 } 74 #endif 75 76 77 78 #ifdef SQLITE_OMIT_SHARED_CACHE 79 /* 80 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(), 81 ** and clearAllSharedCacheTableLocks() 82 ** manipulate entries in the BtShared.pLock linked list used to store 83 ** shared-cache table level locks. If the library is compiled with the 84 ** shared-cache feature disabled, then there is only ever one user 85 ** of each BtShared structure and so this locking is not necessary. 86 ** So define the lock related functions as no-ops. 87 */ 88 #define querySharedCacheTableLock(a,b,c) SQLITE_OK 89 #define setSharedCacheTableLock(a,b,c) SQLITE_OK 90 #define clearAllSharedCacheTableLocks(a) 91 #define downgradeAllSharedCacheTableLocks(a) 92 #define hasSharedCacheTableLock(a,b,c,d) 1 93 #define hasReadConflicts(a, b) 0 94 #endif 95 96 #ifndef SQLITE_OMIT_SHARED_CACHE 97 98 #ifdef SQLITE_DEBUG 99 /* 100 **** This function is only used as part of an assert() statement. *** 101 ** 102 ** Check to see if pBtree holds the required locks to read or write to the 103 ** table with root page iRoot. Return 1 if it does and 0 if not. 104 ** 105 ** For example, when writing to a table with root-page iRoot via 106 ** Btree connection pBtree: 107 ** 108 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) ); 109 ** 110 ** When writing to an index that resides in a sharable database, the 111 ** caller should have first obtained a lock specifying the root page of 112 ** the corresponding table. This makes things a bit more complicated, 113 ** as this module treats each table as a separate structure. To determine 114 ** the table corresponding to the index being written, this 115 ** function has to search through the database schema. 116 ** 117 ** Instead of a lock on the table/index rooted at page iRoot, the caller may 118 ** hold a write-lock on the schema table (root page 1). This is also 119 ** acceptable. 120 */ 121 static int hasSharedCacheTableLock( 122 Btree *pBtree, /* Handle that must hold lock */ 123 Pgno iRoot, /* Root page of b-tree */ 124 int isIndex, /* True if iRoot is the root of an index b-tree */ 125 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */ 126 ){ 127 Schema *pSchema = (Schema *)pBtree->pBt->pSchema; 128 Pgno iTab = 0; 129 BtLock *pLock; 130 131 /* If this database is not shareable, or if the client is reading 132 ** and has the read-uncommitted flag set, then no lock is required. 133 ** Return true immediately. 134 */ 135 if( (pBtree->sharable==0) 136 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted)) 137 ){ 138 return 1; 139 } 140 141 /* If the client is reading or writing an index and the schema is 142 ** not loaded, then it is too difficult to actually check to see if 143 ** the correct locks are held. So do not bother - just return true. 144 ** This case does not come up very often anyhow. 145 */ 146 if( isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0) ){ 147 return 1; 148 } 149 150 /* Figure out the root-page that the lock should be held on. For table 151 ** b-trees, this is just the root page of the b-tree being read or 152 ** written. For index b-trees, it is the root page of the associated 153 ** table. */ 154 if( isIndex ){ 155 HashElem *p; 156 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ 157 Index *pIdx = (Index *)sqliteHashData(p); 158 if( pIdx->tnum==(int)iRoot ){ 159 iTab = pIdx->pTable->tnum; 160 } 161 } 162 }else{ 163 iTab = iRoot; 164 } 165 166 /* Search for the required lock. Either a write-lock on root-page iTab, a 167 ** write-lock on the schema table, or (if the client is reading) a 168 ** read-lock on iTab will suffice. Return 1 if any of these are found. */ 169 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){ 170 if( pLock->pBtree==pBtree 171 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)) 172 && pLock->eLock>=eLockType 173 ){ 174 return 1; 175 } 176 } 177 178 /* Failed to find the required lock. */ 179 return 0; 180 } 181 #endif /* SQLITE_DEBUG */ 182 183 #ifdef SQLITE_DEBUG 184 /* 185 **** This function may be used as part of assert() statements only. **** 186 ** 187 ** Return true if it would be illegal for pBtree to write into the 188 ** table or index rooted at iRoot because other shared connections are 189 ** simultaneously reading that same table or index. 190 ** 191 ** It is illegal for pBtree to write if some other Btree object that 192 ** shares the same BtShared object is currently reading or writing 193 ** the iRoot table. Except, if the other Btree object has the 194 ** read-uncommitted flag set, then it is OK for the other object to 195 ** have a read cursor. 196 ** 197 ** For example, before writing to any part of the table or index 198 ** rooted at page iRoot, one should call: 199 ** 200 ** assert( !hasReadConflicts(pBtree, iRoot) ); 201 */ 202 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ 203 BtCursor *p; 204 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 205 if( p->pgnoRoot==iRoot 206 && p->pBtree!=pBtree 207 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted) 208 ){ 209 return 1; 210 } 211 } 212 return 0; 213 } 214 #endif /* #ifdef SQLITE_DEBUG */ 215 216 /* 217 ** Query to see if Btree handle p may obtain a lock of type eLock 218 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return 219 ** SQLITE_OK if the lock may be obtained (by calling 220 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not. 221 */ 222 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ 223 BtShared *pBt = p->pBt; 224 BtLock *pIter; 225 226 assert( sqlite3BtreeHoldsMutex(p) ); 227 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 228 assert( p->db!=0 ); 229 assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 ); 230 231 /* If requesting a write-lock, then the Btree must have an open write 232 ** transaction on this file. And, obviously, for this to be so there 233 ** must be an open write transaction on the file itself. 234 */ 235 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); 236 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); 237 238 /* This routine is a no-op if the shared-cache is not enabled */ 239 if( !p->sharable ){ 240 return SQLITE_OK; 241 } 242 243 /* If some other connection is holding an exclusive lock, the 244 ** requested lock may not be obtained. 245 */ 246 if( pBt->pWriter!=p && pBt->isExclusive ){ 247 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db); 248 return SQLITE_LOCKED_SHAREDCACHE; 249 } 250 251 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 252 /* The condition (pIter->eLock!=eLock) in the following if(...) 253 ** statement is a simplification of: 254 ** 255 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) 256 ** 257 ** since we know that if eLock==WRITE_LOCK, then no other connection 258 ** may hold a WRITE_LOCK on any table in this file (since there can 259 ** only be a single writer). 260 */ 261 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK ); 262 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK); 263 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){ 264 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db); 265 if( eLock==WRITE_LOCK ){ 266 assert( p==pBt->pWriter ); 267 pBt->isPending = 1; 268 } 269 return SQLITE_LOCKED_SHAREDCACHE; 270 } 271 } 272 return SQLITE_OK; 273 } 274 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 275 276 #ifndef SQLITE_OMIT_SHARED_CACHE 277 /* 278 ** Add a lock on the table with root-page iTable to the shared-btree used 279 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 280 ** WRITE_LOCK. 281 ** 282 ** This function assumes the following: 283 ** 284 ** (a) The specified Btree object p is connected to a sharable 285 ** database (one with the BtShared.sharable flag set), and 286 ** 287 ** (b) No other Btree objects hold a lock that conflicts 288 ** with the requested lock (i.e. querySharedCacheTableLock() has 289 ** already been called and returned SQLITE_OK). 290 ** 291 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 292 ** is returned if a malloc attempt fails. 293 */ 294 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ 295 BtShared *pBt = p->pBt; 296 BtLock *pLock = 0; 297 BtLock *pIter; 298 299 assert( sqlite3BtreeHoldsMutex(p) ); 300 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 301 assert( p->db!=0 ); 302 303 /* A connection with the read-uncommitted flag set will never try to 304 ** obtain a read-lock using this function. The only read-lock obtained 305 ** by a connection in read-uncommitted mode is on the sqlite_master 306 ** table, and that lock is obtained in BtreeBeginTrans(). */ 307 assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK ); 308 309 /* This function should only be called on a sharable b-tree after it 310 ** has been determined that no other b-tree holds a conflicting lock. */ 311 assert( p->sharable ); 312 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); 313 314 /* First search the list for an existing lock on this table. */ 315 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 316 if( pIter->iTable==iTable && pIter->pBtree==p ){ 317 pLock = pIter; 318 break; 319 } 320 } 321 322 /* If the above search did not find a BtLock struct associating Btree p 323 ** with table iTable, allocate one and link it into the list. 324 */ 325 if( !pLock ){ 326 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 327 if( !pLock ){ 328 return SQLITE_NOMEM; 329 } 330 pLock->iTable = iTable; 331 pLock->pBtree = p; 332 pLock->pNext = pBt->pLock; 333 pBt->pLock = pLock; 334 } 335 336 /* Set the BtLock.eLock variable to the maximum of the current lock 337 ** and the requested lock. This means if a write-lock was already held 338 ** and a read-lock requested, we don't incorrectly downgrade the lock. 339 */ 340 assert( WRITE_LOCK>READ_LOCK ); 341 if( eLock>pLock->eLock ){ 342 pLock->eLock = eLock; 343 } 344 345 return SQLITE_OK; 346 } 347 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 348 349 #ifndef SQLITE_OMIT_SHARED_CACHE 350 /* 351 ** Release all the table locks (locks obtained via calls to 352 ** the setSharedCacheTableLock() procedure) held by Btree object p. 353 ** 354 ** This function assumes that Btree p has an open read or write 355 ** transaction. If it does not, then the BtShared.isPending variable 356 ** may be incorrectly cleared. 357 */ 358 static void clearAllSharedCacheTableLocks(Btree *p){ 359 BtShared *pBt = p->pBt; 360 BtLock **ppIter = &pBt->pLock; 361 362 assert( sqlite3BtreeHoldsMutex(p) ); 363 assert( p->sharable || 0==*ppIter ); 364 assert( p->inTrans>0 ); 365 366 while( *ppIter ){ 367 BtLock *pLock = *ppIter; 368 assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree ); 369 assert( pLock->pBtree->inTrans>=pLock->eLock ); 370 if( pLock->pBtree==p ){ 371 *ppIter = pLock->pNext; 372 assert( pLock->iTable!=1 || pLock==&p->lock ); 373 if( pLock->iTable!=1 ){ 374 sqlite3_free(pLock); 375 } 376 }else{ 377 ppIter = &pLock->pNext; 378 } 379 } 380 381 assert( pBt->isPending==0 || pBt->pWriter ); 382 if( pBt->pWriter==p ){ 383 pBt->pWriter = 0; 384 pBt->isExclusive = 0; 385 pBt->isPending = 0; 386 }else if( pBt->nTransaction==2 ){ 387 /* This function is called when Btree p is concluding its 388 ** transaction. If there currently exists a writer, and p is not 389 ** that writer, then the number of locks held by connections other 390 ** than the writer must be about to drop to zero. In this case 391 ** set the isPending flag to 0. 392 ** 393 ** If there is not currently a writer, then BtShared.isPending must 394 ** be zero already. So this next line is harmless in that case. 395 */ 396 pBt->isPending = 0; 397 } 398 } 399 400 /* 401 ** This function changes all write-locks held by Btree p into read-locks. 402 */ 403 static void downgradeAllSharedCacheTableLocks(Btree *p){ 404 BtShared *pBt = p->pBt; 405 if( pBt->pWriter==p ){ 406 BtLock *pLock; 407 pBt->pWriter = 0; 408 pBt->isExclusive = 0; 409 pBt->isPending = 0; 410 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ 411 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p ); 412 pLock->eLock = READ_LOCK; 413 } 414 } 415 } 416 417 #endif /* SQLITE_OMIT_SHARED_CACHE */ 418 419 static void releasePage(MemPage *pPage); /* Forward reference */ 420 421 /* 422 ***** This routine is used inside of assert() only **** 423 ** 424 ** Verify that the cursor holds the mutex on its BtShared 425 */ 426 #ifdef SQLITE_DEBUG 427 static int cursorHoldsMutex(BtCursor *p){ 428 return sqlite3_mutex_held(p->pBt->mutex); 429 } 430 #endif 431 432 433 #ifndef SQLITE_OMIT_INCRBLOB 434 /* 435 ** Invalidate the overflow page-list cache for cursor pCur, if any. 436 */ 437 static void invalidateOverflowCache(BtCursor *pCur){ 438 assert( cursorHoldsMutex(pCur) ); 439 sqlite3_free(pCur->aOverflow); 440 pCur->aOverflow = 0; 441 } 442 443 /* 444 ** Invalidate the overflow page-list cache for all cursors opened 445 ** on the shared btree structure pBt. 446 */ 447 static void invalidateAllOverflowCache(BtShared *pBt){ 448 BtCursor *p; 449 assert( sqlite3_mutex_held(pBt->mutex) ); 450 for(p=pBt->pCursor; p; p=p->pNext){ 451 invalidateOverflowCache(p); 452 } 453 } 454 455 /* 456 ** This function is called before modifying the contents of a table 457 ** to invalidate any incrblob cursors that are open on the 458 ** row or one of the rows being modified. 459 ** 460 ** If argument isClearTable is true, then the entire contents of the 461 ** table is about to be deleted. In this case invalidate all incrblob 462 ** cursors open on any row within the table with root-page pgnoRoot. 463 ** 464 ** Otherwise, if argument isClearTable is false, then the row with 465 ** rowid iRow is being replaced or deleted. In this case invalidate 466 ** only those incrblob cursors open on that specific row. 467 */ 468 static void invalidateIncrblobCursors( 469 Btree *pBtree, /* The database file to check */ 470 i64 iRow, /* The rowid that might be changing */ 471 int isClearTable /* True if all rows are being deleted */ 472 ){ 473 BtCursor *p; 474 BtShared *pBt = pBtree->pBt; 475 assert( sqlite3BtreeHoldsMutex(pBtree) ); 476 for(p=pBt->pCursor; p; p=p->pNext){ 477 if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){ 478 p->eState = CURSOR_INVALID; 479 } 480 } 481 } 482 483 #else 484 /* Stub functions when INCRBLOB is omitted */ 485 #define invalidateOverflowCache(x) 486 #define invalidateAllOverflowCache(x) 487 #define invalidateIncrblobCursors(x,y,z) 488 #endif /* SQLITE_OMIT_INCRBLOB */ 489 490 /* 491 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 492 ** when a page that previously contained data becomes a free-list leaf 493 ** page. 494 ** 495 ** The BtShared.pHasContent bitvec exists to work around an obscure 496 ** bug caused by the interaction of two useful IO optimizations surrounding 497 ** free-list leaf pages: 498 ** 499 ** 1) When all data is deleted from a page and the page becomes 500 ** a free-list leaf page, the page is not written to the database 501 ** (as free-list leaf pages contain no meaningful data). Sometimes 502 ** such a page is not even journalled (as it will not be modified, 503 ** why bother journalling it?). 504 ** 505 ** 2) When a free-list leaf page is reused, its content is not read 506 ** from the database or written to the journal file (why should it 507 ** be, if it is not at all meaningful?). 508 ** 509 ** By themselves, these optimizations work fine and provide a handy 510 ** performance boost to bulk delete or insert operations. However, if 511 ** a page is moved to the free-list and then reused within the same 512 ** transaction, a problem comes up. If the page is not journalled when 513 ** it is moved to the free-list and it is also not journalled when it 514 ** is extracted from the free-list and reused, then the original data 515 ** may be lost. In the event of a rollback, it may not be possible 516 ** to restore the database to its original configuration. 517 ** 518 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 519 ** moved to become a free-list leaf page, the corresponding bit is 520 ** set in the bitvec. Whenever a leaf page is extracted from the free-list, 521 ** optimization 2 above is omitted if the corresponding bit is already 522 ** set in BtShared.pHasContent. The contents of the bitvec are cleared 523 ** at the end of every transaction. 524 */ 525 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ 526 int rc = SQLITE_OK; 527 if( !pBt->pHasContent ){ 528 assert( pgno<=pBt->nPage ); 529 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage); 530 if( !pBt->pHasContent ){ 531 rc = SQLITE_NOMEM; 532 } 533 } 534 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){ 535 rc = sqlite3BitvecSet(pBt->pHasContent, pgno); 536 } 537 return rc; 538 } 539 540 /* 541 ** Query the BtShared.pHasContent vector. 542 ** 543 ** This function is called when a free-list leaf page is removed from the 544 ** free-list for reuse. It returns false if it is safe to retrieve the 545 ** page from the pager layer with the 'no-content' flag set. True otherwise. 546 */ 547 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ 548 Bitvec *p = pBt->pHasContent; 549 return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno))); 550 } 551 552 /* 553 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be 554 ** invoked at the conclusion of each write-transaction. 555 */ 556 static void btreeClearHasContent(BtShared *pBt){ 557 sqlite3BitvecDestroy(pBt->pHasContent); 558 pBt->pHasContent = 0; 559 } 560 561 /* 562 ** Save the current cursor position in the variables BtCursor.nKey 563 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 564 ** 565 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID) 566 ** prior to calling this routine. 567 */ 568 static int saveCursorPosition(BtCursor *pCur){ 569 int rc; 570 571 assert( CURSOR_VALID==pCur->eState ); 572 assert( 0==pCur->pKey ); 573 assert( cursorHoldsMutex(pCur) ); 574 575 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey); 576 assert( rc==SQLITE_OK ); /* KeySize() cannot fail */ 577 578 /* If this is an intKey table, then the above call to BtreeKeySize() 579 ** stores the integer key in pCur->nKey. In this case this value is 580 ** all that is required. Otherwise, if pCur is not open on an intKey 581 ** table, then malloc space for and store the pCur->nKey bytes of key 582 ** data. 583 */ 584 if( 0==pCur->apPage[0]->intKey ){ 585 void *pKey = sqlite3Malloc( (int)pCur->nKey ); 586 if( pKey ){ 587 rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey); 588 if( rc==SQLITE_OK ){ 589 pCur->pKey = pKey; 590 }else{ 591 sqlite3_free(pKey); 592 } 593 }else{ 594 rc = SQLITE_NOMEM; 595 } 596 } 597 assert( !pCur->apPage[0]->intKey || !pCur->pKey ); 598 599 if( rc==SQLITE_OK ){ 600 int i; 601 for(i=0; i<=pCur->iPage; i++){ 602 releasePage(pCur->apPage[i]); 603 pCur->apPage[i] = 0; 604 } 605 pCur->iPage = -1; 606 pCur->eState = CURSOR_REQUIRESEEK; 607 } 608 609 invalidateOverflowCache(pCur); 610 return rc; 611 } 612 613 /* 614 ** Save the positions of all cursors (except pExcept) that are open on 615 ** the table with root-page iRoot. Usually, this is called just before cursor 616 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()). 617 */ 618 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 619 BtCursor *p; 620 assert( sqlite3_mutex_held(pBt->mutex) ); 621 assert( pExcept==0 || pExcept->pBt==pBt ); 622 for(p=pBt->pCursor; p; p=p->pNext){ 623 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 624 p->eState==CURSOR_VALID ){ 625 int rc = saveCursorPosition(p); 626 if( SQLITE_OK!=rc ){ 627 return rc; 628 } 629 } 630 } 631 return SQLITE_OK; 632 } 633 634 /* 635 ** Clear the current cursor position. 636 */ 637 void sqlite3BtreeClearCursor(BtCursor *pCur){ 638 assert( cursorHoldsMutex(pCur) ); 639 sqlite3_free(pCur->pKey); 640 pCur->pKey = 0; 641 pCur->eState = CURSOR_INVALID; 642 } 643 644 /* 645 ** In this version of BtreeMoveto, pKey is a packed index record 646 ** such as is generated by the OP_MakeRecord opcode. Unpack the 647 ** record and then call BtreeMovetoUnpacked() to do the work. 648 */ 649 static int btreeMoveto( 650 BtCursor *pCur, /* Cursor open on the btree to be searched */ 651 const void *pKey, /* Packed key if the btree is an index */ 652 i64 nKey, /* Integer key for tables. Size of pKey for indices */ 653 int bias, /* Bias search to the high end */ 654 int *pRes /* Write search results here */ 655 ){ 656 int rc; /* Status code */ 657 UnpackedRecord *pIdxKey; /* Unpacked index key */ 658 char aSpace[150]; /* Temp space for pIdxKey - to avoid a malloc */ 659 660 if( pKey ){ 661 assert( nKey==(i64)(int)nKey ); 662 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, 663 aSpace, sizeof(aSpace)); 664 if( pIdxKey==0 ) return SQLITE_NOMEM; 665 }else{ 666 pIdxKey = 0; 667 } 668 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes); 669 if( pKey ){ 670 sqlite3VdbeDeleteUnpackedRecord(pIdxKey); 671 } 672 return rc; 673 } 674 675 /* 676 ** Restore the cursor to the position it was in (or as close to as possible) 677 ** when saveCursorPosition() was called. Note that this call deletes the 678 ** saved position info stored by saveCursorPosition(), so there can be 679 ** at most one effective restoreCursorPosition() call after each 680 ** saveCursorPosition(). 681 */ 682 static int btreeRestoreCursorPosition(BtCursor *pCur){ 683 int rc; 684 assert( cursorHoldsMutex(pCur) ); 685 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 686 if( pCur->eState==CURSOR_FAULT ){ 687 return pCur->skipNext; 688 } 689 pCur->eState = CURSOR_INVALID; 690 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext); 691 if( rc==SQLITE_OK ){ 692 sqlite3_free(pCur->pKey); 693 pCur->pKey = 0; 694 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 695 } 696 return rc; 697 } 698 699 #define restoreCursorPosition(p) \ 700 (p->eState>=CURSOR_REQUIRESEEK ? \ 701 btreeRestoreCursorPosition(p) : \ 702 SQLITE_OK) 703 704 /* 705 ** Determine whether or not a cursor has moved from the position it 706 ** was last placed at. Cursors can move when the row they are pointing 707 ** at is deleted out from under them. 708 ** 709 ** This routine returns an error code if something goes wrong. The 710 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not. 711 */ 712 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){ 713 int rc; 714 715 rc = restoreCursorPosition(pCur); 716 if( rc ){ 717 *pHasMoved = 1; 718 return rc; 719 } 720 if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){ 721 *pHasMoved = 1; 722 }else{ 723 *pHasMoved = 0; 724 } 725 return SQLITE_OK; 726 } 727 728 #ifndef SQLITE_OMIT_AUTOVACUUM 729 /* 730 ** Given a page number of a regular database page, return the page 731 ** number for the pointer-map page that contains the entry for the 732 ** input page number. 733 ** 734 ** Return 0 (not a valid page) for pgno==1 since there is 735 ** no pointer map associated with page 1. The integrity_check logic 736 ** requires that ptrmapPageno(*,1)!=1. 737 */ 738 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 739 int nPagesPerMapPage; 740 Pgno iPtrMap, ret; 741 assert( sqlite3_mutex_held(pBt->mutex) ); 742 if( pgno<2 ) return 0; 743 nPagesPerMapPage = (pBt->usableSize/5)+1; 744 iPtrMap = (pgno-2)/nPagesPerMapPage; 745 ret = (iPtrMap*nPagesPerMapPage) + 2; 746 if( ret==PENDING_BYTE_PAGE(pBt) ){ 747 ret++; 748 } 749 return ret; 750 } 751 752 /* 753 ** Write an entry into the pointer map. 754 ** 755 ** This routine updates the pointer map entry for page number 'key' 756 ** so that it maps to type 'eType' and parent page number 'pgno'. 757 ** 758 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is 759 ** a no-op. If an error occurs, the appropriate error code is written 760 ** into *pRC. 761 */ 762 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){ 763 DbPage *pDbPage; /* The pointer map page */ 764 u8 *pPtrmap; /* The pointer map data */ 765 Pgno iPtrmap; /* The pointer map page number */ 766 int offset; /* Offset in pointer map page */ 767 int rc; /* Return code from subfunctions */ 768 769 if( *pRC ) return; 770 771 assert( sqlite3_mutex_held(pBt->mutex) ); 772 /* The master-journal page number must never be used as a pointer map page */ 773 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 774 775 assert( pBt->autoVacuum ); 776 if( key==0 ){ 777 *pRC = SQLITE_CORRUPT_BKPT; 778 return; 779 } 780 iPtrmap = PTRMAP_PAGENO(pBt, key); 781 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 782 if( rc!=SQLITE_OK ){ 783 *pRC = rc; 784 return; 785 } 786 offset = PTRMAP_PTROFFSET(iPtrmap, key); 787 if( offset<0 ){ 788 *pRC = SQLITE_CORRUPT_BKPT; 789 goto ptrmap_exit; 790 } 791 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 792 793 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 794 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); 795 *pRC= rc = sqlite3PagerWrite(pDbPage); 796 if( rc==SQLITE_OK ){ 797 pPtrmap[offset] = eType; 798 put4byte(&pPtrmap[offset+1], parent); 799 } 800 } 801 802 ptrmap_exit: 803 sqlite3PagerUnref(pDbPage); 804 } 805 806 /* 807 ** Read an entry from the pointer map. 808 ** 809 ** This routine retrieves the pointer map entry for page 'key', writing 810 ** the type and parent page number to *pEType and *pPgno respectively. 811 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 812 */ 813 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 814 DbPage *pDbPage; /* The pointer map page */ 815 int iPtrmap; /* Pointer map page index */ 816 u8 *pPtrmap; /* Pointer map page data */ 817 int offset; /* Offset of entry in pointer map */ 818 int rc; 819 820 assert( sqlite3_mutex_held(pBt->mutex) ); 821 822 iPtrmap = PTRMAP_PAGENO(pBt, key); 823 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 824 if( rc!=0 ){ 825 return rc; 826 } 827 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 828 829 offset = PTRMAP_PTROFFSET(iPtrmap, key); 830 assert( pEType!=0 ); 831 *pEType = pPtrmap[offset]; 832 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 833 834 sqlite3PagerUnref(pDbPage); 835 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; 836 return SQLITE_OK; 837 } 838 839 #else /* if defined SQLITE_OMIT_AUTOVACUUM */ 840 #define ptrmapPut(w,x,y,z,rc) 841 #define ptrmapGet(w,x,y,z) SQLITE_OK 842 #define ptrmapPutOvflPtr(x, y, rc) 843 #endif 844 845 /* 846 ** Given a btree page and a cell index (0 means the first cell on 847 ** the page, 1 means the second cell, and so forth) return a pointer 848 ** to the cell content. 849 ** 850 ** This routine works only for pages that do not contain overflow cells. 851 */ 852 #define findCell(P,I) \ 853 ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)]))) 854 855 /* 856 ** This a more complex version of findCell() that works for 857 ** pages that do contain overflow cells. 858 */ 859 static u8 *findOverflowCell(MemPage *pPage, int iCell){ 860 int i; 861 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 862 for(i=pPage->nOverflow-1; i>=0; i--){ 863 int k; 864 struct _OvflCell *pOvfl; 865 pOvfl = &pPage->aOvfl[i]; 866 k = pOvfl->idx; 867 if( k<=iCell ){ 868 if( k==iCell ){ 869 return pOvfl->pCell; 870 } 871 iCell--; 872 } 873 } 874 return findCell(pPage, iCell); 875 } 876 877 /* 878 ** Parse a cell content block and fill in the CellInfo structure. There 879 ** are two versions of this function. btreeParseCell() takes a 880 ** cell index as the second argument and btreeParseCellPtr() 881 ** takes a pointer to the body of the cell as its second argument. 882 ** 883 ** Within this file, the parseCell() macro can be called instead of 884 ** btreeParseCellPtr(). Using some compilers, this will be faster. 885 */ 886 static void btreeParseCellPtr( 887 MemPage *pPage, /* Page containing the cell */ 888 u8 *pCell, /* Pointer to the cell text. */ 889 CellInfo *pInfo /* Fill in this structure */ 890 ){ 891 u16 n; /* Number bytes in cell content header */ 892 u32 nPayload; /* Number of bytes of cell payload */ 893 894 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 895 896 pInfo->pCell = pCell; 897 assert( pPage->leaf==0 || pPage->leaf==1 ); 898 n = pPage->childPtrSize; 899 assert( n==4-4*pPage->leaf ); 900 if( pPage->intKey ){ 901 if( pPage->hasData ){ 902 n += getVarint32(&pCell[n], nPayload); 903 }else{ 904 nPayload = 0; 905 } 906 n += getVarint(&pCell[n], (u64*)&pInfo->nKey); 907 pInfo->nData = nPayload; 908 }else{ 909 pInfo->nData = 0; 910 n += getVarint32(&pCell[n], nPayload); 911 pInfo->nKey = nPayload; 912 } 913 pInfo->nPayload = nPayload; 914 pInfo->nHeader = n; 915 testcase( nPayload==pPage->maxLocal ); 916 testcase( nPayload==pPage->maxLocal+1 ); 917 if( likely(nPayload<=pPage->maxLocal) ){ 918 /* This is the (easy) common case where the entire payload fits 919 ** on the local page. No overflow is required. 920 */ 921 if( (pInfo->nSize = (u16)(n+nPayload))<4 ) pInfo->nSize = 4; 922 pInfo->nLocal = (u16)nPayload; 923 pInfo->iOverflow = 0; 924 }else{ 925 /* If the payload will not fit completely on the local page, we have 926 ** to decide how much to store locally and how much to spill onto 927 ** overflow pages. The strategy is to minimize the amount of unused 928 ** space on overflow pages while keeping the amount of local storage 929 ** in between minLocal and maxLocal. 930 ** 931 ** Warning: changing the way overflow payload is distributed in any 932 ** way will result in an incompatible file format. 933 */ 934 int minLocal; /* Minimum amount of payload held locally */ 935 int maxLocal; /* Maximum amount of payload held locally */ 936 int surplus; /* Overflow payload available for local storage */ 937 938 minLocal = pPage->minLocal; 939 maxLocal = pPage->maxLocal; 940 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4); 941 testcase( surplus==maxLocal ); 942 testcase( surplus==maxLocal+1 ); 943 if( surplus <= maxLocal ){ 944 pInfo->nLocal = (u16)surplus; 945 }else{ 946 pInfo->nLocal = (u16)minLocal; 947 } 948 pInfo->iOverflow = (u16)(pInfo->nLocal + n); 949 pInfo->nSize = pInfo->iOverflow + 4; 950 } 951 } 952 #define parseCell(pPage, iCell, pInfo) \ 953 btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) 954 static void btreeParseCell( 955 MemPage *pPage, /* Page containing the cell */ 956 int iCell, /* The cell index. First cell is 0 */ 957 CellInfo *pInfo /* Fill in this structure */ 958 ){ 959 parseCell(pPage, iCell, pInfo); 960 } 961 962 /* 963 ** Compute the total number of bytes that a Cell needs in the cell 964 ** data area of the btree-page. The return number includes the cell 965 ** data header and the local payload, but not any overflow page or 966 ** the space used by the cell pointer. 967 */ 968 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ 969 u8 *pIter = &pCell[pPage->childPtrSize]; 970 u32 nSize; 971 972 #ifdef SQLITE_DEBUG 973 /* The value returned by this function should always be the same as 974 ** the (CellInfo.nSize) value found by doing a full parse of the 975 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 976 ** this function verifies that this invariant is not violated. */ 977 CellInfo debuginfo; 978 btreeParseCellPtr(pPage, pCell, &debuginfo); 979 #endif 980 981 if( pPage->intKey ){ 982 u8 *pEnd; 983 if( pPage->hasData ){ 984 pIter += getVarint32(pIter, nSize); 985 }else{ 986 nSize = 0; 987 } 988 989 /* pIter now points at the 64-bit integer key value, a variable length 990 ** integer. The following block moves pIter to point at the first byte 991 ** past the end of the key value. */ 992 pEnd = &pIter[9]; 993 while( (*pIter++)&0x80 && pIter<pEnd ); 994 }else{ 995 pIter += getVarint32(pIter, nSize); 996 } 997 998 testcase( nSize==pPage->maxLocal ); 999 testcase( nSize==pPage->maxLocal+1 ); 1000 if( nSize>pPage->maxLocal ){ 1001 int minLocal = pPage->minLocal; 1002 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 1003 testcase( nSize==pPage->maxLocal ); 1004 testcase( nSize==pPage->maxLocal+1 ); 1005 if( nSize>pPage->maxLocal ){ 1006 nSize = minLocal; 1007 } 1008 nSize += 4; 1009 } 1010 nSize += (u32)(pIter - pCell); 1011 1012 /* The minimum size of any cell is 4 bytes. */ 1013 if( nSize<4 ){ 1014 nSize = 4; 1015 } 1016 1017 assert( nSize==debuginfo.nSize ); 1018 return (u16)nSize; 1019 } 1020 1021 #ifdef SQLITE_DEBUG 1022 /* This variation on cellSizePtr() is used inside of assert() statements 1023 ** only. */ 1024 static u16 cellSize(MemPage *pPage, int iCell){ 1025 return cellSizePtr(pPage, findCell(pPage, iCell)); 1026 } 1027 #endif 1028 1029 #ifndef SQLITE_OMIT_AUTOVACUUM 1030 /* 1031 ** If the cell pCell, part of page pPage contains a pointer 1032 ** to an overflow page, insert an entry into the pointer-map 1033 ** for the overflow page. 1034 */ 1035 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){ 1036 CellInfo info; 1037 if( *pRC ) return; 1038 assert( pCell!=0 ); 1039 btreeParseCellPtr(pPage, pCell, &info); 1040 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); 1041 if( info.iOverflow ){ 1042 Pgno ovfl = get4byte(&pCell[info.iOverflow]); 1043 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); 1044 } 1045 } 1046 #endif 1047 1048 1049 /* 1050 ** Defragment the page given. All Cells are moved to the 1051 ** end of the page and all free space is collected into one 1052 ** big FreeBlk that occurs in between the header and cell 1053 ** pointer array and the cell content area. 1054 */ 1055 static int defragmentPage(MemPage *pPage){ 1056 int i; /* Loop counter */ 1057 int pc; /* Address of a i-th cell */ 1058 int hdr; /* Offset to the page header */ 1059 int size; /* Size of a cell */ 1060 int usableSize; /* Number of usable bytes on a page */ 1061 int cellOffset; /* Offset to the cell pointer array */ 1062 int cbrk; /* Offset to the cell content area */ 1063 int nCell; /* Number of cells on the page */ 1064 unsigned char *data; /* The page data */ 1065 unsigned char *temp; /* Temp area for cell content */ 1066 int iCellFirst; /* First allowable cell index */ 1067 int iCellLast; /* Last possible cell index */ 1068 1069 1070 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1071 assert( pPage->pBt!=0 ); 1072 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); 1073 assert( pPage->nOverflow==0 ); 1074 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1075 temp = sqlite3PagerTempSpace(pPage->pBt->pPager); 1076 data = pPage->aData; 1077 hdr = pPage->hdrOffset; 1078 cellOffset = pPage->cellOffset; 1079 nCell = pPage->nCell; 1080 assert( nCell==get2byte(&data[hdr+3]) ); 1081 usableSize = pPage->pBt->usableSize; 1082 cbrk = get2byte(&data[hdr+5]); 1083 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk); 1084 cbrk = usableSize; 1085 iCellFirst = cellOffset + 2*nCell; 1086 iCellLast = usableSize - 4; 1087 for(i=0; i<nCell; i++){ 1088 u8 *pAddr; /* The i-th cell pointer */ 1089 pAddr = &data[cellOffset + i*2]; 1090 pc = get2byte(pAddr); 1091 testcase( pc==iCellFirst ); 1092 testcase( pc==iCellLast ); 1093 #if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) 1094 /* These conditions have already been verified in btreeInitPage() 1095 ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined 1096 */ 1097 if( pc<iCellFirst || pc>iCellLast ){ 1098 return SQLITE_CORRUPT_BKPT; 1099 } 1100 #endif 1101 assert( pc>=iCellFirst && pc<=iCellLast ); 1102 size = cellSizePtr(pPage, &temp[pc]); 1103 cbrk -= size; 1104 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) 1105 if( cbrk<iCellFirst ){ 1106 return SQLITE_CORRUPT_BKPT; 1107 } 1108 #else 1109 if( cbrk<iCellFirst || pc+size>usableSize ){ 1110 return SQLITE_CORRUPT_BKPT; 1111 } 1112 #endif 1113 assert( cbrk+size<=usableSize && cbrk>=iCellFirst ); 1114 testcase( cbrk+size==usableSize ); 1115 testcase( pc+size==usableSize ); 1116 memcpy(&data[cbrk], &temp[pc], size); 1117 put2byte(pAddr, cbrk); 1118 } 1119 assert( cbrk>=iCellFirst ); 1120 put2byte(&data[hdr+5], cbrk); 1121 data[hdr+1] = 0; 1122 data[hdr+2] = 0; 1123 data[hdr+7] = 0; 1124 memset(&data[iCellFirst], 0, cbrk-iCellFirst); 1125 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1126 if( cbrk-iCellFirst!=pPage->nFree ){ 1127 return SQLITE_CORRUPT_BKPT; 1128 } 1129 return SQLITE_OK; 1130 } 1131 1132 /* 1133 ** Allocate nByte bytes of space from within the B-Tree page passed 1134 ** as the first argument. Write into *pIdx the index into pPage->aData[] 1135 ** of the first byte of allocated space. Return either SQLITE_OK or 1136 ** an error code (usually SQLITE_CORRUPT). 1137 ** 1138 ** The caller guarantees that there is sufficient space to make the 1139 ** allocation. This routine might need to defragment in order to bring 1140 ** all the space together, however. This routine will avoid using 1141 ** the first two bytes past the cell pointer area since presumably this 1142 ** allocation is being made in order to insert a new cell, so we will 1143 ** also end up needing a new cell pointer. 1144 */ 1145 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ 1146 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ 1147 u8 * const data = pPage->aData; /* Local cache of pPage->aData */ 1148 int nFrag; /* Number of fragmented bytes on pPage */ 1149 int top; /* First byte of cell content area */ 1150 int gap; /* First byte of gap between cell pointers and cell content */ 1151 int rc; /* Integer return code */ 1152 int usableSize; /* Usable size of the page */ 1153 1154 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1155 assert( pPage->pBt ); 1156 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1157 assert( nByte>=0 ); /* Minimum cell size is 4 */ 1158 assert( pPage->nFree>=nByte ); 1159 assert( pPage->nOverflow==0 ); 1160 usableSize = pPage->pBt->usableSize; 1161 assert( nByte < usableSize-8 ); 1162 1163 nFrag = data[hdr+7]; 1164 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); 1165 gap = pPage->cellOffset + 2*pPage->nCell; 1166 top = get2byteNotZero(&data[hdr+5]); 1167 if( gap>top ) return SQLITE_CORRUPT_BKPT; 1168 testcase( gap+2==top ); 1169 testcase( gap+1==top ); 1170 testcase( gap==top ); 1171 1172 if( nFrag>=60 ){ 1173 /* Always defragment highly fragmented pages */ 1174 rc = defragmentPage(pPage); 1175 if( rc ) return rc; 1176 top = get2byteNotZero(&data[hdr+5]); 1177 }else if( gap+2<=top ){ 1178 /* Search the freelist looking for a free slot big enough to satisfy 1179 ** the request. The allocation is made from the first free slot in 1180 ** the list that is large enough to accomadate it. 1181 */ 1182 int pc, addr; 1183 for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){ 1184 int size; /* Size of the free slot */ 1185 if( pc>usableSize-4 || pc<addr+4 ){ 1186 return SQLITE_CORRUPT_BKPT; 1187 } 1188 size = get2byte(&data[pc+2]); 1189 if( size>=nByte ){ 1190 int x = size - nByte; 1191 testcase( x==4 ); 1192 testcase( x==3 ); 1193 if( x<4 ){ 1194 /* Remove the slot from the free-list. Update the number of 1195 ** fragmented bytes within the page. */ 1196 memcpy(&data[addr], &data[pc], 2); 1197 data[hdr+7] = (u8)(nFrag + x); 1198 }else if( size+pc > usableSize ){ 1199 return SQLITE_CORRUPT_BKPT; 1200 }else{ 1201 /* The slot remains on the free-list. Reduce its size to account 1202 ** for the portion used by the new allocation. */ 1203 put2byte(&data[pc+2], x); 1204 } 1205 *pIdx = pc + x; 1206 return SQLITE_OK; 1207 } 1208 } 1209 } 1210 1211 /* Check to make sure there is enough space in the gap to satisfy 1212 ** the allocation. If not, defragment. 1213 */ 1214 testcase( gap+2+nByte==top ); 1215 if( gap+2+nByte>top ){ 1216 rc = defragmentPage(pPage); 1217 if( rc ) return rc; 1218 top = get2byteNotZero(&data[hdr+5]); 1219 assert( gap+nByte<=top ); 1220 } 1221 1222 1223 /* Allocate memory from the gap in between the cell pointer array 1224 ** and the cell content area. The btreeInitPage() call has already 1225 ** validated the freelist. Given that the freelist is valid, there 1226 ** is no way that the allocation can extend off the end of the page. 1227 ** The assert() below verifies the previous sentence. 1228 */ 1229 top -= nByte; 1230 put2byte(&data[hdr+5], top); 1231 assert( top+nByte <= (int)pPage->pBt->usableSize ); 1232 *pIdx = top; 1233 return SQLITE_OK; 1234 } 1235 1236 /* 1237 ** Return a section of the pPage->aData to the freelist. 1238 ** The first byte of the new free block is pPage->aDisk[start] 1239 ** and the size of the block is "size" bytes. 1240 ** 1241 ** Most of the effort here is involved in coalesing adjacent 1242 ** free blocks into a single big free block. 1243 */ 1244 static int freeSpace(MemPage *pPage, int start, int size){ 1245 int addr, pbegin, hdr; 1246 int iLast; /* Largest possible freeblock offset */ 1247 unsigned char *data = pPage->aData; 1248 1249 assert( pPage->pBt!=0 ); 1250 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1251 assert( start>=pPage->hdrOffset+6+pPage->childPtrSize ); 1252 assert( (start + size) <= (int)pPage->pBt->usableSize ); 1253 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1254 assert( size>=0 ); /* Minimum cell size is 4 */ 1255 1256 if( pPage->pBt->secureDelete ){ 1257 /* Overwrite deleted information with zeros when the secure_delete 1258 ** option is enabled */ 1259 memset(&data[start], 0, size); 1260 } 1261 1262 /* Add the space back into the linked list of freeblocks. Note that 1263 ** even though the freeblock list was checked by btreeInitPage(), 1264 ** btreeInitPage() did not detect overlapping cells or 1265 ** freeblocks that overlapped cells. Nor does it detect when the 1266 ** cell content area exceeds the value in the page header. If these 1267 ** situations arise, then subsequent insert operations might corrupt 1268 ** the freelist. So we do need to check for corruption while scanning 1269 ** the freelist. 1270 */ 1271 hdr = pPage->hdrOffset; 1272 addr = hdr + 1; 1273 iLast = pPage->pBt->usableSize - 4; 1274 assert( start<=iLast ); 1275 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){ 1276 if( pbegin<addr+4 ){ 1277 return SQLITE_CORRUPT_BKPT; 1278 } 1279 addr = pbegin; 1280 } 1281 if( pbegin>iLast ){ 1282 return SQLITE_CORRUPT_BKPT; 1283 } 1284 assert( pbegin>addr || pbegin==0 ); 1285 put2byte(&data[addr], start); 1286 put2byte(&data[start], pbegin); 1287 put2byte(&data[start+2], size); 1288 pPage->nFree = pPage->nFree + (u16)size; 1289 1290 /* Coalesce adjacent free blocks */ 1291 addr = hdr + 1; 1292 while( (pbegin = get2byte(&data[addr]))>0 ){ 1293 int pnext, psize, x; 1294 assert( pbegin>addr ); 1295 assert( pbegin <= (int)pPage->pBt->usableSize-4 ); 1296 pnext = get2byte(&data[pbegin]); 1297 psize = get2byte(&data[pbegin+2]); 1298 if( pbegin + psize + 3 >= pnext && pnext>0 ){ 1299 int frag = pnext - (pbegin+psize); 1300 if( (frag<0) || (frag>(int)data[hdr+7]) ){ 1301 return SQLITE_CORRUPT_BKPT; 1302 } 1303 data[hdr+7] -= (u8)frag; 1304 x = get2byte(&data[pnext]); 1305 put2byte(&data[pbegin], x); 1306 x = pnext + get2byte(&data[pnext+2]) - pbegin; 1307 put2byte(&data[pbegin+2], x); 1308 }else{ 1309 addr = pbegin; 1310 } 1311 } 1312 1313 /* If the cell content area begins with a freeblock, remove it. */ 1314 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){ 1315 int top; 1316 pbegin = get2byte(&data[hdr+1]); 1317 memcpy(&data[hdr+1], &data[pbegin], 2); 1318 top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]); 1319 put2byte(&data[hdr+5], top); 1320 } 1321 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1322 return SQLITE_OK; 1323 } 1324 1325 /* 1326 ** Decode the flags byte (the first byte of the header) for a page 1327 ** and initialize fields of the MemPage structure accordingly. 1328 ** 1329 ** Only the following combinations are supported. Anything different 1330 ** indicates a corrupt database files: 1331 ** 1332 ** PTF_ZERODATA 1333 ** PTF_ZERODATA | PTF_LEAF 1334 ** PTF_LEAFDATA | PTF_INTKEY 1335 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF 1336 */ 1337 static int decodeFlags(MemPage *pPage, int flagByte){ 1338 BtShared *pBt; /* A copy of pPage->pBt */ 1339 1340 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) ); 1341 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1342 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 ); 1343 flagByte &= ~PTF_LEAF; 1344 pPage->childPtrSize = 4-4*pPage->leaf; 1345 pBt = pPage->pBt; 1346 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ 1347 pPage->intKey = 1; 1348 pPage->hasData = pPage->leaf; 1349 pPage->maxLocal = pBt->maxLeaf; 1350 pPage->minLocal = pBt->minLeaf; 1351 }else if( flagByte==PTF_ZERODATA ){ 1352 pPage->intKey = 0; 1353 pPage->hasData = 0; 1354 pPage->maxLocal = pBt->maxLocal; 1355 pPage->minLocal = pBt->minLocal; 1356 }else{ 1357 return SQLITE_CORRUPT_BKPT; 1358 } 1359 return SQLITE_OK; 1360 } 1361 1362 /* 1363 ** Initialize the auxiliary information for a disk block. 1364 ** 1365 ** Return SQLITE_OK on success. If we see that the page does 1366 ** not contain a well-formed database page, then return 1367 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 1368 ** guarantee that the page is well-formed. It only shows that 1369 ** we failed to detect any corruption. 1370 */ 1371 static int btreeInitPage(MemPage *pPage){ 1372 1373 assert( pPage->pBt!=0 ); 1374 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1375 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 1376 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 1377 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 1378 1379 if( !pPage->isInit ){ 1380 u16 pc; /* Address of a freeblock within pPage->aData[] */ 1381 u8 hdr; /* Offset to beginning of page header */ 1382 u8 *data; /* Equal to pPage->aData */ 1383 BtShared *pBt; /* The main btree structure */ 1384 int usableSize; /* Amount of usable space on each page */ 1385 u16 cellOffset; /* Offset from start of page to first cell pointer */ 1386 int nFree; /* Number of unused bytes on the page */ 1387 int top; /* First byte of the cell content area */ 1388 int iCellFirst; /* First allowable cell or freeblock offset */ 1389 int iCellLast; /* Last possible cell or freeblock offset */ 1390 1391 pBt = pPage->pBt; 1392 1393 hdr = pPage->hdrOffset; 1394 data = pPage->aData; 1395 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT; 1396 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 1397 pPage->maskPage = (u16)(pBt->pageSize - 1); 1398 pPage->nOverflow = 0; 1399 usableSize = pBt->usableSize; 1400 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; 1401 top = get2byteNotZero(&data[hdr+5]); 1402 pPage->nCell = get2byte(&data[hdr+3]); 1403 if( pPage->nCell>MX_CELL(pBt) ){ 1404 /* To many cells for a single page. The page must be corrupt */ 1405 return SQLITE_CORRUPT_BKPT; 1406 } 1407 testcase( pPage->nCell==MX_CELL(pBt) ); 1408 1409 /* A malformed database page might cause us to read past the end 1410 ** of page when parsing a cell. 1411 ** 1412 ** The following block of code checks early to see if a cell extends 1413 ** past the end of a page boundary and causes SQLITE_CORRUPT to be 1414 ** returned if it does. 1415 */ 1416 iCellFirst = cellOffset + 2*pPage->nCell; 1417 iCellLast = usableSize - 4; 1418 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) 1419 { 1420 int i; /* Index into the cell pointer array */ 1421 int sz; /* Size of a cell */ 1422 1423 if( !pPage->leaf ) iCellLast--; 1424 for(i=0; i<pPage->nCell; i++){ 1425 pc = get2byte(&data[cellOffset+i*2]); 1426 testcase( pc==iCellFirst ); 1427 testcase( pc==iCellLast ); 1428 if( pc<iCellFirst || pc>iCellLast ){ 1429 return SQLITE_CORRUPT_BKPT; 1430 } 1431 sz = cellSizePtr(pPage, &data[pc]); 1432 testcase( pc+sz==usableSize ); 1433 if( pc+sz>usableSize ){ 1434 return SQLITE_CORRUPT_BKPT; 1435 } 1436 } 1437 if( !pPage->leaf ) iCellLast++; 1438 } 1439 #endif 1440 1441 /* Compute the total free space on the page */ 1442 pc = get2byte(&data[hdr+1]); 1443 nFree = data[hdr+7] + top; 1444 while( pc>0 ){ 1445 u16 next, size; 1446 if( pc<iCellFirst || pc>iCellLast ){ 1447 /* Start of free block is off the page */ 1448 return SQLITE_CORRUPT_BKPT; 1449 } 1450 next = get2byte(&data[pc]); 1451 size = get2byte(&data[pc+2]); 1452 if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){ 1453 /* Free blocks must be in ascending order. And the last byte of 1454 ** the free-block must lie on the database page. */ 1455 return SQLITE_CORRUPT_BKPT; 1456 } 1457 nFree = nFree + size; 1458 pc = next; 1459 } 1460 1461 /* At this point, nFree contains the sum of the offset to the start 1462 ** of the cell-content area plus the number of free bytes within 1463 ** the cell-content area. If this is greater than the usable-size 1464 ** of the page, then the page must be corrupted. This check also 1465 ** serves to verify that the offset to the start of the cell-content 1466 ** area, according to the page header, lies within the page. 1467 */ 1468 if( nFree>usableSize ){ 1469 return SQLITE_CORRUPT_BKPT; 1470 } 1471 pPage->nFree = (u16)(nFree - iCellFirst); 1472 pPage->isInit = 1; 1473 } 1474 return SQLITE_OK; 1475 } 1476 1477 /* 1478 ** Set up a raw page so that it looks like a database page holding 1479 ** no entries. 1480 */ 1481 static void zeroPage(MemPage *pPage, int flags){ 1482 unsigned char *data = pPage->aData; 1483 BtShared *pBt = pPage->pBt; 1484 u8 hdr = pPage->hdrOffset; 1485 u16 first; 1486 1487 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); 1488 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1489 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 1490 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1491 assert( sqlite3_mutex_held(pBt->mutex) ); 1492 if( pBt->secureDelete ){ 1493 memset(&data[hdr], 0, pBt->usableSize - hdr); 1494 } 1495 data[hdr] = (char)flags; 1496 first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0); 1497 memset(&data[hdr+1], 0, 4); 1498 data[hdr+7] = 0; 1499 put2byte(&data[hdr+5], pBt->usableSize); 1500 pPage->nFree = (u16)(pBt->usableSize - first); 1501 decodeFlags(pPage, flags); 1502 pPage->hdrOffset = hdr; 1503 pPage->cellOffset = first; 1504 pPage->nOverflow = 0; 1505 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 1506 pPage->maskPage = (u16)(pBt->pageSize - 1); 1507 pPage->nCell = 0; 1508 pPage->isInit = 1; 1509 } 1510 1511 1512 /* 1513 ** Convert a DbPage obtained from the pager into a MemPage used by 1514 ** the btree layer. 1515 */ 1516 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ 1517 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 1518 pPage->aData = sqlite3PagerGetData(pDbPage); 1519 pPage->pDbPage = pDbPage; 1520 pPage->pBt = pBt; 1521 pPage->pgno = pgno; 1522 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; 1523 return pPage; 1524 } 1525 1526 /* 1527 ** Get a page from the pager. Initialize the MemPage.pBt and 1528 ** MemPage.aData elements if needed. 1529 ** 1530 ** If the noContent flag is set, it means that we do not care about 1531 ** the content of the page at this time. So do not go to the disk 1532 ** to fetch the content. Just fill in the content with zeros for now. 1533 ** If in the future we call sqlite3PagerWrite() on this page, that 1534 ** means we have started to be concerned about content and the disk 1535 ** read should occur at that point. 1536 */ 1537 static int btreeGetPage( 1538 BtShared *pBt, /* The btree */ 1539 Pgno pgno, /* Number of the page to fetch */ 1540 MemPage **ppPage, /* Return the page in this parameter */ 1541 int noContent /* Do not load page content if true */ 1542 ){ 1543 int rc; 1544 DbPage *pDbPage; 1545 1546 assert( sqlite3_mutex_held(pBt->mutex) ); 1547 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent); 1548 if( rc ) return rc; 1549 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); 1550 return SQLITE_OK; 1551 } 1552 1553 /* 1554 ** Retrieve a page from the pager cache. If the requested page is not 1555 ** already in the pager cache return NULL. Initialize the MemPage.pBt and 1556 ** MemPage.aData elements if needed. 1557 */ 1558 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ 1559 DbPage *pDbPage; 1560 assert( sqlite3_mutex_held(pBt->mutex) ); 1561 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 1562 if( pDbPage ){ 1563 return btreePageFromDbPage(pDbPage, pgno, pBt); 1564 } 1565 return 0; 1566 } 1567 1568 /* 1569 ** Return the size of the database file in pages. If there is any kind of 1570 ** error, return ((unsigned int)-1). 1571 */ 1572 static Pgno btreePagecount(BtShared *pBt){ 1573 return pBt->nPage; 1574 } 1575 u32 sqlite3BtreeLastPage(Btree *p){ 1576 assert( sqlite3BtreeHoldsMutex(p) ); 1577 assert( ((p->pBt->nPage)&0x8000000)==0 ); 1578 return (int)btreePagecount(p->pBt); 1579 } 1580 1581 /* 1582 ** Get a page from the pager and initialize it. This routine is just a 1583 ** convenience wrapper around separate calls to btreeGetPage() and 1584 ** btreeInitPage(). 1585 ** 1586 ** If an error occurs, then the value *ppPage is set to is undefined. It 1587 ** may remain unchanged, or it may be set to an invalid value. 1588 */ 1589 static int getAndInitPage( 1590 BtShared *pBt, /* The database file */ 1591 Pgno pgno, /* Number of the page to get */ 1592 MemPage **ppPage /* Write the page pointer here */ 1593 ){ 1594 int rc; 1595 assert( sqlite3_mutex_held(pBt->mutex) ); 1596 1597 if( pgno>btreePagecount(pBt) ){ 1598 rc = SQLITE_CORRUPT_BKPT; 1599 }else{ 1600 rc = btreeGetPage(pBt, pgno, ppPage, 0); 1601 if( rc==SQLITE_OK ){ 1602 rc = btreeInitPage(*ppPage); 1603 if( rc!=SQLITE_OK ){ 1604 releasePage(*ppPage); 1605 } 1606 } 1607 } 1608 1609 testcase( pgno==0 ); 1610 assert( pgno!=0 || rc==SQLITE_CORRUPT ); 1611 return rc; 1612 } 1613 1614 /* 1615 ** Release a MemPage. This should be called once for each prior 1616 ** call to btreeGetPage. 1617 */ 1618 static void releasePage(MemPage *pPage){ 1619 if( pPage ){ 1620 assert( pPage->aData ); 1621 assert( pPage->pBt ); 1622 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1623 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 1624 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1625 sqlite3PagerUnref(pPage->pDbPage); 1626 } 1627 } 1628 1629 /* 1630 ** During a rollback, when the pager reloads information into the cache 1631 ** so that the cache is restored to its original state at the start of 1632 ** the transaction, for each page restored this routine is called. 1633 ** 1634 ** This routine needs to reset the extra data section at the end of the 1635 ** page to agree with the restored data. 1636 */ 1637 static void pageReinit(DbPage *pData){ 1638 MemPage *pPage; 1639 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 1640 assert( sqlite3PagerPageRefcount(pData)>0 ); 1641 if( pPage->isInit ){ 1642 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1643 pPage->isInit = 0; 1644 if( sqlite3PagerPageRefcount(pData)>1 ){ 1645 /* pPage might not be a btree page; it might be an overflow page 1646 ** or ptrmap page or a free page. In those cases, the following 1647 ** call to btreeInitPage() will likely return SQLITE_CORRUPT. 1648 ** But no harm is done by this. And it is very important that 1649 ** btreeInitPage() be called on every btree page so we make 1650 ** the call for every page that comes in for re-initing. */ 1651 btreeInitPage(pPage); 1652 } 1653 } 1654 } 1655 1656 /* 1657 ** Invoke the busy handler for a btree. 1658 */ 1659 static int btreeInvokeBusyHandler(void *pArg){ 1660 BtShared *pBt = (BtShared*)pArg; 1661 assert( pBt->db ); 1662 assert( sqlite3_mutex_held(pBt->db->mutex) ); 1663 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); 1664 } 1665 1666 /* 1667 ** Open a database file. 1668 ** 1669 ** zFilename is the name of the database file. If zFilename is NULL 1670 ** then an ephemeral database is created. The ephemeral database might 1671 ** be exclusively in memory, or it might use a disk-based memory cache. 1672 ** Either way, the ephemeral database will be automatically deleted 1673 ** when sqlite3BtreeClose() is called. 1674 ** 1675 ** If zFilename is ":memory:" then an in-memory database is created 1676 ** that is automatically destroyed when it is closed. 1677 ** 1678 ** The "flags" parameter is a bitmask that might contain bits 1679 ** BTREE_OMIT_JOURNAL and/or BTREE_NO_READLOCK. The BTREE_NO_READLOCK 1680 ** bit is also set if the SQLITE_NoReadlock flags is set in db->flags. 1681 ** These flags are passed through into sqlite3PagerOpen() and must 1682 ** be the same values as PAGER_OMIT_JOURNAL and PAGER_NO_READLOCK. 1683 ** 1684 ** If the database is already opened in the same database connection 1685 ** and we are in shared cache mode, then the open will fail with an 1686 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared 1687 ** objects in the same database connection since doing so will lead 1688 ** to problems with locking. 1689 */ 1690 int sqlite3BtreeOpen( 1691 const char *zFilename, /* Name of the file containing the BTree database */ 1692 sqlite3 *db, /* Associated database handle */ 1693 Btree **ppBtree, /* Pointer to new Btree object written here */ 1694 int flags, /* Options */ 1695 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ 1696 ){ 1697 sqlite3_vfs *pVfs; /* The VFS to use for this btree */ 1698 BtShared *pBt = 0; /* Shared part of btree structure */ 1699 Btree *p; /* Handle to return */ 1700 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */ 1701 int rc = SQLITE_OK; /* Result code from this function */ 1702 u8 nReserve; /* Byte of unused space on each page */ 1703 unsigned char zDbHeader[100]; /* Database header content */ 1704 1705 /* True if opening an ephemeral, temporary database */ 1706 const int isTempDb = zFilename==0 || zFilename[0]==0; 1707 1708 /* Set the variable isMemdb to true for an in-memory database, or 1709 ** false for a file-based database. 1710 */ 1711 #ifdef SQLITE_OMIT_MEMORYDB 1712 const int isMemdb = 0; 1713 #else 1714 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0) 1715 || (isTempDb && sqlite3TempInMemory(db)); 1716 #endif 1717 1718 assert( db!=0 ); 1719 assert( sqlite3_mutex_held(db->mutex) ); 1720 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */ 1721 1722 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */ 1723 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 ); 1724 1725 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */ 1726 assert( (flags & BTREE_SINGLE)==0 || isTempDb ); 1727 1728 if( db->flags & SQLITE_NoReadlock ){ 1729 flags |= BTREE_NO_READLOCK; 1730 } 1731 if( isMemdb ){ 1732 flags |= BTREE_MEMORY; 1733 } 1734 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){ 1735 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB; 1736 } 1737 pVfs = db->pVfs; 1738 p = sqlite3MallocZero(sizeof(Btree)); 1739 if( !p ){ 1740 return SQLITE_NOMEM; 1741 } 1742 p->inTrans = TRANS_NONE; 1743 p->db = db; 1744 #ifndef SQLITE_OMIT_SHARED_CACHE 1745 p->lock.pBtree = p; 1746 p->lock.iTable = 1; 1747 #endif 1748 1749 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1750 /* 1751 ** If this Btree is a candidate for shared cache, try to find an 1752 ** existing BtShared object that we can share with 1753 */ 1754 if( isMemdb==0 && isTempDb==0 ){ 1755 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){ 1756 int nFullPathname = pVfs->mxPathname+1; 1757 char *zFullPathname = sqlite3Malloc(nFullPathname); 1758 sqlite3_mutex *mutexShared; 1759 p->sharable = 1; 1760 if( !zFullPathname ){ 1761 sqlite3_free(p); 1762 return SQLITE_NOMEM; 1763 } 1764 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname); 1765 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN); 1766 sqlite3_mutex_enter(mutexOpen); 1767 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1768 sqlite3_mutex_enter(mutexShared); 1769 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){ 1770 assert( pBt->nRef>0 ); 1771 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager)) 1772 && sqlite3PagerVfs(pBt->pPager)==pVfs ){ 1773 int iDb; 1774 for(iDb=db->nDb-1; iDb>=0; iDb--){ 1775 Btree *pExisting = db->aDb[iDb].pBt; 1776 if( pExisting && pExisting->pBt==pBt ){ 1777 sqlite3_mutex_leave(mutexShared); 1778 sqlite3_mutex_leave(mutexOpen); 1779 sqlite3_free(zFullPathname); 1780 sqlite3_free(p); 1781 return SQLITE_CONSTRAINT; 1782 } 1783 } 1784 p->pBt = pBt; 1785 pBt->nRef++; 1786 break; 1787 } 1788 } 1789 sqlite3_mutex_leave(mutexShared); 1790 sqlite3_free(zFullPathname); 1791 } 1792 #ifdef SQLITE_DEBUG 1793 else{ 1794 /* In debug mode, we mark all persistent databases as sharable 1795 ** even when they are not. This exercises the locking code and 1796 ** gives more opportunity for asserts(sqlite3_mutex_held()) 1797 ** statements to find locking problems. 1798 */ 1799 p->sharable = 1; 1800 } 1801 #endif 1802 } 1803 #endif 1804 if( pBt==0 ){ 1805 /* 1806 ** The following asserts make sure that structures used by the btree are 1807 ** the right size. This is to guard against size changes that result 1808 ** when compiling on a different architecture. 1809 */ 1810 assert( sizeof(i64)==8 || sizeof(i64)==4 ); 1811 assert( sizeof(u64)==8 || sizeof(u64)==4 ); 1812 assert( sizeof(u32)==4 ); 1813 assert( sizeof(u16)==2 ); 1814 assert( sizeof(Pgno)==4 ); 1815 1816 pBt = sqlite3MallocZero( sizeof(*pBt) ); 1817 if( pBt==0 ){ 1818 rc = SQLITE_NOMEM; 1819 goto btree_open_out; 1820 } 1821 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, 1822 EXTRA_SIZE, flags, vfsFlags, pageReinit); 1823 if( rc==SQLITE_OK ){ 1824 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); 1825 } 1826 if( rc!=SQLITE_OK ){ 1827 goto btree_open_out; 1828 } 1829 pBt->openFlags = (u8)flags; 1830 pBt->db = db; 1831 sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt); 1832 p->pBt = pBt; 1833 1834 pBt->pCursor = 0; 1835 pBt->pPage1 = 0; 1836 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager); 1837 #ifdef SQLITE_SECURE_DELETE 1838 pBt->secureDelete = 1; 1839 #endif 1840 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16); 1841 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE 1842 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ 1843 pBt->pageSize = 0; 1844 #ifndef SQLITE_OMIT_AUTOVACUUM 1845 /* If the magic name ":memory:" will create an in-memory database, then 1846 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if 1847 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if 1848 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a 1849 ** regular file-name. In this case the auto-vacuum applies as per normal. 1850 */ 1851 if( zFilename && !isMemdb ){ 1852 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); 1853 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); 1854 } 1855 #endif 1856 nReserve = 0; 1857 }else{ 1858 nReserve = zDbHeader[20]; 1859 pBt->pageSizeFixed = 1; 1860 #ifndef SQLITE_OMIT_AUTOVACUUM 1861 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); 1862 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); 1863 #endif 1864 } 1865 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 1866 if( rc ) goto btree_open_out; 1867 pBt->usableSize = pBt->pageSize - nReserve; 1868 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ 1869 1870 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1871 /* Add the new BtShared object to the linked list sharable BtShareds. 1872 */ 1873 if( p->sharable ){ 1874 sqlite3_mutex *mutexShared; 1875 pBt->nRef = 1; 1876 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1877 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){ 1878 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST); 1879 if( pBt->mutex==0 ){ 1880 rc = SQLITE_NOMEM; 1881 db->mallocFailed = 0; 1882 goto btree_open_out; 1883 } 1884 } 1885 sqlite3_mutex_enter(mutexShared); 1886 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList); 1887 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt; 1888 sqlite3_mutex_leave(mutexShared); 1889 } 1890 #endif 1891 } 1892 1893 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1894 /* If the new Btree uses a sharable pBtShared, then link the new 1895 ** Btree into the list of all sharable Btrees for the same connection. 1896 ** The list is kept in ascending order by pBt address. 1897 */ 1898 if( p->sharable ){ 1899 int i; 1900 Btree *pSib; 1901 for(i=0; i<db->nDb; i++){ 1902 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ 1903 while( pSib->pPrev ){ pSib = pSib->pPrev; } 1904 if( p->pBt<pSib->pBt ){ 1905 p->pNext = pSib; 1906 p->pPrev = 0; 1907 pSib->pPrev = p; 1908 }else{ 1909 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){ 1910 pSib = pSib->pNext; 1911 } 1912 p->pNext = pSib->pNext; 1913 p->pPrev = pSib; 1914 if( p->pNext ){ 1915 p->pNext->pPrev = p; 1916 } 1917 pSib->pNext = p; 1918 } 1919 break; 1920 } 1921 } 1922 } 1923 #endif 1924 *ppBtree = p; 1925 1926 btree_open_out: 1927 if( rc!=SQLITE_OK ){ 1928 if( pBt && pBt->pPager ){ 1929 sqlite3PagerClose(pBt->pPager); 1930 } 1931 sqlite3_free(pBt); 1932 sqlite3_free(p); 1933 *ppBtree = 0; 1934 }else{ 1935 /* If the B-Tree was successfully opened, set the pager-cache size to the 1936 ** default value. Except, when opening on an existing shared pager-cache, 1937 ** do not change the pager-cache size. 1938 */ 1939 if( sqlite3BtreeSchema(p, 0, 0)==0 ){ 1940 sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE); 1941 } 1942 } 1943 if( mutexOpen ){ 1944 assert( sqlite3_mutex_held(mutexOpen) ); 1945 sqlite3_mutex_leave(mutexOpen); 1946 } 1947 return rc; 1948 } 1949 1950 /* 1951 ** Decrement the BtShared.nRef counter. When it reaches zero, 1952 ** remove the BtShared structure from the sharing list. Return 1953 ** true if the BtShared.nRef counter reaches zero and return 1954 ** false if it is still positive. 1955 */ 1956 static int removeFromSharingList(BtShared *pBt){ 1957 #ifndef SQLITE_OMIT_SHARED_CACHE 1958 sqlite3_mutex *pMaster; 1959 BtShared *pList; 1960 int removed = 0; 1961 1962 assert( sqlite3_mutex_notheld(pBt->mutex) ); 1963 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1964 sqlite3_mutex_enter(pMaster); 1965 pBt->nRef--; 1966 if( pBt->nRef<=0 ){ 1967 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){ 1968 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; 1969 }else{ 1970 pList = GLOBAL(BtShared*,sqlite3SharedCacheList); 1971 while( ALWAYS(pList) && pList->pNext!=pBt ){ 1972 pList=pList->pNext; 1973 } 1974 if( ALWAYS(pList) ){ 1975 pList->pNext = pBt->pNext; 1976 } 1977 } 1978 if( SQLITE_THREADSAFE ){ 1979 sqlite3_mutex_free(pBt->mutex); 1980 } 1981 removed = 1; 1982 } 1983 sqlite3_mutex_leave(pMaster); 1984 return removed; 1985 #else 1986 return 1; 1987 #endif 1988 } 1989 1990 /* 1991 ** Make sure pBt->pTmpSpace points to an allocation of 1992 ** MX_CELL_SIZE(pBt) bytes. 1993 */ 1994 static void allocateTempSpace(BtShared *pBt){ 1995 if( !pBt->pTmpSpace ){ 1996 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); 1997 } 1998 } 1999 2000 /* 2001 ** Free the pBt->pTmpSpace allocation 2002 */ 2003 static void freeTempSpace(BtShared *pBt){ 2004 sqlite3PageFree( pBt->pTmpSpace); 2005 pBt->pTmpSpace = 0; 2006 } 2007 2008 /* 2009 ** Close an open database and invalidate all cursors. 2010 */ 2011 int sqlite3BtreeClose(Btree *p){ 2012 BtShared *pBt = p->pBt; 2013 BtCursor *pCur; 2014 2015 /* Close all cursors opened via this handle. */ 2016 assert( sqlite3_mutex_held(p->db->mutex) ); 2017 sqlite3BtreeEnter(p); 2018 pCur = pBt->pCursor; 2019 while( pCur ){ 2020 BtCursor *pTmp = pCur; 2021 pCur = pCur->pNext; 2022 if( pTmp->pBtree==p ){ 2023 sqlite3BtreeCloseCursor(pTmp); 2024 } 2025 } 2026 2027 /* Rollback any active transaction and free the handle structure. 2028 ** The call to sqlite3BtreeRollback() drops any table-locks held by 2029 ** this handle. 2030 */ 2031 sqlite3BtreeRollback(p); 2032 sqlite3BtreeLeave(p); 2033 2034 /* If there are still other outstanding references to the shared-btree 2035 ** structure, return now. The remainder of this procedure cleans 2036 ** up the shared-btree. 2037 */ 2038 assert( p->wantToLock==0 && p->locked==0 ); 2039 if( !p->sharable || removeFromSharingList(pBt) ){ 2040 /* The pBt is no longer on the sharing list, so we can access 2041 ** it without having to hold the mutex. 2042 ** 2043 ** Clean out and delete the BtShared object. 2044 */ 2045 assert( !pBt->pCursor ); 2046 sqlite3PagerClose(pBt->pPager); 2047 if( pBt->xFreeSchema && pBt->pSchema ){ 2048 pBt->xFreeSchema(pBt->pSchema); 2049 } 2050 sqlite3DbFree(0, pBt->pSchema); 2051 freeTempSpace(pBt); 2052 sqlite3_free(pBt); 2053 } 2054 2055 #ifndef SQLITE_OMIT_SHARED_CACHE 2056 assert( p->wantToLock==0 ); 2057 assert( p->locked==0 ); 2058 if( p->pPrev ) p->pPrev->pNext = p->pNext; 2059 if( p->pNext ) p->pNext->pPrev = p->pPrev; 2060 #endif 2061 2062 sqlite3_free(p); 2063 return SQLITE_OK; 2064 } 2065 2066 /* 2067 ** Change the limit on the number of pages allowed in the cache. 2068 ** 2069 ** The maximum number of cache pages is set to the absolute 2070 ** value of mxPage. If mxPage is negative, the pager will 2071 ** operate asynchronously - it will not stop to do fsync()s 2072 ** to insure data is written to the disk surface before 2073 ** continuing. Transactions still work if synchronous is off, 2074 ** and the database cannot be corrupted if this program 2075 ** crashes. But if the operating system crashes or there is 2076 ** an abrupt power failure when synchronous is off, the database 2077 ** could be left in an inconsistent and unrecoverable state. 2078 ** Synchronous is on by default so database corruption is not 2079 ** normally a worry. 2080 */ 2081 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 2082 BtShared *pBt = p->pBt; 2083 assert( sqlite3_mutex_held(p->db->mutex) ); 2084 sqlite3BtreeEnter(p); 2085 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 2086 sqlite3BtreeLeave(p); 2087 return SQLITE_OK; 2088 } 2089 2090 /* 2091 ** Change the way data is synced to disk in order to increase or decrease 2092 ** how well the database resists damage due to OS crashes and power 2093 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 2094 ** there is a high probability of damage) Level 2 is the default. There 2095 ** is a very low but non-zero probability of damage. Level 3 reduces the 2096 ** probability of damage to near zero but with a write performance reduction. 2097 */ 2098 #ifndef SQLITE_OMIT_PAGER_PRAGMAS 2099 int sqlite3BtreeSetSafetyLevel( 2100 Btree *p, /* The btree to set the safety level on */ 2101 int level, /* PRAGMA synchronous. 1=OFF, 2=NORMAL, 3=FULL */ 2102 int fullSync, /* PRAGMA fullfsync. */ 2103 int ckptFullSync /* PRAGMA checkpoint_fullfync */ 2104 ){ 2105 BtShared *pBt = p->pBt; 2106 assert( sqlite3_mutex_held(p->db->mutex) ); 2107 assert( level>=1 && level<=3 ); 2108 sqlite3BtreeEnter(p); 2109 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync, ckptFullSync); 2110 sqlite3BtreeLeave(p); 2111 return SQLITE_OK; 2112 } 2113 #endif 2114 2115 /* 2116 ** Return TRUE if the given btree is set to safety level 1. In other 2117 ** words, return TRUE if no sync() occurs on the disk files. 2118 */ 2119 int sqlite3BtreeSyncDisabled(Btree *p){ 2120 BtShared *pBt = p->pBt; 2121 int rc; 2122 assert( sqlite3_mutex_held(p->db->mutex) ); 2123 sqlite3BtreeEnter(p); 2124 assert( pBt && pBt->pPager ); 2125 rc = sqlite3PagerNosync(pBt->pPager); 2126 sqlite3BtreeLeave(p); 2127 return rc; 2128 } 2129 2130 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) 2131 /* 2132 ** Change the default pages size and the number of reserved bytes per page. 2133 ** Or, if the page size has already been fixed, return SQLITE_READONLY 2134 ** without changing anything. 2135 ** 2136 ** The page size must be a power of 2 between 512 and 65536. If the page 2137 ** size supplied does not meet this constraint then the page size is not 2138 ** changed. 2139 ** 2140 ** Page sizes are constrained to be a power of two so that the region 2141 ** of the database file used for locking (beginning at PENDING_BYTE, 2142 ** the first byte past the 1GB boundary, 0x40000000) needs to occur 2143 ** at the beginning of a page. 2144 ** 2145 ** If parameter nReserve is less than zero, then the number of reserved 2146 ** bytes per page is left unchanged. 2147 ** 2148 ** If the iFix!=0 then the pageSizeFixed flag is set so that the page size 2149 ** and autovacuum mode can no longer be changed. 2150 */ 2151 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ 2152 int rc = SQLITE_OK; 2153 BtShared *pBt = p->pBt; 2154 assert( nReserve>=-1 && nReserve<=255 ); 2155 sqlite3BtreeEnter(p); 2156 if( pBt->pageSizeFixed ){ 2157 sqlite3BtreeLeave(p); 2158 return SQLITE_READONLY; 2159 } 2160 if( nReserve<0 ){ 2161 nReserve = pBt->pageSize - pBt->usableSize; 2162 } 2163 assert( nReserve>=0 && nReserve<=255 ); 2164 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 2165 ((pageSize-1)&pageSize)==0 ){ 2166 assert( (pageSize & 7)==0 ); 2167 assert( !pBt->pPage1 && !pBt->pCursor ); 2168 pBt->pageSize = (u32)pageSize; 2169 freeTempSpace(pBt); 2170 } 2171 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 2172 pBt->usableSize = pBt->pageSize - (u16)nReserve; 2173 if( iFix ) pBt->pageSizeFixed = 1; 2174 sqlite3BtreeLeave(p); 2175 return rc; 2176 } 2177 2178 /* 2179 ** Return the currently defined page size 2180 */ 2181 int sqlite3BtreeGetPageSize(Btree *p){ 2182 return p->pBt->pageSize; 2183 } 2184 2185 /* 2186 ** Return the number of bytes of space at the end of every page that 2187 ** are intentually left unused. This is the "reserved" space that is 2188 ** sometimes used by extensions. 2189 */ 2190 int sqlite3BtreeGetReserve(Btree *p){ 2191 int n; 2192 sqlite3BtreeEnter(p); 2193 n = p->pBt->pageSize - p->pBt->usableSize; 2194 sqlite3BtreeLeave(p); 2195 return n; 2196 } 2197 2198 /* 2199 ** Set the maximum page count for a database if mxPage is positive. 2200 ** No changes are made if mxPage is 0 or negative. 2201 ** Regardless of the value of mxPage, return the maximum page count. 2202 */ 2203 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){ 2204 int n; 2205 sqlite3BtreeEnter(p); 2206 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 2207 sqlite3BtreeLeave(p); 2208 return n; 2209 } 2210 2211 /* 2212 ** Set the secureDelete flag if newFlag is 0 or 1. If newFlag is -1, 2213 ** then make no changes. Always return the value of the secureDelete 2214 ** setting after the change. 2215 */ 2216 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){ 2217 int b; 2218 if( p==0 ) return 0; 2219 sqlite3BtreeEnter(p); 2220 if( newFlag>=0 ){ 2221 p->pBt->secureDelete = (newFlag!=0) ? 1 : 0; 2222 } 2223 b = p->pBt->secureDelete; 2224 sqlite3BtreeLeave(p); 2225 return b; 2226 } 2227 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */ 2228 2229 /* 2230 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 2231 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 2232 ** is disabled. The default value for the auto-vacuum property is 2233 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 2234 */ 2235 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 2236 #ifdef SQLITE_OMIT_AUTOVACUUM 2237 return SQLITE_READONLY; 2238 #else 2239 BtShared *pBt = p->pBt; 2240 int rc = SQLITE_OK; 2241 u8 av = (u8)autoVacuum; 2242 2243 sqlite3BtreeEnter(p); 2244 if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){ 2245 rc = SQLITE_READONLY; 2246 }else{ 2247 pBt->autoVacuum = av ?1:0; 2248 pBt->incrVacuum = av==2 ?1:0; 2249 } 2250 sqlite3BtreeLeave(p); 2251 return rc; 2252 #endif 2253 } 2254 2255 /* 2256 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 2257 ** enabled 1 is returned. Otherwise 0. 2258 */ 2259 int sqlite3BtreeGetAutoVacuum(Btree *p){ 2260 #ifdef SQLITE_OMIT_AUTOVACUUM 2261 return BTREE_AUTOVACUUM_NONE; 2262 #else 2263 int rc; 2264 sqlite3BtreeEnter(p); 2265 rc = ( 2266 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 2267 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 2268 BTREE_AUTOVACUUM_INCR 2269 ); 2270 sqlite3BtreeLeave(p); 2271 return rc; 2272 #endif 2273 } 2274 2275 2276 /* 2277 ** Get a reference to pPage1 of the database file. This will 2278 ** also acquire a readlock on that file. 2279 ** 2280 ** SQLITE_OK is returned on success. If the file is not a 2281 ** well-formed database file, then SQLITE_CORRUPT is returned. 2282 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 2283 ** is returned if we run out of memory. 2284 */ 2285 static int lockBtree(BtShared *pBt){ 2286 int rc; /* Result code from subfunctions */ 2287 MemPage *pPage1; /* Page 1 of the database file */ 2288 int nPage; /* Number of pages in the database */ 2289 int nPageFile = 0; /* Number of pages in the database file */ 2290 int nPageHeader; /* Number of pages in the database according to hdr */ 2291 2292 assert( sqlite3_mutex_held(pBt->mutex) ); 2293 assert( pBt->pPage1==0 ); 2294 rc = sqlite3PagerSharedLock(pBt->pPager); 2295 if( rc!=SQLITE_OK ) return rc; 2296 rc = btreeGetPage(pBt, 1, &pPage1, 0); 2297 if( rc!=SQLITE_OK ) return rc; 2298 2299 /* Do some checking to help insure the file we opened really is 2300 ** a valid database file. 2301 */ 2302 nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData); 2303 sqlite3PagerPagecount(pBt->pPager, &nPageFile); 2304 if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){ 2305 nPage = nPageFile; 2306 } 2307 if( nPage>0 ){ 2308 u32 pageSize; 2309 u32 usableSize; 2310 u8 *page1 = pPage1->aData; 2311 rc = SQLITE_NOTADB; 2312 if( memcmp(page1, zMagicHeader, 16)!=0 ){ 2313 goto page1_init_failed; 2314 } 2315 2316 #ifdef SQLITE_OMIT_WAL 2317 if( page1[18]>1 ){ 2318 pBt->readOnly = 1; 2319 } 2320 if( page1[19]>1 ){ 2321 goto page1_init_failed; 2322 } 2323 #else 2324 if( page1[18]>2 ){ 2325 pBt->readOnly = 1; 2326 } 2327 if( page1[19]>2 ){ 2328 goto page1_init_failed; 2329 } 2330 2331 /* If the write version is set to 2, this database should be accessed 2332 ** in WAL mode. If the log is not already open, open it now. Then 2333 ** return SQLITE_OK and return without populating BtShared.pPage1. 2334 ** The caller detects this and calls this function again. This is 2335 ** required as the version of page 1 currently in the page1 buffer 2336 ** may not be the latest version - there may be a newer one in the log 2337 ** file. 2338 */ 2339 if( page1[19]==2 && pBt->doNotUseWAL==0 ){ 2340 int isOpen = 0; 2341 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen); 2342 if( rc!=SQLITE_OK ){ 2343 goto page1_init_failed; 2344 }else if( isOpen==0 ){ 2345 releasePage(pPage1); 2346 return SQLITE_OK; 2347 } 2348 rc = SQLITE_NOTADB; 2349 } 2350 #endif 2351 2352 /* The maximum embedded fraction must be exactly 25%. And the minimum 2353 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data. 2354 ** The original design allowed these amounts to vary, but as of 2355 ** version 3.6.0, we require them to be fixed. 2356 */ 2357 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ 2358 goto page1_init_failed; 2359 } 2360 pageSize = (page1[16]<<8) | (page1[17]<<16); 2361 if( ((pageSize-1)&pageSize)!=0 2362 || pageSize>SQLITE_MAX_PAGE_SIZE 2363 || pageSize<=256 2364 ){ 2365 goto page1_init_failed; 2366 } 2367 assert( (pageSize & 7)==0 ); 2368 usableSize = pageSize - page1[20]; 2369 if( (u32)pageSize!=pBt->pageSize ){ 2370 /* After reading the first page of the database assuming a page size 2371 ** of BtShared.pageSize, we have discovered that the page-size is 2372 ** actually pageSize. Unlock the database, leave pBt->pPage1 at 2373 ** zero and return SQLITE_OK. The caller will call this function 2374 ** again with the correct page-size. 2375 */ 2376 releasePage(pPage1); 2377 pBt->usableSize = usableSize; 2378 pBt->pageSize = pageSize; 2379 freeTempSpace(pBt); 2380 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, 2381 pageSize-usableSize); 2382 return rc; 2383 } 2384 if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){ 2385 rc = SQLITE_CORRUPT_BKPT; 2386 goto page1_init_failed; 2387 } 2388 if( usableSize<480 ){ 2389 goto page1_init_failed; 2390 } 2391 pBt->pageSize = pageSize; 2392 pBt->usableSize = usableSize; 2393 #ifndef SQLITE_OMIT_AUTOVACUUM 2394 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); 2395 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); 2396 #endif 2397 } 2398 2399 /* maxLocal is the maximum amount of payload to store locally for 2400 ** a cell. Make sure it is small enough so that at least minFanout 2401 ** cells can will fit on one page. We assume a 10-byte page header. 2402 ** Besides the payload, the cell must store: 2403 ** 2-byte pointer to the cell 2404 ** 4-byte child pointer 2405 ** 9-byte nKey value 2406 ** 4-byte nData value 2407 ** 4-byte overflow page pointer 2408 ** So a cell consists of a 2-byte pointer, a header which is as much as 2409 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow 2410 ** page pointer. 2411 */ 2412 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23); 2413 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23); 2414 pBt->maxLeaf = (u16)(pBt->usableSize - 35); 2415 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23); 2416 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); 2417 pBt->pPage1 = pPage1; 2418 pBt->nPage = nPage; 2419 return SQLITE_OK; 2420 2421 page1_init_failed: 2422 releasePage(pPage1); 2423 pBt->pPage1 = 0; 2424 return rc; 2425 } 2426 2427 /* 2428 ** If there are no outstanding cursors and we are not in the middle 2429 ** of a transaction but there is a read lock on the database, then 2430 ** this routine unrefs the first page of the database file which 2431 ** has the effect of releasing the read lock. 2432 ** 2433 ** If there is a transaction in progress, this routine is a no-op. 2434 */ 2435 static void unlockBtreeIfUnused(BtShared *pBt){ 2436 assert( sqlite3_mutex_held(pBt->mutex) ); 2437 assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE ); 2438 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){ 2439 assert( pBt->pPage1->aData ); 2440 assert( sqlite3PagerRefcount(pBt->pPager)==1 ); 2441 assert( pBt->pPage1->aData ); 2442 releasePage(pBt->pPage1); 2443 pBt->pPage1 = 0; 2444 } 2445 } 2446 2447 /* 2448 ** If pBt points to an empty file then convert that empty file 2449 ** into a new empty database by initializing the first page of 2450 ** the database. 2451 */ 2452 static int newDatabase(BtShared *pBt){ 2453 MemPage *pP1; 2454 unsigned char *data; 2455 int rc; 2456 2457 assert( sqlite3_mutex_held(pBt->mutex) ); 2458 if( pBt->nPage>0 ){ 2459 return SQLITE_OK; 2460 } 2461 pP1 = pBt->pPage1; 2462 assert( pP1!=0 ); 2463 data = pP1->aData; 2464 rc = sqlite3PagerWrite(pP1->pDbPage); 2465 if( rc ) return rc; 2466 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 2467 assert( sizeof(zMagicHeader)==16 ); 2468 data[16] = (u8)((pBt->pageSize>>8)&0xff); 2469 data[17] = (u8)((pBt->pageSize>>16)&0xff); 2470 data[18] = 1; 2471 data[19] = 1; 2472 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize); 2473 data[20] = (u8)(pBt->pageSize - pBt->usableSize); 2474 data[21] = 64; 2475 data[22] = 32; 2476 data[23] = 32; 2477 memset(&data[24], 0, 100-24); 2478 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 2479 pBt->pageSizeFixed = 1; 2480 #ifndef SQLITE_OMIT_AUTOVACUUM 2481 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 2482 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 2483 put4byte(&data[36 + 4*4], pBt->autoVacuum); 2484 put4byte(&data[36 + 7*4], pBt->incrVacuum); 2485 #endif 2486 pBt->nPage = 1; 2487 data[31] = 1; 2488 return SQLITE_OK; 2489 } 2490 2491 /* 2492 ** Attempt to start a new transaction. A write-transaction 2493 ** is started if the second argument is nonzero, otherwise a read- 2494 ** transaction. If the second argument is 2 or more and exclusive 2495 ** transaction is started, meaning that no other process is allowed 2496 ** to access the database. A preexisting transaction may not be 2497 ** upgraded to exclusive by calling this routine a second time - the 2498 ** exclusivity flag only works for a new transaction. 2499 ** 2500 ** A write-transaction must be started before attempting any 2501 ** changes to the database. None of the following routines 2502 ** will work unless a transaction is started first: 2503 ** 2504 ** sqlite3BtreeCreateTable() 2505 ** sqlite3BtreeCreateIndex() 2506 ** sqlite3BtreeClearTable() 2507 ** sqlite3BtreeDropTable() 2508 ** sqlite3BtreeInsert() 2509 ** sqlite3BtreeDelete() 2510 ** sqlite3BtreeUpdateMeta() 2511 ** 2512 ** If an initial attempt to acquire the lock fails because of lock contention 2513 ** and the database was previously unlocked, then invoke the busy handler 2514 ** if there is one. But if there was previously a read-lock, do not 2515 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is 2516 ** returned when there is already a read-lock in order to avoid a deadlock. 2517 ** 2518 ** Suppose there are two processes A and B. A has a read lock and B has 2519 ** a reserved lock. B tries to promote to exclusive but is blocked because 2520 ** of A's read lock. A tries to promote to reserved but is blocked by B. 2521 ** One or the other of the two processes must give way or there can be 2522 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback 2523 ** when A already has a read lock, we encourage A to give up and let B 2524 ** proceed. 2525 */ 2526 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ 2527 sqlite3 *pBlock = 0; 2528 BtShared *pBt = p->pBt; 2529 int rc = SQLITE_OK; 2530 2531 sqlite3BtreeEnter(p); 2532 btreeIntegrity(p); 2533 2534 /* If the btree is already in a write-transaction, or it 2535 ** is already in a read-transaction and a read-transaction 2536 ** is requested, this is a no-op. 2537 */ 2538 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 2539 goto trans_begun; 2540 } 2541 2542 /* Write transactions are not possible on a read-only database */ 2543 if( pBt->readOnly && wrflag ){ 2544 rc = SQLITE_READONLY; 2545 goto trans_begun; 2546 } 2547 2548 #ifndef SQLITE_OMIT_SHARED_CACHE 2549 /* If another database handle has already opened a write transaction 2550 ** on this shared-btree structure and a second write transaction is 2551 ** requested, return SQLITE_LOCKED. 2552 */ 2553 if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){ 2554 pBlock = pBt->pWriter->db; 2555 }else if( wrflag>1 ){ 2556 BtLock *pIter; 2557 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 2558 if( pIter->pBtree!=p ){ 2559 pBlock = pIter->pBtree->db; 2560 break; 2561 } 2562 } 2563 } 2564 if( pBlock ){ 2565 sqlite3ConnectionBlocked(p->db, pBlock); 2566 rc = SQLITE_LOCKED_SHAREDCACHE; 2567 goto trans_begun; 2568 } 2569 #endif 2570 2571 /* Any read-only or read-write transaction implies a read-lock on 2572 ** page 1. So if some other shared-cache client already has a write-lock 2573 ** on page 1, the transaction cannot be opened. */ 2574 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); 2575 if( SQLITE_OK!=rc ) goto trans_begun; 2576 2577 pBt->initiallyEmpty = (u8)(pBt->nPage==0); 2578 do { 2579 /* Call lockBtree() until either pBt->pPage1 is populated or 2580 ** lockBtree() returns something other than SQLITE_OK. lockBtree() 2581 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after 2582 ** reading page 1 it discovers that the page-size of the database 2583 ** file is not pBt->pageSize. In this case lockBtree() will update 2584 ** pBt->pageSize to the page-size of the file on disk. 2585 */ 2586 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) ); 2587 2588 if( rc==SQLITE_OK && wrflag ){ 2589 if( pBt->readOnly ){ 2590 rc = SQLITE_READONLY; 2591 }else{ 2592 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db)); 2593 if( rc==SQLITE_OK ){ 2594 rc = newDatabase(pBt); 2595 } 2596 } 2597 } 2598 2599 if( rc!=SQLITE_OK ){ 2600 unlockBtreeIfUnused(pBt); 2601 } 2602 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 2603 btreeInvokeBusyHandler(pBt) ); 2604 2605 if( rc==SQLITE_OK ){ 2606 if( p->inTrans==TRANS_NONE ){ 2607 pBt->nTransaction++; 2608 #ifndef SQLITE_OMIT_SHARED_CACHE 2609 if( p->sharable ){ 2610 assert( p->lock.pBtree==p && p->lock.iTable==1 ); 2611 p->lock.eLock = READ_LOCK; 2612 p->lock.pNext = pBt->pLock; 2613 pBt->pLock = &p->lock; 2614 } 2615 #endif 2616 } 2617 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 2618 if( p->inTrans>pBt->inTransaction ){ 2619 pBt->inTransaction = p->inTrans; 2620 } 2621 if( wrflag ){ 2622 MemPage *pPage1 = pBt->pPage1; 2623 #ifndef SQLITE_OMIT_SHARED_CACHE 2624 assert( !pBt->pWriter ); 2625 pBt->pWriter = p; 2626 pBt->isExclusive = (u8)(wrflag>1); 2627 #endif 2628 2629 /* If the db-size header field is incorrect (as it may be if an old 2630 ** client has been writing the database file), update it now. Doing 2631 ** this sooner rather than later means the database size can safely 2632 ** re-read the database size from page 1 if a savepoint or transaction 2633 ** rollback occurs within the transaction. 2634 */ 2635 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){ 2636 rc = sqlite3PagerWrite(pPage1->pDbPage); 2637 if( rc==SQLITE_OK ){ 2638 put4byte(&pPage1->aData[28], pBt->nPage); 2639 } 2640 } 2641 } 2642 } 2643 2644 2645 trans_begun: 2646 if( rc==SQLITE_OK && wrflag ){ 2647 /* This call makes sure that the pager has the correct number of 2648 ** open savepoints. If the second parameter is greater than 0 and 2649 ** the sub-journal is not already open, then it will be opened here. 2650 */ 2651 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); 2652 } 2653 2654 btreeIntegrity(p); 2655 sqlite3BtreeLeave(p); 2656 return rc; 2657 } 2658 2659 #ifndef SQLITE_OMIT_AUTOVACUUM 2660 2661 /* 2662 ** Set the pointer-map entries for all children of page pPage. Also, if 2663 ** pPage contains cells that point to overflow pages, set the pointer 2664 ** map entries for the overflow pages as well. 2665 */ 2666 static int setChildPtrmaps(MemPage *pPage){ 2667 int i; /* Counter variable */ 2668 int nCell; /* Number of cells in page pPage */ 2669 int rc; /* Return code */ 2670 BtShared *pBt = pPage->pBt; 2671 u8 isInitOrig = pPage->isInit; 2672 Pgno pgno = pPage->pgno; 2673 2674 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 2675 rc = btreeInitPage(pPage); 2676 if( rc!=SQLITE_OK ){ 2677 goto set_child_ptrmaps_out; 2678 } 2679 nCell = pPage->nCell; 2680 2681 for(i=0; i<nCell; i++){ 2682 u8 *pCell = findCell(pPage, i); 2683 2684 ptrmapPutOvflPtr(pPage, pCell, &rc); 2685 2686 if( !pPage->leaf ){ 2687 Pgno childPgno = get4byte(pCell); 2688 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 2689 } 2690 } 2691 2692 if( !pPage->leaf ){ 2693 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 2694 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 2695 } 2696 2697 set_child_ptrmaps_out: 2698 pPage->isInit = isInitOrig; 2699 return rc; 2700 } 2701 2702 /* 2703 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so 2704 ** that it points to iTo. Parameter eType describes the type of pointer to 2705 ** be modified, as follows: 2706 ** 2707 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 2708 ** page of pPage. 2709 ** 2710 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 2711 ** page pointed to by one of the cells on pPage. 2712 ** 2713 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 2714 ** overflow page in the list. 2715 */ 2716 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 2717 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 2718 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 2719 if( eType==PTRMAP_OVERFLOW2 ){ 2720 /* The pointer is always the first 4 bytes of the page in this case. */ 2721 if( get4byte(pPage->aData)!=iFrom ){ 2722 return SQLITE_CORRUPT_BKPT; 2723 } 2724 put4byte(pPage->aData, iTo); 2725 }else{ 2726 u8 isInitOrig = pPage->isInit; 2727 int i; 2728 int nCell; 2729 2730 btreeInitPage(pPage); 2731 nCell = pPage->nCell; 2732 2733 for(i=0; i<nCell; i++){ 2734 u8 *pCell = findCell(pPage, i); 2735 if( eType==PTRMAP_OVERFLOW1 ){ 2736 CellInfo info; 2737 btreeParseCellPtr(pPage, pCell, &info); 2738 if( info.iOverflow ){ 2739 if( iFrom==get4byte(&pCell[info.iOverflow]) ){ 2740 put4byte(&pCell[info.iOverflow], iTo); 2741 break; 2742 } 2743 } 2744 }else{ 2745 if( get4byte(pCell)==iFrom ){ 2746 put4byte(pCell, iTo); 2747 break; 2748 } 2749 } 2750 } 2751 2752 if( i==nCell ){ 2753 if( eType!=PTRMAP_BTREE || 2754 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 2755 return SQLITE_CORRUPT_BKPT; 2756 } 2757 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 2758 } 2759 2760 pPage->isInit = isInitOrig; 2761 } 2762 return SQLITE_OK; 2763 } 2764 2765 2766 /* 2767 ** Move the open database page pDbPage to location iFreePage in the 2768 ** database. The pDbPage reference remains valid. 2769 ** 2770 ** The isCommit flag indicates that there is no need to remember that 2771 ** the journal needs to be sync()ed before database page pDbPage->pgno 2772 ** can be written to. The caller has already promised not to write to that 2773 ** page. 2774 */ 2775 static int relocatePage( 2776 BtShared *pBt, /* Btree */ 2777 MemPage *pDbPage, /* Open page to move */ 2778 u8 eType, /* Pointer map 'type' entry for pDbPage */ 2779 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 2780 Pgno iFreePage, /* The location to move pDbPage to */ 2781 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */ 2782 ){ 2783 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 2784 Pgno iDbPage = pDbPage->pgno; 2785 Pager *pPager = pBt->pPager; 2786 int rc; 2787 2788 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 2789 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 2790 assert( sqlite3_mutex_held(pBt->mutex) ); 2791 assert( pDbPage->pBt==pBt ); 2792 2793 /* Move page iDbPage from its current location to page number iFreePage */ 2794 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 2795 iDbPage, iFreePage, iPtrPage, eType)); 2796 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); 2797 if( rc!=SQLITE_OK ){ 2798 return rc; 2799 } 2800 pDbPage->pgno = iFreePage; 2801 2802 /* If pDbPage was a btree-page, then it may have child pages and/or cells 2803 ** that point to overflow pages. The pointer map entries for all these 2804 ** pages need to be changed. 2805 ** 2806 ** If pDbPage is an overflow page, then the first 4 bytes may store a 2807 ** pointer to a subsequent overflow page. If this is the case, then 2808 ** the pointer map needs to be updated for the subsequent overflow page. 2809 */ 2810 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 2811 rc = setChildPtrmaps(pDbPage); 2812 if( rc!=SQLITE_OK ){ 2813 return rc; 2814 } 2815 }else{ 2816 Pgno nextOvfl = get4byte(pDbPage->aData); 2817 if( nextOvfl!=0 ){ 2818 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc); 2819 if( rc!=SQLITE_OK ){ 2820 return rc; 2821 } 2822 } 2823 } 2824 2825 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 2826 ** that it points at iFreePage. Also fix the pointer map entry for 2827 ** iPtrPage. 2828 */ 2829 if( eType!=PTRMAP_ROOTPAGE ){ 2830 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 2831 if( rc!=SQLITE_OK ){ 2832 return rc; 2833 } 2834 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 2835 if( rc!=SQLITE_OK ){ 2836 releasePage(pPtrPage); 2837 return rc; 2838 } 2839 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 2840 releasePage(pPtrPage); 2841 if( rc==SQLITE_OK ){ 2842 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc); 2843 } 2844 } 2845 return rc; 2846 } 2847 2848 /* Forward declaration required by incrVacuumStep(). */ 2849 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 2850 2851 /* 2852 ** Perform a single step of an incremental-vacuum. If successful, 2853 ** return SQLITE_OK. If there is no work to do (and therefore no 2854 ** point in calling this function again), return SQLITE_DONE. 2855 ** 2856 ** More specificly, this function attempts to re-organize the 2857 ** database so that the last page of the file currently in use 2858 ** is no longer in use. 2859 ** 2860 ** If the nFin parameter is non-zero, this function assumes 2861 ** that the caller will keep calling incrVacuumStep() until 2862 ** it returns SQLITE_DONE or an error, and that nFin is the 2863 ** number of pages the database file will contain after this 2864 ** process is complete. If nFin is zero, it is assumed that 2865 ** incrVacuumStep() will be called a finite amount of times 2866 ** which may or may not empty the freelist. A full autovacuum 2867 ** has nFin>0. A "PRAGMA incremental_vacuum" has nFin==0. 2868 */ 2869 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){ 2870 Pgno nFreeList; /* Number of pages still on the free-list */ 2871 int rc; 2872 2873 assert( sqlite3_mutex_held(pBt->mutex) ); 2874 assert( iLastPg>nFin ); 2875 2876 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ 2877 u8 eType; 2878 Pgno iPtrPage; 2879 2880 nFreeList = get4byte(&pBt->pPage1->aData[36]); 2881 if( nFreeList==0 ){ 2882 return SQLITE_DONE; 2883 } 2884 2885 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); 2886 if( rc!=SQLITE_OK ){ 2887 return rc; 2888 } 2889 if( eType==PTRMAP_ROOTPAGE ){ 2890 return SQLITE_CORRUPT_BKPT; 2891 } 2892 2893 if( eType==PTRMAP_FREEPAGE ){ 2894 if( nFin==0 ){ 2895 /* Remove the page from the files free-list. This is not required 2896 ** if nFin is non-zero. In that case, the free-list will be 2897 ** truncated to zero after this function returns, so it doesn't 2898 ** matter if it still contains some garbage entries. 2899 */ 2900 Pgno iFreePg; 2901 MemPage *pFreePg; 2902 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1); 2903 if( rc!=SQLITE_OK ){ 2904 return rc; 2905 } 2906 assert( iFreePg==iLastPg ); 2907 releasePage(pFreePg); 2908 } 2909 } else { 2910 Pgno iFreePg; /* Index of free page to move pLastPg to */ 2911 MemPage *pLastPg; 2912 2913 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0); 2914 if( rc!=SQLITE_OK ){ 2915 return rc; 2916 } 2917 2918 /* If nFin is zero, this loop runs exactly once and page pLastPg 2919 ** is swapped with the first free page pulled off the free list. 2920 ** 2921 ** On the other hand, if nFin is greater than zero, then keep 2922 ** looping until a free-page located within the first nFin pages 2923 ** of the file is found. 2924 */ 2925 do { 2926 MemPage *pFreePg; 2927 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0); 2928 if( rc!=SQLITE_OK ){ 2929 releasePage(pLastPg); 2930 return rc; 2931 } 2932 releasePage(pFreePg); 2933 }while( nFin!=0 && iFreePg>nFin ); 2934 assert( iFreePg<iLastPg ); 2935 2936 rc = sqlite3PagerWrite(pLastPg->pDbPage); 2937 if( rc==SQLITE_OK ){ 2938 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0); 2939 } 2940 releasePage(pLastPg); 2941 if( rc!=SQLITE_OK ){ 2942 return rc; 2943 } 2944 } 2945 } 2946 2947 if( nFin==0 ){ 2948 iLastPg--; 2949 while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){ 2950 if( PTRMAP_ISPAGE(pBt, iLastPg) ){ 2951 MemPage *pPg; 2952 rc = btreeGetPage(pBt, iLastPg, &pPg, 0); 2953 if( rc!=SQLITE_OK ){ 2954 return rc; 2955 } 2956 rc = sqlite3PagerWrite(pPg->pDbPage); 2957 releasePage(pPg); 2958 if( rc!=SQLITE_OK ){ 2959 return rc; 2960 } 2961 } 2962 iLastPg--; 2963 } 2964 sqlite3PagerTruncateImage(pBt->pPager, iLastPg); 2965 pBt->nPage = iLastPg; 2966 } 2967 return SQLITE_OK; 2968 } 2969 2970 /* 2971 ** A write-transaction must be opened before calling this function. 2972 ** It performs a single unit of work towards an incremental vacuum. 2973 ** 2974 ** If the incremental vacuum is finished after this function has run, 2975 ** SQLITE_DONE is returned. If it is not finished, but no error occurred, 2976 ** SQLITE_OK is returned. Otherwise an SQLite error code. 2977 */ 2978 int sqlite3BtreeIncrVacuum(Btree *p){ 2979 int rc; 2980 BtShared *pBt = p->pBt; 2981 2982 sqlite3BtreeEnter(p); 2983 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); 2984 if( !pBt->autoVacuum ){ 2985 rc = SQLITE_DONE; 2986 }else{ 2987 invalidateAllOverflowCache(pBt); 2988 rc = incrVacuumStep(pBt, 0, btreePagecount(pBt)); 2989 if( rc==SQLITE_OK ){ 2990 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 2991 put4byte(&pBt->pPage1->aData[28], pBt->nPage); 2992 } 2993 } 2994 sqlite3BtreeLeave(p); 2995 return rc; 2996 } 2997 2998 /* 2999 ** This routine is called prior to sqlite3PagerCommit when a transaction 3000 ** is commited for an auto-vacuum database. 3001 ** 3002 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages 3003 ** the database file should be truncated to during the commit process. 3004 ** i.e. the database has been reorganized so that only the first *pnTrunc 3005 ** pages are in use. 3006 */ 3007 static int autoVacuumCommit(BtShared *pBt){ 3008 int rc = SQLITE_OK; 3009 Pager *pPager = pBt->pPager; 3010 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) ); 3011 3012 assert( sqlite3_mutex_held(pBt->mutex) ); 3013 invalidateAllOverflowCache(pBt); 3014 assert(pBt->autoVacuum); 3015 if( !pBt->incrVacuum ){ 3016 Pgno nFin; /* Number of pages in database after autovacuuming */ 3017 Pgno nFree; /* Number of pages on the freelist initially */ 3018 Pgno nPtrmap; /* Number of PtrMap pages to be freed */ 3019 Pgno iFree; /* The next page to be freed */ 3020 int nEntry; /* Number of entries on one ptrmap page */ 3021 Pgno nOrig; /* Database size before freeing */ 3022 3023 nOrig = btreePagecount(pBt); 3024 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){ 3025 /* It is not possible to create a database for which the final page 3026 ** is either a pointer-map page or the pending-byte page. If one 3027 ** is encountered, this indicates corruption. 3028 */ 3029 return SQLITE_CORRUPT_BKPT; 3030 } 3031 3032 nFree = get4byte(&pBt->pPage1->aData[36]); 3033 nEntry = pBt->usableSize/5; 3034 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry; 3035 nFin = nOrig - nFree - nPtrmap; 3036 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){ 3037 nFin--; 3038 } 3039 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 3040 nFin--; 3041 } 3042 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT; 3043 3044 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ 3045 rc = incrVacuumStep(pBt, nFin, iFree); 3046 } 3047 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ 3048 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 3049 put4byte(&pBt->pPage1->aData[32], 0); 3050 put4byte(&pBt->pPage1->aData[36], 0); 3051 put4byte(&pBt->pPage1->aData[28], nFin); 3052 sqlite3PagerTruncateImage(pBt->pPager, nFin); 3053 pBt->nPage = nFin; 3054 } 3055 if( rc!=SQLITE_OK ){ 3056 sqlite3PagerRollback(pPager); 3057 } 3058 } 3059 3060 assert( nRef==sqlite3PagerRefcount(pPager) ); 3061 return rc; 3062 } 3063 3064 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */ 3065 # define setChildPtrmaps(x) SQLITE_OK 3066 #endif 3067 3068 /* 3069 ** This routine does the first phase of a two-phase commit. This routine 3070 ** causes a rollback journal to be created (if it does not already exist) 3071 ** and populated with enough information so that if a power loss occurs 3072 ** the database can be restored to its original state by playing back 3073 ** the journal. Then the contents of the journal are flushed out to 3074 ** the disk. After the journal is safely on oxide, the changes to the 3075 ** database are written into the database file and flushed to oxide. 3076 ** At the end of this call, the rollback journal still exists on the 3077 ** disk and we are still holding all locks, so the transaction has not 3078 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the 3079 ** commit process. 3080 ** 3081 ** This call is a no-op if no write-transaction is currently active on pBt. 3082 ** 3083 ** Otherwise, sync the database file for the btree pBt. zMaster points to 3084 ** the name of a master journal file that should be written into the 3085 ** individual journal file, or is NULL, indicating no master journal file 3086 ** (single database transaction). 3087 ** 3088 ** When this is called, the master journal should already have been 3089 ** created, populated with this journal pointer and synced to disk. 3090 ** 3091 ** Once this is routine has returned, the only thing required to commit 3092 ** the write-transaction for this database file is to delete the journal. 3093 */ 3094 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){ 3095 int rc = SQLITE_OK; 3096 if( p->inTrans==TRANS_WRITE ){ 3097 BtShared *pBt = p->pBt; 3098 sqlite3BtreeEnter(p); 3099 #ifndef SQLITE_OMIT_AUTOVACUUM 3100 if( pBt->autoVacuum ){ 3101 rc = autoVacuumCommit(pBt); 3102 if( rc!=SQLITE_OK ){ 3103 sqlite3BtreeLeave(p); 3104 return rc; 3105 } 3106 } 3107 #endif 3108 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0); 3109 sqlite3BtreeLeave(p); 3110 } 3111 return rc; 3112 } 3113 3114 /* 3115 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback() 3116 ** at the conclusion of a transaction. 3117 */ 3118 static void btreeEndTransaction(Btree *p){ 3119 BtShared *pBt = p->pBt; 3120 assert( sqlite3BtreeHoldsMutex(p) ); 3121 3122 btreeClearHasContent(pBt); 3123 if( p->inTrans>TRANS_NONE && p->db->activeVdbeCnt>1 ){ 3124 /* If there are other active statements that belong to this database 3125 ** handle, downgrade to a read-only transaction. The other statements 3126 ** may still be reading from the database. */ 3127 downgradeAllSharedCacheTableLocks(p); 3128 p->inTrans = TRANS_READ; 3129 }else{ 3130 /* If the handle had any kind of transaction open, decrement the 3131 ** transaction count of the shared btree. If the transaction count 3132 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused() 3133 ** call below will unlock the pager. */ 3134 if( p->inTrans!=TRANS_NONE ){ 3135 clearAllSharedCacheTableLocks(p); 3136 pBt->nTransaction--; 3137 if( 0==pBt->nTransaction ){ 3138 pBt->inTransaction = TRANS_NONE; 3139 } 3140 } 3141 3142 /* Set the current transaction state to TRANS_NONE and unlock the 3143 ** pager if this call closed the only read or write transaction. */ 3144 p->inTrans = TRANS_NONE; 3145 unlockBtreeIfUnused(pBt); 3146 } 3147 3148 btreeIntegrity(p); 3149 } 3150 3151 /* 3152 ** Commit the transaction currently in progress. 3153 ** 3154 ** This routine implements the second phase of a 2-phase commit. The 3155 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should 3156 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne() 3157 ** routine did all the work of writing information out to disk and flushing the 3158 ** contents so that they are written onto the disk platter. All this 3159 ** routine has to do is delete or truncate or zero the header in the 3160 ** the rollback journal (which causes the transaction to commit) and 3161 ** drop locks. 3162 ** 3163 ** Normally, if an error occurs while the pager layer is attempting to 3164 ** finalize the underlying journal file, this function returns an error and 3165 ** the upper layer will attempt a rollback. However, if the second argument 3166 ** is non-zero then this b-tree transaction is part of a multi-file 3167 ** transaction. In this case, the transaction has already been committed 3168 ** (by deleting a master journal file) and the caller will ignore this 3169 ** functions return code. So, even if an error occurs in the pager layer, 3170 ** reset the b-tree objects internal state to indicate that the write 3171 ** transaction has been closed. This is quite safe, as the pager will have 3172 ** transitioned to the error state. 3173 ** 3174 ** This will release the write lock on the database file. If there 3175 ** are no active cursors, it also releases the read lock. 3176 */ 3177 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){ 3178 3179 if( p->inTrans==TRANS_NONE ) return SQLITE_OK; 3180 sqlite3BtreeEnter(p); 3181 btreeIntegrity(p); 3182 3183 /* If the handle has a write-transaction open, commit the shared-btrees 3184 ** transaction and set the shared state to TRANS_READ. 3185 */ 3186 if( p->inTrans==TRANS_WRITE ){ 3187 int rc; 3188 BtShared *pBt = p->pBt; 3189 assert( pBt->inTransaction==TRANS_WRITE ); 3190 assert( pBt->nTransaction>0 ); 3191 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 3192 if( rc!=SQLITE_OK && bCleanup==0 ){ 3193 sqlite3BtreeLeave(p); 3194 return rc; 3195 } 3196 pBt->inTransaction = TRANS_READ; 3197 } 3198 3199 btreeEndTransaction(p); 3200 sqlite3BtreeLeave(p); 3201 return SQLITE_OK; 3202 } 3203 3204 /* 3205 ** Do both phases of a commit. 3206 */ 3207 int sqlite3BtreeCommit(Btree *p){ 3208 int rc; 3209 sqlite3BtreeEnter(p); 3210 rc = sqlite3BtreeCommitPhaseOne(p, 0); 3211 if( rc==SQLITE_OK ){ 3212 rc = sqlite3BtreeCommitPhaseTwo(p, 0); 3213 } 3214 sqlite3BtreeLeave(p); 3215 return rc; 3216 } 3217 3218 #ifndef NDEBUG 3219 /* 3220 ** Return the number of write-cursors open on this handle. This is for use 3221 ** in assert() expressions, so it is only compiled if NDEBUG is not 3222 ** defined. 3223 ** 3224 ** For the purposes of this routine, a write-cursor is any cursor that 3225 ** is capable of writing to the databse. That means the cursor was 3226 ** originally opened for writing and the cursor has not be disabled 3227 ** by having its state changed to CURSOR_FAULT. 3228 */ 3229 static int countWriteCursors(BtShared *pBt){ 3230 BtCursor *pCur; 3231 int r = 0; 3232 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 3233 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; 3234 } 3235 return r; 3236 } 3237 #endif 3238 3239 /* 3240 ** This routine sets the state to CURSOR_FAULT and the error 3241 ** code to errCode for every cursor on BtShared that pBtree 3242 ** references. 3243 ** 3244 ** Every cursor is tripped, including cursors that belong 3245 ** to other database connections that happen to be sharing 3246 ** the cache with pBtree. 3247 ** 3248 ** This routine gets called when a rollback occurs. 3249 ** All cursors using the same cache must be tripped 3250 ** to prevent them from trying to use the btree after 3251 ** the rollback. The rollback may have deleted tables 3252 ** or moved root pages, so it is not sufficient to 3253 ** save the state of the cursor. The cursor must be 3254 ** invalidated. 3255 */ 3256 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){ 3257 BtCursor *p; 3258 sqlite3BtreeEnter(pBtree); 3259 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 3260 int i; 3261 sqlite3BtreeClearCursor(p); 3262 p->eState = CURSOR_FAULT; 3263 p->skipNext = errCode; 3264 for(i=0; i<=p->iPage; i++){ 3265 releasePage(p->apPage[i]); 3266 p->apPage[i] = 0; 3267 } 3268 } 3269 sqlite3BtreeLeave(pBtree); 3270 } 3271 3272 /* 3273 ** Rollback the transaction in progress. All cursors will be 3274 ** invalided by this operation. Any attempt to use a cursor 3275 ** that was open at the beginning of this operation will result 3276 ** in an error. 3277 ** 3278 ** This will release the write lock on the database file. If there 3279 ** are no active cursors, it also releases the read lock. 3280 */ 3281 int sqlite3BtreeRollback(Btree *p){ 3282 int rc; 3283 BtShared *pBt = p->pBt; 3284 MemPage *pPage1; 3285 3286 sqlite3BtreeEnter(p); 3287 rc = saveAllCursors(pBt, 0, 0); 3288 #ifndef SQLITE_OMIT_SHARED_CACHE 3289 if( rc!=SQLITE_OK ){ 3290 /* This is a horrible situation. An IO or malloc() error occurred whilst 3291 ** trying to save cursor positions. If this is an automatic rollback (as 3292 ** the result of a constraint, malloc() failure or IO error) then 3293 ** the cache may be internally inconsistent (not contain valid trees) so 3294 ** we cannot simply return the error to the caller. Instead, abort 3295 ** all queries that may be using any of the cursors that failed to save. 3296 */ 3297 sqlite3BtreeTripAllCursors(p, rc); 3298 } 3299 #endif 3300 btreeIntegrity(p); 3301 3302 if( p->inTrans==TRANS_WRITE ){ 3303 int rc2; 3304 3305 assert( TRANS_WRITE==pBt->inTransaction ); 3306 rc2 = sqlite3PagerRollback(pBt->pPager); 3307 if( rc2!=SQLITE_OK ){ 3308 rc = rc2; 3309 } 3310 3311 /* The rollback may have destroyed the pPage1->aData value. So 3312 ** call btreeGetPage() on page 1 again to make 3313 ** sure pPage1->aData is set correctly. */ 3314 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 3315 int nPage = get4byte(28+(u8*)pPage1->aData); 3316 testcase( nPage==0 ); 3317 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage); 3318 testcase( pBt->nPage!=nPage ); 3319 pBt->nPage = nPage; 3320 releasePage(pPage1); 3321 } 3322 assert( countWriteCursors(pBt)==0 ); 3323 pBt->inTransaction = TRANS_READ; 3324 } 3325 3326 btreeEndTransaction(p); 3327 sqlite3BtreeLeave(p); 3328 return rc; 3329 } 3330 3331 /* 3332 ** Start a statement subtransaction. The subtransaction can can be rolled 3333 ** back independently of the main transaction. You must start a transaction 3334 ** before starting a subtransaction. The subtransaction is ended automatically 3335 ** if the main transaction commits or rolls back. 3336 ** 3337 ** Statement subtransactions are used around individual SQL statements 3338 ** that are contained within a BEGIN...COMMIT block. If a constraint 3339 ** error occurs within the statement, the effect of that one statement 3340 ** can be rolled back without having to rollback the entire transaction. 3341 ** 3342 ** A statement sub-transaction is implemented as an anonymous savepoint. The 3343 ** value passed as the second parameter is the total number of savepoints, 3344 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there 3345 ** are no active savepoints and no other statement-transactions open, 3346 ** iStatement is 1. This anonymous savepoint can be released or rolled back 3347 ** using the sqlite3BtreeSavepoint() function. 3348 */ 3349 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ 3350 int rc; 3351 BtShared *pBt = p->pBt; 3352 sqlite3BtreeEnter(p); 3353 assert( p->inTrans==TRANS_WRITE ); 3354 assert( pBt->readOnly==0 ); 3355 assert( iStatement>0 ); 3356 assert( iStatement>p->db->nSavepoint ); 3357 assert( pBt->inTransaction==TRANS_WRITE ); 3358 /* At the pager level, a statement transaction is a savepoint with 3359 ** an index greater than all savepoints created explicitly using 3360 ** SQL statements. It is illegal to open, release or rollback any 3361 ** such savepoints while the statement transaction savepoint is active. 3362 */ 3363 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement); 3364 sqlite3BtreeLeave(p); 3365 return rc; 3366 } 3367 3368 /* 3369 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK 3370 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the 3371 ** savepoint identified by parameter iSavepoint, depending on the value 3372 ** of op. 3373 ** 3374 ** Normally, iSavepoint is greater than or equal to zero. However, if op is 3375 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 3376 ** contents of the entire transaction are rolled back. This is different 3377 ** from a normal transaction rollback, as no locks are released and the 3378 ** transaction remains open. 3379 */ 3380 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ 3381 int rc = SQLITE_OK; 3382 if( p && p->inTrans==TRANS_WRITE ){ 3383 BtShared *pBt = p->pBt; 3384 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); 3385 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) ); 3386 sqlite3BtreeEnter(p); 3387 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint); 3388 if( rc==SQLITE_OK ){ 3389 if( iSavepoint<0 && pBt->initiallyEmpty ) pBt->nPage = 0; 3390 rc = newDatabase(pBt); 3391 pBt->nPage = get4byte(28 + pBt->pPage1->aData); 3392 3393 /* The database size was written into the offset 28 of the header 3394 ** when the transaction started, so we know that the value at offset 3395 ** 28 is nonzero. */ 3396 assert( pBt->nPage>0 ); 3397 } 3398 sqlite3BtreeLeave(p); 3399 } 3400 return rc; 3401 } 3402 3403 /* 3404 ** Create a new cursor for the BTree whose root is on the page 3405 ** iTable. If a read-only cursor is requested, it is assumed that 3406 ** the caller already has at least a read-only transaction open 3407 ** on the database already. If a write-cursor is requested, then 3408 ** the caller is assumed to have an open write transaction. 3409 ** 3410 ** If wrFlag==0, then the cursor can only be used for reading. 3411 ** If wrFlag==1, then the cursor can be used for reading or for 3412 ** writing if other conditions for writing are also met. These 3413 ** are the conditions that must be met in order for writing to 3414 ** be allowed: 3415 ** 3416 ** 1: The cursor must have been opened with wrFlag==1 3417 ** 3418 ** 2: Other database connections that share the same pager cache 3419 ** but which are not in the READ_UNCOMMITTED state may not have 3420 ** cursors open with wrFlag==0 on the same table. Otherwise 3421 ** the changes made by this write cursor would be visible to 3422 ** the read cursors in the other database connection. 3423 ** 3424 ** 3: The database must be writable (not on read-only media) 3425 ** 3426 ** 4: There must be an active transaction. 3427 ** 3428 ** No checking is done to make sure that page iTable really is the 3429 ** root page of a b-tree. If it is not, then the cursor acquired 3430 ** will not work correctly. 3431 ** 3432 ** It is assumed that the sqlite3BtreeCursorZero() has been called 3433 ** on pCur to initialize the memory space prior to invoking this routine. 3434 */ 3435 static int btreeCursor( 3436 Btree *p, /* The btree */ 3437 int iTable, /* Root page of table to open */ 3438 int wrFlag, /* 1 to write. 0 read-only */ 3439 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 3440 BtCursor *pCur /* Space for new cursor */ 3441 ){ 3442 BtShared *pBt = p->pBt; /* Shared b-tree handle */ 3443 3444 assert( sqlite3BtreeHoldsMutex(p) ); 3445 assert( wrFlag==0 || wrFlag==1 ); 3446 3447 /* The following assert statements verify that if this is a sharable 3448 ** b-tree database, the connection is holding the required table locks, 3449 ** and that no other connection has any open cursor that conflicts with 3450 ** this lock. */ 3451 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) ); 3452 assert( wrFlag==0 || !hasReadConflicts(p, iTable) ); 3453 3454 /* Assert that the caller has opened the required transaction. */ 3455 assert( p->inTrans>TRANS_NONE ); 3456 assert( wrFlag==0 || p->inTrans==TRANS_WRITE ); 3457 assert( pBt->pPage1 && pBt->pPage1->aData ); 3458 3459 if( NEVER(wrFlag && pBt->readOnly) ){ 3460 return SQLITE_READONLY; 3461 } 3462 if( iTable==1 && btreePagecount(pBt)==0 ){ 3463 return SQLITE_EMPTY; 3464 } 3465 3466 /* Now that no other errors can occur, finish filling in the BtCursor 3467 ** variables and link the cursor into the BtShared list. */ 3468 pCur->pgnoRoot = (Pgno)iTable; 3469 pCur->iPage = -1; 3470 pCur->pKeyInfo = pKeyInfo; 3471 pCur->pBtree = p; 3472 pCur->pBt = pBt; 3473 pCur->wrFlag = (u8)wrFlag; 3474 pCur->pNext = pBt->pCursor; 3475 if( pCur->pNext ){ 3476 pCur->pNext->pPrev = pCur; 3477 } 3478 pBt->pCursor = pCur; 3479 pCur->eState = CURSOR_INVALID; 3480 pCur->cachedRowid = 0; 3481 return SQLITE_OK; 3482 } 3483 int sqlite3BtreeCursor( 3484 Btree *p, /* The btree */ 3485 int iTable, /* Root page of table to open */ 3486 int wrFlag, /* 1 to write. 0 read-only */ 3487 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */ 3488 BtCursor *pCur /* Write new cursor here */ 3489 ){ 3490 int rc; 3491 sqlite3BtreeEnter(p); 3492 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 3493 sqlite3BtreeLeave(p); 3494 return rc; 3495 } 3496 3497 /* 3498 ** Return the size of a BtCursor object in bytes. 3499 ** 3500 ** This interfaces is needed so that users of cursors can preallocate 3501 ** sufficient storage to hold a cursor. The BtCursor object is opaque 3502 ** to users so they cannot do the sizeof() themselves - they must call 3503 ** this routine. 3504 */ 3505 int sqlite3BtreeCursorSize(void){ 3506 return ROUND8(sizeof(BtCursor)); 3507 } 3508 3509 /* 3510 ** Initialize memory that will be converted into a BtCursor object. 3511 ** 3512 ** The simple approach here would be to memset() the entire object 3513 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays 3514 ** do not need to be zeroed and they are large, so we can save a lot 3515 ** of run-time by skipping the initialization of those elements. 3516 */ 3517 void sqlite3BtreeCursorZero(BtCursor *p){ 3518 memset(p, 0, offsetof(BtCursor, iPage)); 3519 } 3520 3521 /* 3522 ** Set the cached rowid value of every cursor in the same database file 3523 ** as pCur and having the same root page number as pCur. The value is 3524 ** set to iRowid. 3525 ** 3526 ** Only positive rowid values are considered valid for this cache. 3527 ** The cache is initialized to zero, indicating an invalid cache. 3528 ** A btree will work fine with zero or negative rowids. We just cannot 3529 ** cache zero or negative rowids, which means tables that use zero or 3530 ** negative rowids might run a little slower. But in practice, zero 3531 ** or negative rowids are very uncommon so this should not be a problem. 3532 */ 3533 void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){ 3534 BtCursor *p; 3535 for(p=pCur->pBt->pCursor; p; p=p->pNext){ 3536 if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid; 3537 } 3538 assert( pCur->cachedRowid==iRowid ); 3539 } 3540 3541 /* 3542 ** Return the cached rowid for the given cursor. A negative or zero 3543 ** return value indicates that the rowid cache is invalid and should be 3544 ** ignored. If the rowid cache has never before been set, then a 3545 ** zero is returned. 3546 */ 3547 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){ 3548 return pCur->cachedRowid; 3549 } 3550 3551 /* 3552 ** Close a cursor. The read lock on the database file is released 3553 ** when the last cursor is closed. 3554 */ 3555 int sqlite3BtreeCloseCursor(BtCursor *pCur){ 3556 Btree *pBtree = pCur->pBtree; 3557 if( pBtree ){ 3558 int i; 3559 BtShared *pBt = pCur->pBt; 3560 sqlite3BtreeEnter(pBtree); 3561 sqlite3BtreeClearCursor(pCur); 3562 if( pCur->pPrev ){ 3563 pCur->pPrev->pNext = pCur->pNext; 3564 }else{ 3565 pBt->pCursor = pCur->pNext; 3566 } 3567 if( pCur->pNext ){ 3568 pCur->pNext->pPrev = pCur->pPrev; 3569 } 3570 for(i=0; i<=pCur->iPage; i++){ 3571 releasePage(pCur->apPage[i]); 3572 } 3573 unlockBtreeIfUnused(pBt); 3574 invalidateOverflowCache(pCur); 3575 /* sqlite3_free(pCur); */ 3576 sqlite3BtreeLeave(pBtree); 3577 } 3578 return SQLITE_OK; 3579 } 3580 3581 /* 3582 ** Make sure the BtCursor* given in the argument has a valid 3583 ** BtCursor.info structure. If it is not already valid, call 3584 ** btreeParseCell() to fill it in. 3585 ** 3586 ** BtCursor.info is a cache of the information in the current cell. 3587 ** Using this cache reduces the number of calls to btreeParseCell(). 3588 ** 3589 ** 2007-06-25: There is a bug in some versions of MSVC that cause the 3590 ** compiler to crash when getCellInfo() is implemented as a macro. 3591 ** But there is a measureable speed advantage to using the macro on gcc 3592 ** (when less compiler optimizations like -Os or -O0 are used and the 3593 ** compiler is not doing agressive inlining.) So we use a real function 3594 ** for MSVC and a macro for everything else. Ticket #2457. 3595 */ 3596 #ifndef NDEBUG 3597 static void assertCellInfo(BtCursor *pCur){ 3598 CellInfo info; 3599 int iPage = pCur->iPage; 3600 memset(&info, 0, sizeof(info)); 3601 btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info); 3602 assert( memcmp(&info, &pCur->info, sizeof(info))==0 ); 3603 } 3604 #else 3605 #define assertCellInfo(x) 3606 #endif 3607 #ifdef _MSC_VER 3608 /* Use a real function in MSVC to work around bugs in that compiler. */ 3609 static void getCellInfo(BtCursor *pCur){ 3610 if( pCur->info.nSize==0 ){ 3611 int iPage = pCur->iPage; 3612 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); 3613 pCur->validNKey = 1; 3614 }else{ 3615 assertCellInfo(pCur); 3616 } 3617 } 3618 #else /* if not _MSC_VER */ 3619 /* Use a macro in all other compilers so that the function is inlined */ 3620 #define getCellInfo(pCur) \ 3621 if( pCur->info.nSize==0 ){ \ 3622 int iPage = pCur->iPage; \ 3623 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \ 3624 pCur->validNKey = 1; \ 3625 }else{ \ 3626 assertCellInfo(pCur); \ 3627 } 3628 #endif /* _MSC_VER */ 3629 3630 #ifndef NDEBUG /* The next routine used only within assert() statements */ 3631 /* 3632 ** Return true if the given BtCursor is valid. A valid cursor is one 3633 ** that is currently pointing to a row in a (non-empty) table. 3634 ** This is a verification routine is used only within assert() statements. 3635 */ 3636 int sqlite3BtreeCursorIsValid(BtCursor *pCur){ 3637 return pCur && pCur->eState==CURSOR_VALID; 3638 } 3639 #endif /* NDEBUG */ 3640 3641 /* 3642 ** Set *pSize to the size of the buffer needed to hold the value of 3643 ** the key for the current entry. If the cursor is not pointing 3644 ** to a valid entry, *pSize is set to 0. 3645 ** 3646 ** For a table with the INTKEY flag set, this routine returns the key 3647 ** itself, not the number of bytes in the key. 3648 ** 3649 ** The caller must position the cursor prior to invoking this routine. 3650 ** 3651 ** This routine cannot fail. It always returns SQLITE_OK. 3652 */ 3653 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ 3654 assert( cursorHoldsMutex(pCur) ); 3655 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); 3656 if( pCur->eState!=CURSOR_VALID ){ 3657 *pSize = 0; 3658 }else{ 3659 getCellInfo(pCur); 3660 *pSize = pCur->info.nKey; 3661 } 3662 return SQLITE_OK; 3663 } 3664 3665 /* 3666 ** Set *pSize to the number of bytes of data in the entry the 3667 ** cursor currently points to. 3668 ** 3669 ** The caller must guarantee that the cursor is pointing to a non-NULL 3670 ** valid entry. In other words, the calling procedure must guarantee 3671 ** that the cursor has Cursor.eState==CURSOR_VALID. 3672 ** 3673 ** Failure is not possible. This function always returns SQLITE_OK. 3674 ** It might just as well be a procedure (returning void) but we continue 3675 ** to return an integer result code for historical reasons. 3676 */ 3677 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ 3678 assert( cursorHoldsMutex(pCur) ); 3679 assert( pCur->eState==CURSOR_VALID ); 3680 getCellInfo(pCur); 3681 *pSize = pCur->info.nData; 3682 return SQLITE_OK; 3683 } 3684 3685 /* 3686 ** Given the page number of an overflow page in the database (parameter 3687 ** ovfl), this function finds the page number of the next page in the 3688 ** linked list of overflow pages. If possible, it uses the auto-vacuum 3689 ** pointer-map data instead of reading the content of page ovfl to do so. 3690 ** 3691 ** If an error occurs an SQLite error code is returned. Otherwise: 3692 ** 3693 ** The page number of the next overflow page in the linked list is 3694 ** written to *pPgnoNext. If page ovfl is the last page in its linked 3695 ** list, *pPgnoNext is set to zero. 3696 ** 3697 ** If ppPage is not NULL, and a reference to the MemPage object corresponding 3698 ** to page number pOvfl was obtained, then *ppPage is set to point to that 3699 ** reference. It is the responsibility of the caller to call releasePage() 3700 ** on *ppPage to free the reference. In no reference was obtained (because 3701 ** the pointer-map was used to obtain the value for *pPgnoNext), then 3702 ** *ppPage is set to zero. 3703 */ 3704 static int getOverflowPage( 3705 BtShared *pBt, /* The database file */ 3706 Pgno ovfl, /* Current overflow page number */ 3707 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */ 3708 Pgno *pPgnoNext /* OUT: Next overflow page number */ 3709 ){ 3710 Pgno next = 0; 3711 MemPage *pPage = 0; 3712 int rc = SQLITE_OK; 3713 3714 assert( sqlite3_mutex_held(pBt->mutex) ); 3715 assert(pPgnoNext); 3716 3717 #ifndef SQLITE_OMIT_AUTOVACUUM 3718 /* Try to find the next page in the overflow list using the 3719 ** autovacuum pointer-map pages. Guess that the next page in 3720 ** the overflow list is page number (ovfl+1). If that guess turns 3721 ** out to be wrong, fall back to loading the data of page 3722 ** number ovfl to determine the next page number. 3723 */ 3724 if( pBt->autoVacuum ){ 3725 Pgno pgno; 3726 Pgno iGuess = ovfl+1; 3727 u8 eType; 3728 3729 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 3730 iGuess++; 3731 } 3732 3733 if( iGuess<=btreePagecount(pBt) ){ 3734 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 3735 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 3736 next = iGuess; 3737 rc = SQLITE_DONE; 3738 } 3739 } 3740 } 3741 #endif 3742 3743 assert( next==0 || rc==SQLITE_DONE ); 3744 if( rc==SQLITE_OK ){ 3745 rc = btreeGetPage(pBt, ovfl, &pPage, 0); 3746 assert( rc==SQLITE_OK || pPage==0 ); 3747 if( rc==SQLITE_OK ){ 3748 next = get4byte(pPage->aData); 3749 } 3750 } 3751 3752 *pPgnoNext = next; 3753 if( ppPage ){ 3754 *ppPage = pPage; 3755 }else{ 3756 releasePage(pPage); 3757 } 3758 return (rc==SQLITE_DONE ? SQLITE_OK : rc); 3759 } 3760 3761 /* 3762 ** Copy data from a buffer to a page, or from a page to a buffer. 3763 ** 3764 ** pPayload is a pointer to data stored on database page pDbPage. 3765 ** If argument eOp is false, then nByte bytes of data are copied 3766 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, 3767 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 3768 ** of data are copied from the buffer pBuf to pPayload. 3769 ** 3770 ** SQLITE_OK is returned on success, otherwise an error code. 3771 */ 3772 static int copyPayload( 3773 void *pPayload, /* Pointer to page data */ 3774 void *pBuf, /* Pointer to buffer */ 3775 int nByte, /* Number of bytes to copy */ 3776 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 3777 DbPage *pDbPage /* Page containing pPayload */ 3778 ){ 3779 if( eOp ){ 3780 /* Copy data from buffer to page (a write operation) */ 3781 int rc = sqlite3PagerWrite(pDbPage); 3782 if( rc!=SQLITE_OK ){ 3783 return rc; 3784 } 3785 memcpy(pPayload, pBuf, nByte); 3786 }else{ 3787 /* Copy data from page to buffer (a read operation) */ 3788 memcpy(pBuf, pPayload, nByte); 3789 } 3790 return SQLITE_OK; 3791 } 3792 3793 /* 3794 ** This function is used to read or overwrite payload information 3795 ** for the entry that the pCur cursor is pointing to. If the eOp 3796 ** parameter is 0, this is a read operation (data copied into 3797 ** buffer pBuf). If it is non-zero, a write (data copied from 3798 ** buffer pBuf). 3799 ** 3800 ** A total of "amt" bytes are read or written beginning at "offset". 3801 ** Data is read to or from the buffer pBuf. 3802 ** 3803 ** The content being read or written might appear on the main page 3804 ** or be scattered out on multiple overflow pages. 3805 ** 3806 ** If the BtCursor.isIncrblobHandle flag is set, and the current 3807 ** cursor entry uses one or more overflow pages, this function 3808 ** allocates space for and lazily popluates the overflow page-list 3809 ** cache array (BtCursor.aOverflow). Subsequent calls use this 3810 ** cache to make seeking to the supplied offset more efficient. 3811 ** 3812 ** Once an overflow page-list cache has been allocated, it may be 3813 ** invalidated if some other cursor writes to the same table, or if 3814 ** the cursor is moved to a different row. Additionally, in auto-vacuum 3815 ** mode, the following events may invalidate an overflow page-list cache. 3816 ** 3817 ** * An incremental vacuum, 3818 ** * A commit in auto_vacuum="full" mode, 3819 ** * Creating a table (may require moving an overflow page). 3820 */ 3821 static int accessPayload( 3822 BtCursor *pCur, /* Cursor pointing to entry to read from */ 3823 u32 offset, /* Begin reading this far into payload */ 3824 u32 amt, /* Read this many bytes */ 3825 unsigned char *pBuf, /* Write the bytes into this buffer */ 3826 int eOp /* zero to read. non-zero to write. */ 3827 ){ 3828 unsigned char *aPayload; 3829 int rc = SQLITE_OK; 3830 u32 nKey; 3831 int iIdx = 0; 3832 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */ 3833 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */ 3834 3835 assert( pPage ); 3836 assert( pCur->eState==CURSOR_VALID ); 3837 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 3838 assert( cursorHoldsMutex(pCur) ); 3839 3840 getCellInfo(pCur); 3841 aPayload = pCur->info.pCell + pCur->info.nHeader; 3842 nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey); 3843 3844 if( NEVER(offset+amt > nKey+pCur->info.nData) 3845 || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] 3846 ){ 3847 /* Trying to read or write past the end of the data is an error */ 3848 return SQLITE_CORRUPT_BKPT; 3849 } 3850 3851 /* Check if data must be read/written to/from the btree page itself. */ 3852 if( offset<pCur->info.nLocal ){ 3853 int a = amt; 3854 if( a+offset>pCur->info.nLocal ){ 3855 a = pCur->info.nLocal - offset; 3856 } 3857 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); 3858 offset = 0; 3859 pBuf += a; 3860 amt -= a; 3861 }else{ 3862 offset -= pCur->info.nLocal; 3863 } 3864 3865 if( rc==SQLITE_OK && amt>0 ){ 3866 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ 3867 Pgno nextPage; 3868 3869 nextPage = get4byte(&aPayload[pCur->info.nLocal]); 3870 3871 #ifndef SQLITE_OMIT_INCRBLOB 3872 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[] 3873 ** has not been allocated, allocate it now. The array is sized at 3874 ** one entry for each overflow page in the overflow chain. The 3875 ** page number of the first overflow page is stored in aOverflow[0], 3876 ** etc. A value of 0 in the aOverflow[] array means "not yet known" 3877 ** (the cache is lazily populated). 3878 */ 3879 if( pCur->isIncrblobHandle && !pCur->aOverflow ){ 3880 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; 3881 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl); 3882 /* nOvfl is always positive. If it were zero, fetchPayload would have 3883 ** been used instead of this routine. */ 3884 if( ALWAYS(nOvfl) && !pCur->aOverflow ){ 3885 rc = SQLITE_NOMEM; 3886 } 3887 } 3888 3889 /* If the overflow page-list cache has been allocated and the 3890 ** entry for the first required overflow page is valid, skip 3891 ** directly to it. 3892 */ 3893 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){ 3894 iIdx = (offset/ovflSize); 3895 nextPage = pCur->aOverflow[iIdx]; 3896 offset = (offset%ovflSize); 3897 } 3898 #endif 3899 3900 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){ 3901 3902 #ifndef SQLITE_OMIT_INCRBLOB 3903 /* If required, populate the overflow page-list cache. */ 3904 if( pCur->aOverflow ){ 3905 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage); 3906 pCur->aOverflow[iIdx] = nextPage; 3907 } 3908 #endif 3909 3910 if( offset>=ovflSize ){ 3911 /* The only reason to read this page is to obtain the page 3912 ** number for the next page in the overflow chain. The page 3913 ** data is not required. So first try to lookup the overflow 3914 ** page-list cache, if any, then fall back to the getOverflowPage() 3915 ** function. 3916 */ 3917 #ifndef SQLITE_OMIT_INCRBLOB 3918 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){ 3919 nextPage = pCur->aOverflow[iIdx+1]; 3920 } else 3921 #endif 3922 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); 3923 offset -= ovflSize; 3924 }else{ 3925 /* Need to read this page properly. It contains some of the 3926 ** range of data that is being read (eOp==0) or written (eOp!=0). 3927 */ 3928 DbPage *pDbPage; 3929 int a = amt; 3930 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage); 3931 if( rc==SQLITE_OK ){ 3932 aPayload = sqlite3PagerGetData(pDbPage); 3933 nextPage = get4byte(aPayload); 3934 if( a + offset > ovflSize ){ 3935 a = ovflSize - offset; 3936 } 3937 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); 3938 sqlite3PagerUnref(pDbPage); 3939 offset = 0; 3940 amt -= a; 3941 pBuf += a; 3942 } 3943 } 3944 } 3945 } 3946 3947 if( rc==SQLITE_OK && amt>0 ){ 3948 return SQLITE_CORRUPT_BKPT; 3949 } 3950 return rc; 3951 } 3952 3953 /* 3954 ** Read part of the key associated with cursor pCur. Exactly 3955 ** "amt" bytes will be transfered into pBuf[]. The transfer 3956 ** begins at "offset". 3957 ** 3958 ** The caller must ensure that pCur is pointing to a valid row 3959 ** in the table. 3960 ** 3961 ** Return SQLITE_OK on success or an error code if anything goes 3962 ** wrong. An error is returned if "offset+amt" is larger than 3963 ** the available payload. 3964 */ 3965 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3966 assert( cursorHoldsMutex(pCur) ); 3967 assert( pCur->eState==CURSOR_VALID ); 3968 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); 3969 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 3970 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); 3971 } 3972 3973 /* 3974 ** Read part of the data associated with cursor pCur. Exactly 3975 ** "amt" bytes will be transfered into pBuf[]. The transfer 3976 ** begins at "offset". 3977 ** 3978 ** Return SQLITE_OK on success or an error code if anything goes 3979 ** wrong. An error is returned if "offset+amt" is larger than 3980 ** the available payload. 3981 */ 3982 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3983 int rc; 3984 3985 #ifndef SQLITE_OMIT_INCRBLOB 3986 if ( pCur->eState==CURSOR_INVALID ){ 3987 return SQLITE_ABORT; 3988 } 3989 #endif 3990 3991 assert( cursorHoldsMutex(pCur) ); 3992 rc = restoreCursorPosition(pCur); 3993 if( rc==SQLITE_OK ){ 3994 assert( pCur->eState==CURSOR_VALID ); 3995 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); 3996 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 3997 rc = accessPayload(pCur, offset, amt, pBuf, 0); 3998 } 3999 return rc; 4000 } 4001 4002 /* 4003 ** Return a pointer to payload information from the entry that the 4004 ** pCur cursor is pointing to. The pointer is to the beginning of 4005 ** the key if skipKey==0 and it points to the beginning of data if 4006 ** skipKey==1. The number of bytes of available key/data is written 4007 ** into *pAmt. If *pAmt==0, then the value returned will not be 4008 ** a valid pointer. 4009 ** 4010 ** This routine is an optimization. It is common for the entire key 4011 ** and data to fit on the local page and for there to be no overflow 4012 ** pages. When that is so, this routine can be used to access the 4013 ** key and data without making a copy. If the key and/or data spills 4014 ** onto overflow pages, then accessPayload() must be used to reassemble 4015 ** the key/data and copy it into a preallocated buffer. 4016 ** 4017 ** The pointer returned by this routine looks directly into the cached 4018 ** page of the database. The data might change or move the next time 4019 ** any btree routine is called. 4020 */ 4021 static const unsigned char *fetchPayload( 4022 BtCursor *pCur, /* Cursor pointing to entry to read from */ 4023 int *pAmt, /* Write the number of available bytes here */ 4024 int skipKey /* read beginning at data if this is true */ 4025 ){ 4026 unsigned char *aPayload; 4027 MemPage *pPage; 4028 u32 nKey; 4029 u32 nLocal; 4030 4031 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]); 4032 assert( pCur->eState==CURSOR_VALID ); 4033 assert( cursorHoldsMutex(pCur) ); 4034 pPage = pCur->apPage[pCur->iPage]; 4035 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 4036 if( NEVER(pCur->info.nSize==0) ){ 4037 btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage], 4038 &pCur->info); 4039 } 4040 aPayload = pCur->info.pCell; 4041 aPayload += pCur->info.nHeader; 4042 if( pPage->intKey ){ 4043 nKey = 0; 4044 }else{ 4045 nKey = (int)pCur->info.nKey; 4046 } 4047 if( skipKey ){ 4048 aPayload += nKey; 4049 nLocal = pCur->info.nLocal - nKey; 4050 }else{ 4051 nLocal = pCur->info.nLocal; 4052 assert( nLocal<=nKey ); 4053 } 4054 *pAmt = nLocal; 4055 return aPayload; 4056 } 4057 4058 4059 /* 4060 ** For the entry that cursor pCur is point to, return as 4061 ** many bytes of the key or data as are available on the local 4062 ** b-tree page. Write the number of available bytes into *pAmt. 4063 ** 4064 ** The pointer returned is ephemeral. The key/data may move 4065 ** or be destroyed on the next call to any Btree routine, 4066 ** including calls from other threads against the same cache. 4067 ** Hence, a mutex on the BtShared should be held prior to calling 4068 ** this routine. 4069 ** 4070 ** These routines is used to get quick access to key and data 4071 ** in the common case where no overflow pages are used. 4072 */ 4073 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){ 4074 const void *p = 0; 4075 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4076 assert( cursorHoldsMutex(pCur) ); 4077 if( ALWAYS(pCur->eState==CURSOR_VALID) ){ 4078 p = (const void*)fetchPayload(pCur, pAmt, 0); 4079 } 4080 return p; 4081 } 4082 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){ 4083 const void *p = 0; 4084 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4085 assert( cursorHoldsMutex(pCur) ); 4086 if( ALWAYS(pCur->eState==CURSOR_VALID) ){ 4087 p = (const void*)fetchPayload(pCur, pAmt, 1); 4088 } 4089 return p; 4090 } 4091 4092 4093 /* 4094 ** Move the cursor down to a new child page. The newPgno argument is the 4095 ** page number of the child page to move to. 4096 ** 4097 ** This function returns SQLITE_CORRUPT if the page-header flags field of 4098 ** the new child page does not match the flags field of the parent (i.e. 4099 ** if an intkey page appears to be the parent of a non-intkey page, or 4100 ** vice-versa). 4101 */ 4102 static int moveToChild(BtCursor *pCur, u32 newPgno){ 4103 int rc; 4104 int i = pCur->iPage; 4105 MemPage *pNewPage; 4106 BtShared *pBt = pCur->pBt; 4107 4108 assert( cursorHoldsMutex(pCur) ); 4109 assert( pCur->eState==CURSOR_VALID ); 4110 assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); 4111 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 4112 return SQLITE_CORRUPT_BKPT; 4113 } 4114 rc = getAndInitPage(pBt, newPgno, &pNewPage); 4115 if( rc ) return rc; 4116 pCur->apPage[i+1] = pNewPage; 4117 pCur->aiIdx[i+1] = 0; 4118 pCur->iPage++; 4119 4120 pCur->info.nSize = 0; 4121 pCur->validNKey = 0; 4122 if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){ 4123 return SQLITE_CORRUPT_BKPT; 4124 } 4125 return SQLITE_OK; 4126 } 4127 4128 #ifndef NDEBUG 4129 /* 4130 ** Page pParent is an internal (non-leaf) tree page. This function 4131 ** asserts that page number iChild is the left-child if the iIdx'th 4132 ** cell in page pParent. Or, if iIdx is equal to the total number of 4133 ** cells in pParent, that page number iChild is the right-child of 4134 ** the page. 4135 */ 4136 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ 4137 assert( iIdx<=pParent->nCell ); 4138 if( iIdx==pParent->nCell ){ 4139 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); 4140 }else{ 4141 assert( get4byte(findCell(pParent, iIdx))==iChild ); 4142 } 4143 } 4144 #else 4145 # define assertParentIndex(x,y,z) 4146 #endif 4147 4148 /* 4149 ** Move the cursor up to the parent page. 4150 ** 4151 ** pCur->idx is set to the cell index that contains the pointer 4152 ** to the page we are coming from. If we are coming from the 4153 ** right-most child page then pCur->idx is set to one more than 4154 ** the largest cell index. 4155 */ 4156 static void moveToParent(BtCursor *pCur){ 4157 assert( cursorHoldsMutex(pCur) ); 4158 assert( pCur->eState==CURSOR_VALID ); 4159 assert( pCur->iPage>0 ); 4160 assert( pCur->apPage[pCur->iPage] ); 4161 assertParentIndex( 4162 pCur->apPage[pCur->iPage-1], 4163 pCur->aiIdx[pCur->iPage-1], 4164 pCur->apPage[pCur->iPage]->pgno 4165 ); 4166 releasePage(pCur->apPage[pCur->iPage]); 4167 pCur->iPage--; 4168 pCur->info.nSize = 0; 4169 pCur->validNKey = 0; 4170 } 4171 4172 /* 4173 ** Move the cursor to point to the root page of its b-tree structure. 4174 ** 4175 ** If the table has a virtual root page, then the cursor is moved to point 4176 ** to the virtual root page instead of the actual root page. A table has a 4177 ** virtual root page when the actual root page contains no cells and a 4178 ** single child page. This can only happen with the table rooted at page 1. 4179 ** 4180 ** If the b-tree structure is empty, the cursor state is set to 4181 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first 4182 ** cell located on the root (or virtual root) page and the cursor state 4183 ** is set to CURSOR_VALID. 4184 ** 4185 ** If this function returns successfully, it may be assumed that the 4186 ** page-header flags indicate that the [virtual] root-page is the expected 4187 ** kind of b-tree page (i.e. if when opening the cursor the caller did not 4188 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, 4189 ** indicating a table b-tree, or if the caller did specify a KeyInfo 4190 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index 4191 ** b-tree). 4192 */ 4193 static int moveToRoot(BtCursor *pCur){ 4194 MemPage *pRoot; 4195 int rc = SQLITE_OK; 4196 Btree *p = pCur->pBtree; 4197 BtShared *pBt = p->pBt; 4198 4199 assert( cursorHoldsMutex(pCur) ); 4200 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 4201 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 4202 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 4203 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 4204 if( pCur->eState==CURSOR_FAULT ){ 4205 assert( pCur->skipNext!=SQLITE_OK ); 4206 return pCur->skipNext; 4207 } 4208 sqlite3BtreeClearCursor(pCur); 4209 } 4210 4211 if( pCur->iPage>=0 ){ 4212 int i; 4213 for(i=1; i<=pCur->iPage; i++){ 4214 releasePage(pCur->apPage[i]); 4215 } 4216 pCur->iPage = 0; 4217 }else{ 4218 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]); 4219 if( rc!=SQLITE_OK ){ 4220 pCur->eState = CURSOR_INVALID; 4221 return rc; 4222 } 4223 pCur->iPage = 0; 4224 4225 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor 4226 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is 4227 ** NULL, the caller expects a table b-tree. If this is not the case, 4228 ** return an SQLITE_CORRUPT error. */ 4229 assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 ); 4230 if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){ 4231 return SQLITE_CORRUPT_BKPT; 4232 } 4233 } 4234 4235 /* Assert that the root page is of the correct type. This must be the 4236 ** case as the call to this function that loaded the root-page (either 4237 ** this call or a previous invocation) would have detected corruption 4238 ** if the assumption were not true, and it is not possible for the flags 4239 ** byte to have been modified while this cursor is holding a reference 4240 ** to the page. */ 4241 pRoot = pCur->apPage[0]; 4242 assert( pRoot->pgno==pCur->pgnoRoot ); 4243 assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey ); 4244 4245 pCur->aiIdx[0] = 0; 4246 pCur->info.nSize = 0; 4247 pCur->atLast = 0; 4248 pCur->validNKey = 0; 4249 4250 if( pRoot->nCell==0 && !pRoot->leaf ){ 4251 Pgno subpage; 4252 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT; 4253 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 4254 pCur->eState = CURSOR_VALID; 4255 rc = moveToChild(pCur, subpage); 4256 }else{ 4257 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID); 4258 } 4259 return rc; 4260 } 4261 4262 /* 4263 ** Move the cursor down to the left-most leaf entry beneath the 4264 ** entry to which it is currently pointing. 4265 ** 4266 ** The left-most leaf is the one with the smallest key - the first 4267 ** in ascending order. 4268 */ 4269 static int moveToLeftmost(BtCursor *pCur){ 4270 Pgno pgno; 4271 int rc = SQLITE_OK; 4272 MemPage *pPage; 4273 4274 assert( cursorHoldsMutex(pCur) ); 4275 assert( pCur->eState==CURSOR_VALID ); 4276 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ 4277 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 4278 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage])); 4279 rc = moveToChild(pCur, pgno); 4280 } 4281 return rc; 4282 } 4283 4284 /* 4285 ** Move the cursor down to the right-most leaf entry beneath the 4286 ** page to which it is currently pointing. Notice the difference 4287 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 4288 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 4289 ** finds the right-most entry beneath the *page*. 4290 ** 4291 ** The right-most entry is the one with the largest key - the last 4292 ** key in ascending order. 4293 */ 4294 static int moveToRightmost(BtCursor *pCur){ 4295 Pgno pgno; 4296 int rc = SQLITE_OK; 4297 MemPage *pPage = 0; 4298 4299 assert( cursorHoldsMutex(pCur) ); 4300 assert( pCur->eState==CURSOR_VALID ); 4301 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ 4302 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 4303 pCur->aiIdx[pCur->iPage] = pPage->nCell; 4304 rc = moveToChild(pCur, pgno); 4305 } 4306 if( rc==SQLITE_OK ){ 4307 pCur->aiIdx[pCur->iPage] = pPage->nCell-1; 4308 pCur->info.nSize = 0; 4309 pCur->validNKey = 0; 4310 } 4311 return rc; 4312 } 4313 4314 /* Move the cursor to the first entry in the table. Return SQLITE_OK 4315 ** on success. Set *pRes to 0 if the cursor actually points to something 4316 ** or set *pRes to 1 if the table is empty. 4317 */ 4318 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 4319 int rc; 4320 4321 assert( cursorHoldsMutex(pCur) ); 4322 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4323 rc = moveToRoot(pCur); 4324 if( rc==SQLITE_OK ){ 4325 if( pCur->eState==CURSOR_INVALID ){ 4326 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 4327 *pRes = 1; 4328 }else{ 4329 assert( pCur->apPage[pCur->iPage]->nCell>0 ); 4330 *pRes = 0; 4331 rc = moveToLeftmost(pCur); 4332 } 4333 } 4334 return rc; 4335 } 4336 4337 /* Move the cursor to the last entry in the table. Return SQLITE_OK 4338 ** on success. Set *pRes to 0 if the cursor actually points to something 4339 ** or set *pRes to 1 if the table is empty. 4340 */ 4341 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 4342 int rc; 4343 4344 assert( cursorHoldsMutex(pCur) ); 4345 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4346 4347 /* If the cursor already points to the last entry, this is a no-op. */ 4348 if( CURSOR_VALID==pCur->eState && pCur->atLast ){ 4349 #ifdef SQLITE_DEBUG 4350 /* This block serves to assert() that the cursor really does point 4351 ** to the last entry in the b-tree. */ 4352 int ii; 4353 for(ii=0; ii<pCur->iPage; ii++){ 4354 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell ); 4355 } 4356 assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 ); 4357 assert( pCur->apPage[pCur->iPage]->leaf ); 4358 #endif 4359 return SQLITE_OK; 4360 } 4361 4362 rc = moveToRoot(pCur); 4363 if( rc==SQLITE_OK ){ 4364 if( CURSOR_INVALID==pCur->eState ){ 4365 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 4366 *pRes = 1; 4367 }else{ 4368 assert( pCur->eState==CURSOR_VALID ); 4369 *pRes = 0; 4370 rc = moveToRightmost(pCur); 4371 pCur->atLast = rc==SQLITE_OK ?1:0; 4372 } 4373 } 4374 return rc; 4375 } 4376 4377 /* Move the cursor so that it points to an entry near the key 4378 ** specified by pIdxKey or intKey. Return a success code. 4379 ** 4380 ** For INTKEY tables, the intKey parameter is used. pIdxKey 4381 ** must be NULL. For index tables, pIdxKey is used and intKey 4382 ** is ignored. 4383 ** 4384 ** If an exact match is not found, then the cursor is always 4385 ** left pointing at a leaf page which would hold the entry if it 4386 ** were present. The cursor might point to an entry that comes 4387 ** before or after the key. 4388 ** 4389 ** An integer is written into *pRes which is the result of 4390 ** comparing the key with the entry to which the cursor is 4391 ** pointing. The meaning of the integer written into 4392 ** *pRes is as follows: 4393 ** 4394 ** *pRes<0 The cursor is left pointing at an entry that 4395 ** is smaller than intKey/pIdxKey or if the table is empty 4396 ** and the cursor is therefore left point to nothing. 4397 ** 4398 ** *pRes==0 The cursor is left pointing at an entry that 4399 ** exactly matches intKey/pIdxKey. 4400 ** 4401 ** *pRes>0 The cursor is left pointing at an entry that 4402 ** is larger than intKey/pIdxKey. 4403 ** 4404 */ 4405 int sqlite3BtreeMovetoUnpacked( 4406 BtCursor *pCur, /* The cursor to be moved */ 4407 UnpackedRecord *pIdxKey, /* Unpacked index key */ 4408 i64 intKey, /* The table key */ 4409 int biasRight, /* If true, bias the search to the high end */ 4410 int *pRes /* Write search results here */ 4411 ){ 4412 int rc; 4413 4414 assert( cursorHoldsMutex(pCur) ); 4415 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4416 assert( pRes ); 4417 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) ); 4418 4419 /* If the cursor is already positioned at the point we are trying 4420 ** to move to, then just return without doing any work */ 4421 if( pCur->eState==CURSOR_VALID && pCur->validNKey 4422 && pCur->apPage[0]->intKey 4423 ){ 4424 if( pCur->info.nKey==intKey ){ 4425 *pRes = 0; 4426 return SQLITE_OK; 4427 } 4428 if( pCur->atLast && pCur->info.nKey<intKey ){ 4429 *pRes = -1; 4430 return SQLITE_OK; 4431 } 4432 } 4433 4434 rc = moveToRoot(pCur); 4435 if( rc ){ 4436 return rc; 4437 } 4438 assert( pCur->apPage[pCur->iPage] ); 4439 assert( pCur->apPage[pCur->iPage]->isInit ); 4440 assert( pCur->apPage[pCur->iPage]->nCell>0 || pCur->eState==CURSOR_INVALID ); 4441 if( pCur->eState==CURSOR_INVALID ){ 4442 *pRes = -1; 4443 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 4444 return SQLITE_OK; 4445 } 4446 assert( pCur->apPage[0]->intKey || pIdxKey ); 4447 for(;;){ 4448 int lwr, upr; 4449 Pgno chldPg; 4450 MemPage *pPage = pCur->apPage[pCur->iPage]; 4451 int c; 4452 4453 /* pPage->nCell must be greater than zero. If this is the root-page 4454 ** the cursor would have been INVALID above and this for(;;) loop 4455 ** not run. If this is not the root-page, then the moveToChild() routine 4456 ** would have already detected db corruption. Similarly, pPage must 4457 ** be the right kind (index or table) of b-tree page. Otherwise 4458 ** a moveToChild() or moveToRoot() call would have detected corruption. */ 4459 assert( pPage->nCell>0 ); 4460 assert( pPage->intKey==(pIdxKey==0) ); 4461 lwr = 0; 4462 upr = pPage->nCell-1; 4463 if( biasRight ){ 4464 pCur->aiIdx[pCur->iPage] = (u16)upr; 4465 }else{ 4466 pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2); 4467 } 4468 for(;;){ 4469 int idx = pCur->aiIdx[pCur->iPage]; /* Index of current cell in pPage */ 4470 u8 *pCell; /* Pointer to current cell in pPage */ 4471 4472 pCur->info.nSize = 0; 4473 pCell = findCell(pPage, idx) + pPage->childPtrSize; 4474 if( pPage->intKey ){ 4475 i64 nCellKey; 4476 if( pPage->hasData ){ 4477 u32 dummy; 4478 pCell += getVarint32(pCell, dummy); 4479 } 4480 getVarint(pCell, (u64*)&nCellKey); 4481 if( nCellKey==intKey ){ 4482 c = 0; 4483 }else if( nCellKey<intKey ){ 4484 c = -1; 4485 }else{ 4486 assert( nCellKey>intKey ); 4487 c = +1; 4488 } 4489 pCur->validNKey = 1; 4490 pCur->info.nKey = nCellKey; 4491 }else{ 4492 /* The maximum supported page-size is 65536 bytes. This means that 4493 ** the maximum number of record bytes stored on an index B-Tree 4494 ** page is less than 16384 bytes and may be stored as a 2-byte 4495 ** varint. This information is used to attempt to avoid parsing 4496 ** the entire cell by checking for the cases where the record is 4497 ** stored entirely within the b-tree page by inspecting the first 4498 ** 2 bytes of the cell. 4499 */ 4500 int nCell = pCell[0]; 4501 if( !(nCell & 0x80) && nCell<=pPage->maxLocal ){ 4502 /* This branch runs if the record-size field of the cell is a 4503 ** single byte varint and the record fits entirely on the main 4504 ** b-tree page. */ 4505 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 4506 }else if( !(pCell[1] & 0x80) 4507 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 4508 ){ 4509 /* The record-size field is a 2 byte varint and the record 4510 ** fits entirely on the main b-tree page. */ 4511 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 4512 }else{ 4513 /* The record flows over onto one or more overflow pages. In 4514 ** this case the whole cell needs to be parsed, a buffer allocated 4515 ** and accessPayload() used to retrieve the record into the 4516 ** buffer before VdbeRecordCompare() can be called. */ 4517 void *pCellKey; 4518 u8 * const pCellBody = pCell - pPage->childPtrSize; 4519 btreeParseCellPtr(pPage, pCellBody, &pCur->info); 4520 nCell = (int)pCur->info.nKey; 4521 pCellKey = sqlite3Malloc( nCell ); 4522 if( pCellKey==0 ){ 4523 rc = SQLITE_NOMEM; 4524 goto moveto_finish; 4525 } 4526 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); 4527 if( rc ){ 4528 sqlite3_free(pCellKey); 4529 goto moveto_finish; 4530 } 4531 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); 4532 sqlite3_free(pCellKey); 4533 } 4534 } 4535 if( c==0 ){ 4536 if( pPage->intKey && !pPage->leaf ){ 4537 lwr = idx; 4538 upr = lwr - 1; 4539 break; 4540 }else{ 4541 *pRes = 0; 4542 rc = SQLITE_OK; 4543 goto moveto_finish; 4544 } 4545 } 4546 if( c<0 ){ 4547 lwr = idx+1; 4548 }else{ 4549 upr = idx-1; 4550 } 4551 if( lwr>upr ){ 4552 break; 4553 } 4554 pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2); 4555 } 4556 assert( lwr==upr+1 ); 4557 assert( pPage->isInit ); 4558 if( pPage->leaf ){ 4559 chldPg = 0; 4560 }else if( lwr>=pPage->nCell ){ 4561 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 4562 }else{ 4563 chldPg = get4byte(findCell(pPage, lwr)); 4564 } 4565 if( chldPg==0 ){ 4566 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 4567 *pRes = c; 4568 rc = SQLITE_OK; 4569 goto moveto_finish; 4570 } 4571 pCur->aiIdx[pCur->iPage] = (u16)lwr; 4572 pCur->info.nSize = 0; 4573 pCur->validNKey = 0; 4574 rc = moveToChild(pCur, chldPg); 4575 if( rc ) goto moveto_finish; 4576 } 4577 moveto_finish: 4578 return rc; 4579 } 4580 4581 4582 /* 4583 ** Return TRUE if the cursor is not pointing at an entry of the table. 4584 ** 4585 ** TRUE will be returned after a call to sqlite3BtreeNext() moves 4586 ** past the last entry in the table or sqlite3BtreePrev() moves past 4587 ** the first entry. TRUE is also returned if the table is empty. 4588 */ 4589 int sqlite3BtreeEof(BtCursor *pCur){ 4590 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 4591 ** have been deleted? This API will need to change to return an error code 4592 ** as well as the boolean result value. 4593 */ 4594 return (CURSOR_VALID!=pCur->eState); 4595 } 4596 4597 /* 4598 ** Advance the cursor to the next entry in the database. If 4599 ** successful then set *pRes=0. If the cursor 4600 ** was already pointing to the last entry in the database before 4601 ** this routine was called, then set *pRes=1. 4602 */ 4603 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ 4604 int rc; 4605 int idx; 4606 MemPage *pPage; 4607 4608 assert( cursorHoldsMutex(pCur) ); 4609 rc = restoreCursorPosition(pCur); 4610 if( rc!=SQLITE_OK ){ 4611 return rc; 4612 } 4613 assert( pRes!=0 ); 4614 if( CURSOR_INVALID==pCur->eState ){ 4615 *pRes = 1; 4616 return SQLITE_OK; 4617 } 4618 if( pCur->skipNext>0 ){ 4619 pCur->skipNext = 0; 4620 *pRes = 0; 4621 return SQLITE_OK; 4622 } 4623 pCur->skipNext = 0; 4624 4625 pPage = pCur->apPage[pCur->iPage]; 4626 idx = ++pCur->aiIdx[pCur->iPage]; 4627 assert( pPage->isInit ); 4628 assert( idx<=pPage->nCell ); 4629 4630 pCur->info.nSize = 0; 4631 pCur->validNKey = 0; 4632 if( idx>=pPage->nCell ){ 4633 if( !pPage->leaf ){ 4634 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 4635 if( rc ) return rc; 4636 rc = moveToLeftmost(pCur); 4637 *pRes = 0; 4638 return rc; 4639 } 4640 do{ 4641 if( pCur->iPage==0 ){ 4642 *pRes = 1; 4643 pCur->eState = CURSOR_INVALID; 4644 return SQLITE_OK; 4645 } 4646 moveToParent(pCur); 4647 pPage = pCur->apPage[pCur->iPage]; 4648 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell ); 4649 *pRes = 0; 4650 if( pPage->intKey ){ 4651 rc = sqlite3BtreeNext(pCur, pRes); 4652 }else{ 4653 rc = SQLITE_OK; 4654 } 4655 return rc; 4656 } 4657 *pRes = 0; 4658 if( pPage->leaf ){ 4659 return SQLITE_OK; 4660 } 4661 rc = moveToLeftmost(pCur); 4662 return rc; 4663 } 4664 4665 4666 /* 4667 ** Step the cursor to the back to the previous entry in the database. If 4668 ** successful then set *pRes=0. If the cursor 4669 ** was already pointing to the first entry in the database before 4670 ** this routine was called, then set *pRes=1. 4671 */ 4672 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ 4673 int rc; 4674 MemPage *pPage; 4675 4676 assert( cursorHoldsMutex(pCur) ); 4677 rc = restoreCursorPosition(pCur); 4678 if( rc!=SQLITE_OK ){ 4679 return rc; 4680 } 4681 pCur->atLast = 0; 4682 if( CURSOR_INVALID==pCur->eState ){ 4683 *pRes = 1; 4684 return SQLITE_OK; 4685 } 4686 if( pCur->skipNext<0 ){ 4687 pCur->skipNext = 0; 4688 *pRes = 0; 4689 return SQLITE_OK; 4690 } 4691 pCur->skipNext = 0; 4692 4693 pPage = pCur->apPage[pCur->iPage]; 4694 assert( pPage->isInit ); 4695 if( !pPage->leaf ){ 4696 int idx = pCur->aiIdx[pCur->iPage]; 4697 rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); 4698 if( rc ){ 4699 return rc; 4700 } 4701 rc = moveToRightmost(pCur); 4702 }else{ 4703 while( pCur->aiIdx[pCur->iPage]==0 ){ 4704 if( pCur->iPage==0 ){ 4705 pCur->eState = CURSOR_INVALID; 4706 *pRes = 1; 4707 return SQLITE_OK; 4708 } 4709 moveToParent(pCur); 4710 } 4711 pCur->info.nSize = 0; 4712 pCur->validNKey = 0; 4713 4714 pCur->aiIdx[pCur->iPage]--; 4715 pPage = pCur->apPage[pCur->iPage]; 4716 if( pPage->intKey && !pPage->leaf ){ 4717 rc = sqlite3BtreePrevious(pCur, pRes); 4718 }else{ 4719 rc = SQLITE_OK; 4720 } 4721 } 4722 *pRes = 0; 4723 return rc; 4724 } 4725 4726 /* 4727 ** Allocate a new page from the database file. 4728 ** 4729 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 4730 ** has already been called on the new page.) The new page has also 4731 ** been referenced and the calling routine is responsible for calling 4732 ** sqlite3PagerUnref() on the new page when it is done. 4733 ** 4734 ** SQLITE_OK is returned on success. Any other return value indicates 4735 ** an error. *ppPage and *pPgno are undefined in the event of an error. 4736 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned. 4737 ** 4738 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 4739 ** locate a page close to the page number "nearby". This can be used in an 4740 ** attempt to keep related pages close to each other in the database file, 4741 ** which in turn can make database access faster. 4742 ** 4743 ** If the "exact" parameter is not 0, and the page-number nearby exists 4744 ** anywhere on the free-list, then it is guarenteed to be returned. This 4745 ** is only used by auto-vacuum databases when allocating a new table. 4746 */ 4747 static int allocateBtreePage( 4748 BtShared *pBt, 4749 MemPage **ppPage, 4750 Pgno *pPgno, 4751 Pgno nearby, 4752 u8 exact 4753 ){ 4754 MemPage *pPage1; 4755 int rc; 4756 u32 n; /* Number of pages on the freelist */ 4757 u32 k; /* Number of leaves on the trunk of the freelist */ 4758 MemPage *pTrunk = 0; 4759 MemPage *pPrevTrunk = 0; 4760 Pgno mxPage; /* Total size of the database file */ 4761 4762 assert( sqlite3_mutex_held(pBt->mutex) ); 4763 pPage1 = pBt->pPage1; 4764 mxPage = btreePagecount(pBt); 4765 n = get4byte(&pPage1->aData[36]); 4766 testcase( n==mxPage-1 ); 4767 if( n>=mxPage ){ 4768 return SQLITE_CORRUPT_BKPT; 4769 } 4770 if( n>0 ){ 4771 /* There are pages on the freelist. Reuse one of those pages. */ 4772 Pgno iTrunk; 4773 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 4774 4775 /* If the 'exact' parameter was true and a query of the pointer-map 4776 ** shows that the page 'nearby' is somewhere on the free-list, then 4777 ** the entire-list will be searched for that page. 4778 */ 4779 #ifndef SQLITE_OMIT_AUTOVACUUM 4780 if( exact && nearby<=mxPage ){ 4781 u8 eType; 4782 assert( nearby>0 ); 4783 assert( pBt->autoVacuum ); 4784 rc = ptrmapGet(pBt, nearby, &eType, 0); 4785 if( rc ) return rc; 4786 if( eType==PTRMAP_FREEPAGE ){ 4787 searchList = 1; 4788 } 4789 *pPgno = nearby; 4790 } 4791 #endif 4792 4793 /* Decrement the free-list count by 1. Set iTrunk to the index of the 4794 ** first free-list trunk page. iPrevTrunk is initially 1. 4795 */ 4796 rc = sqlite3PagerWrite(pPage1->pDbPage); 4797 if( rc ) return rc; 4798 put4byte(&pPage1->aData[36], n-1); 4799 4800 /* The code within this loop is run only once if the 'searchList' variable 4801 ** is not true. Otherwise, it runs once for each trunk-page on the 4802 ** free-list until the page 'nearby' is located. 4803 */ 4804 do { 4805 pPrevTrunk = pTrunk; 4806 if( pPrevTrunk ){ 4807 iTrunk = get4byte(&pPrevTrunk->aData[0]); 4808 }else{ 4809 iTrunk = get4byte(&pPage1->aData[32]); 4810 } 4811 testcase( iTrunk==mxPage ); 4812 if( iTrunk>mxPage ){ 4813 rc = SQLITE_CORRUPT_BKPT; 4814 }else{ 4815 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 4816 } 4817 if( rc ){ 4818 pTrunk = 0; 4819 goto end_allocate_page; 4820 } 4821 4822 k = get4byte(&pTrunk->aData[4]); 4823 if( k==0 && !searchList ){ 4824 /* The trunk has no leaves and the list is not being searched. 4825 ** So extract the trunk page itself and use it as the newly 4826 ** allocated page */ 4827 assert( pPrevTrunk==0 ); 4828 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4829 if( rc ){ 4830 goto end_allocate_page; 4831 } 4832 *pPgno = iTrunk; 4833 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 4834 *ppPage = pTrunk; 4835 pTrunk = 0; 4836 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 4837 }else if( k>(u32)(pBt->usableSize/4 - 2) ){ 4838 /* Value of k is out of range. Database corruption */ 4839 rc = SQLITE_CORRUPT_BKPT; 4840 goto end_allocate_page; 4841 #ifndef SQLITE_OMIT_AUTOVACUUM 4842 }else if( searchList && nearby==iTrunk ){ 4843 /* The list is being searched and this trunk page is the page 4844 ** to allocate, regardless of whether it has leaves. 4845 */ 4846 assert( *pPgno==iTrunk ); 4847 *ppPage = pTrunk; 4848 searchList = 0; 4849 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4850 if( rc ){ 4851 goto end_allocate_page; 4852 } 4853 if( k==0 ){ 4854 if( !pPrevTrunk ){ 4855 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 4856 }else{ 4857 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 4858 if( rc!=SQLITE_OK ){ 4859 goto end_allocate_page; 4860 } 4861 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 4862 } 4863 }else{ 4864 /* The trunk page is required by the caller but it contains 4865 ** pointers to free-list leaves. The first leaf becomes a trunk 4866 ** page in this case. 4867 */ 4868 MemPage *pNewTrunk; 4869 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 4870 if( iNewTrunk>mxPage ){ 4871 rc = SQLITE_CORRUPT_BKPT; 4872 goto end_allocate_page; 4873 } 4874 testcase( iNewTrunk==mxPage ); 4875 rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); 4876 if( rc!=SQLITE_OK ){ 4877 goto end_allocate_page; 4878 } 4879 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 4880 if( rc!=SQLITE_OK ){ 4881 releasePage(pNewTrunk); 4882 goto end_allocate_page; 4883 } 4884 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 4885 put4byte(&pNewTrunk->aData[4], k-1); 4886 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 4887 releasePage(pNewTrunk); 4888 if( !pPrevTrunk ){ 4889 assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); 4890 put4byte(&pPage1->aData[32], iNewTrunk); 4891 }else{ 4892 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 4893 if( rc ){ 4894 goto end_allocate_page; 4895 } 4896 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 4897 } 4898 } 4899 pTrunk = 0; 4900 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 4901 #endif 4902 }else if( k>0 ){ 4903 /* Extract a leaf from the trunk */ 4904 u32 closest; 4905 Pgno iPage; 4906 unsigned char *aData = pTrunk->aData; 4907 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4908 if( rc ){ 4909 goto end_allocate_page; 4910 } 4911 if( nearby>0 ){ 4912 u32 i; 4913 int dist; 4914 closest = 0; 4915 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby); 4916 for(i=1; i<k; i++){ 4917 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby); 4918 if( d2<dist ){ 4919 closest = i; 4920 dist = d2; 4921 } 4922 } 4923 }else{ 4924 closest = 0; 4925 } 4926 4927 iPage = get4byte(&aData[8+closest*4]); 4928 testcase( iPage==mxPage ); 4929 if( iPage>mxPage ){ 4930 rc = SQLITE_CORRUPT_BKPT; 4931 goto end_allocate_page; 4932 } 4933 testcase( iPage==mxPage ); 4934 if( !searchList || iPage==nearby ){ 4935 int noContent; 4936 *pPgno = iPage; 4937 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" 4938 ": %d more free pages\n", 4939 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 4940 if( closest<k-1 ){ 4941 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 4942 } 4943 put4byte(&aData[4], k-1); 4944 assert( sqlite3PagerIswriteable(pTrunk->pDbPage) ); 4945 noContent = !btreeGetHasContent(pBt, *pPgno); 4946 rc = btreeGetPage(pBt, *pPgno, ppPage, noContent); 4947 if( rc==SQLITE_OK ){ 4948 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4949 if( rc!=SQLITE_OK ){ 4950 releasePage(*ppPage); 4951 } 4952 } 4953 searchList = 0; 4954 } 4955 } 4956 releasePage(pPrevTrunk); 4957 pPrevTrunk = 0; 4958 }while( searchList ); 4959 }else{ 4960 /* There are no pages on the freelist, so create a new page at the 4961 ** end of the file */ 4962 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 4963 if( rc ) return rc; 4964 pBt->nPage++; 4965 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++; 4966 4967 #ifndef SQLITE_OMIT_AUTOVACUUM 4968 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){ 4969 /* If *pPgno refers to a pointer-map page, allocate two new pages 4970 ** at the end of the file instead of one. The first allocated page 4971 ** becomes a new pointer-map page, the second is used by the caller. 4972 */ 4973 MemPage *pPg = 0; 4974 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage)); 4975 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) ); 4976 rc = btreeGetPage(pBt, pBt->nPage, &pPg, 1); 4977 if( rc==SQLITE_OK ){ 4978 rc = sqlite3PagerWrite(pPg->pDbPage); 4979 releasePage(pPg); 4980 } 4981 if( rc ) return rc; 4982 pBt->nPage++; 4983 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; } 4984 } 4985 #endif 4986 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage); 4987 *pPgno = pBt->nPage; 4988 4989 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4990 rc = btreeGetPage(pBt, *pPgno, ppPage, 1); 4991 if( rc ) return rc; 4992 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4993 if( rc!=SQLITE_OK ){ 4994 releasePage(*ppPage); 4995 } 4996 TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); 4997 } 4998 4999 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 5000 5001 end_allocate_page: 5002 releasePage(pTrunk); 5003 releasePage(pPrevTrunk); 5004 if( rc==SQLITE_OK ){ 5005 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ 5006 releasePage(*ppPage); 5007 return SQLITE_CORRUPT_BKPT; 5008 } 5009 (*ppPage)->isInit = 0; 5010 }else{ 5011 *ppPage = 0; 5012 } 5013 return rc; 5014 } 5015 5016 /* 5017 ** This function is used to add page iPage to the database file free-list. 5018 ** It is assumed that the page is not already a part of the free-list. 5019 ** 5020 ** The value passed as the second argument to this function is optional. 5021 ** If the caller happens to have a pointer to the MemPage object 5022 ** corresponding to page iPage handy, it may pass it as the second value. 5023 ** Otherwise, it may pass NULL. 5024 ** 5025 ** If a pointer to a MemPage object is passed as the second argument, 5026 ** its reference count is not altered by this function. 5027 */ 5028 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ 5029 MemPage *pTrunk = 0; /* Free-list trunk page */ 5030 Pgno iTrunk = 0; /* Page number of free-list trunk page */ 5031 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ 5032 MemPage *pPage; /* Page being freed. May be NULL. */ 5033 int rc; /* Return Code */ 5034 int nFree; /* Initial number of pages on free-list */ 5035 5036 assert( sqlite3_mutex_held(pBt->mutex) ); 5037 assert( iPage>1 ); 5038 assert( !pMemPage || pMemPage->pgno==iPage ); 5039 5040 if( pMemPage ){ 5041 pPage = pMemPage; 5042 sqlite3PagerRef(pPage->pDbPage); 5043 }else{ 5044 pPage = btreePageLookup(pBt, iPage); 5045 } 5046 5047 /* Increment the free page count on pPage1 */ 5048 rc = sqlite3PagerWrite(pPage1->pDbPage); 5049 if( rc ) goto freepage_out; 5050 nFree = get4byte(&pPage1->aData[36]); 5051 put4byte(&pPage1->aData[36], nFree+1); 5052 5053 if( pBt->secureDelete ){ 5054 /* If the secure_delete option is enabled, then 5055 ** always fully overwrite deleted information with zeros. 5056 */ 5057 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) ) 5058 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0) 5059 ){ 5060 goto freepage_out; 5061 } 5062 memset(pPage->aData, 0, pPage->pBt->pageSize); 5063 } 5064 5065 /* If the database supports auto-vacuum, write an entry in the pointer-map 5066 ** to indicate that the page is free. 5067 */ 5068 if( ISAUTOVACUUM ){ 5069 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); 5070 if( rc ) goto freepage_out; 5071 } 5072 5073 /* Now manipulate the actual database free-list structure. There are two 5074 ** possibilities. If the free-list is currently empty, or if the first 5075 ** trunk page in the free-list is full, then this page will become a 5076 ** new free-list trunk page. Otherwise, it will become a leaf of the 5077 ** first trunk page in the current free-list. This block tests if it 5078 ** is possible to add the page as a new free-list leaf. 5079 */ 5080 if( nFree!=0 ){ 5081 u32 nLeaf; /* Initial number of leaf cells on trunk page */ 5082 5083 iTrunk = get4byte(&pPage1->aData[32]); 5084 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 5085 if( rc!=SQLITE_OK ){ 5086 goto freepage_out; 5087 } 5088 5089 nLeaf = get4byte(&pTrunk->aData[4]); 5090 assert( pBt->usableSize>32 ); 5091 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){ 5092 rc = SQLITE_CORRUPT_BKPT; 5093 goto freepage_out; 5094 } 5095 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){ 5096 /* In this case there is room on the trunk page to insert the page 5097 ** being freed as a new leaf. 5098 ** 5099 ** Note that the trunk page is not really full until it contains 5100 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have 5101 ** coded. But due to a coding error in versions of SQLite prior to 5102 ** 3.6.0, databases with freelist trunk pages holding more than 5103 ** usableSize/4 - 8 entries will be reported as corrupt. In order 5104 ** to maintain backwards compatibility with older versions of SQLite, 5105 ** we will continue to restrict the number of entries to usableSize/4 - 8 5106 ** for now. At some point in the future (once everyone has upgraded 5107 ** to 3.6.0 or later) we should consider fixing the conditional above 5108 ** to read "usableSize/4-2" instead of "usableSize/4-8". 5109 */ 5110 rc = sqlite3PagerWrite(pTrunk->pDbPage); 5111 if( rc==SQLITE_OK ){ 5112 put4byte(&pTrunk->aData[4], nLeaf+1); 5113 put4byte(&pTrunk->aData[8+nLeaf*4], iPage); 5114 if( pPage && !pBt->secureDelete ){ 5115 sqlite3PagerDontWrite(pPage->pDbPage); 5116 } 5117 rc = btreeSetHasContent(pBt, iPage); 5118 } 5119 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); 5120 goto freepage_out; 5121 } 5122 } 5123 5124 /* If control flows to this point, then it was not possible to add the 5125 ** the page being freed as a leaf page of the first trunk in the free-list. 5126 ** Possibly because the free-list is empty, or possibly because the 5127 ** first trunk in the free-list is full. Either way, the page being freed 5128 ** will become the new first trunk page in the free-list. 5129 */ 5130 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){ 5131 goto freepage_out; 5132 } 5133 rc = sqlite3PagerWrite(pPage->pDbPage); 5134 if( rc!=SQLITE_OK ){ 5135 goto freepage_out; 5136 } 5137 put4byte(pPage->aData, iTrunk); 5138 put4byte(&pPage->aData[4], 0); 5139 put4byte(&pPage1->aData[32], iPage); 5140 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk)); 5141 5142 freepage_out: 5143 if( pPage ){ 5144 pPage->isInit = 0; 5145 } 5146 releasePage(pPage); 5147 releasePage(pTrunk); 5148 return rc; 5149 } 5150 static void freePage(MemPage *pPage, int *pRC){ 5151 if( (*pRC)==SQLITE_OK ){ 5152 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno); 5153 } 5154 } 5155 5156 /* 5157 ** Free any overflow pages associated with the given Cell. 5158 */ 5159 static int clearCell(MemPage *pPage, unsigned char *pCell){ 5160 BtShared *pBt = pPage->pBt; 5161 CellInfo info; 5162 Pgno ovflPgno; 5163 int rc; 5164 int nOvfl; 5165 u32 ovflPageSize; 5166 5167 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5168 btreeParseCellPtr(pPage, pCell, &info); 5169 if( info.iOverflow==0 ){ 5170 return SQLITE_OK; /* No overflow pages. Return without doing anything */ 5171 } 5172 ovflPgno = get4byte(&pCell[info.iOverflow]); 5173 assert( pBt->usableSize > 4 ); 5174 ovflPageSize = pBt->usableSize - 4; 5175 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; 5176 assert( ovflPgno==0 || nOvfl>0 ); 5177 while( nOvfl-- ){ 5178 Pgno iNext = 0; 5179 MemPage *pOvfl = 0; 5180 if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){ 5181 /* 0 is not a legal page number and page 1 cannot be an 5182 ** overflow page. Therefore if ovflPgno<2 or past the end of the 5183 ** file the database must be corrupt. */ 5184 return SQLITE_CORRUPT_BKPT; 5185 } 5186 if( nOvfl ){ 5187 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); 5188 if( rc ) return rc; 5189 } 5190 5191 if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) ) 5192 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1 5193 ){ 5194 /* There is no reason any cursor should have an outstanding reference 5195 ** to an overflow page belonging to a cell that is being deleted/updated. 5196 ** So if there exists more than one reference to this page, then it 5197 ** must not really be an overflow page and the database must be corrupt. 5198 ** It is helpful to detect this before calling freePage2(), as 5199 ** freePage2() may zero the page contents if secure-delete mode is 5200 ** enabled. If this 'overflow' page happens to be a page that the 5201 ** caller is iterating through or using in some other way, this 5202 ** can be problematic. 5203 */ 5204 rc = SQLITE_CORRUPT_BKPT; 5205 }else{ 5206 rc = freePage2(pBt, pOvfl, ovflPgno); 5207 } 5208 5209 if( pOvfl ){ 5210 sqlite3PagerUnref(pOvfl->pDbPage); 5211 } 5212 if( rc ) return rc; 5213 ovflPgno = iNext; 5214 } 5215 return SQLITE_OK; 5216 } 5217 5218 /* 5219 ** Create the byte sequence used to represent a cell on page pPage 5220 ** and write that byte sequence into pCell[]. Overflow pages are 5221 ** allocated and filled in as necessary. The calling procedure 5222 ** is responsible for making sure sufficient space has been allocated 5223 ** for pCell[]. 5224 ** 5225 ** Note that pCell does not necessary need to point to the pPage->aData 5226 ** area. pCell might point to some temporary storage. The cell will 5227 ** be constructed in this temporary area then copied into pPage->aData 5228 ** later. 5229 */ 5230 static int fillInCell( 5231 MemPage *pPage, /* The page that contains the cell */ 5232 unsigned char *pCell, /* Complete text of the cell */ 5233 const void *pKey, i64 nKey, /* The key */ 5234 const void *pData,int nData, /* The data */ 5235 int nZero, /* Extra zero bytes to append to pData */ 5236 int *pnSize /* Write cell size here */ 5237 ){ 5238 int nPayload; 5239 const u8 *pSrc; 5240 int nSrc, n, rc; 5241 int spaceLeft; 5242 MemPage *pOvfl = 0; 5243 MemPage *pToRelease = 0; 5244 unsigned char *pPrior; 5245 unsigned char *pPayload; 5246 BtShared *pBt = pPage->pBt; 5247 Pgno pgnoOvfl = 0; 5248 int nHeader; 5249 CellInfo info; 5250 5251 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5252 5253 /* pPage is not necessarily writeable since pCell might be auxiliary 5254 ** buffer space that is separate from the pPage buffer area */ 5255 assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize] 5256 || sqlite3PagerIswriteable(pPage->pDbPage) ); 5257 5258 /* Fill in the header. */ 5259 nHeader = 0; 5260 if( !pPage->leaf ){ 5261 nHeader += 4; 5262 } 5263 if( pPage->hasData ){ 5264 nHeader += putVarint(&pCell[nHeader], nData+nZero); 5265 }else{ 5266 nData = nZero = 0; 5267 } 5268 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey); 5269 btreeParseCellPtr(pPage, pCell, &info); 5270 assert( info.nHeader==nHeader ); 5271 assert( info.nKey==nKey ); 5272 assert( info.nData==(u32)(nData+nZero) ); 5273 5274 /* Fill in the payload */ 5275 nPayload = nData + nZero; 5276 if( pPage->intKey ){ 5277 pSrc = pData; 5278 nSrc = nData; 5279 nData = 0; 5280 }else{ 5281 if( NEVER(nKey>0x7fffffff || pKey==0) ){ 5282 return SQLITE_CORRUPT_BKPT; 5283 } 5284 nPayload += (int)nKey; 5285 pSrc = pKey; 5286 nSrc = (int)nKey; 5287 } 5288 *pnSize = info.nSize; 5289 spaceLeft = info.nLocal; 5290 pPayload = &pCell[nHeader]; 5291 pPrior = &pCell[info.iOverflow]; 5292 5293 while( nPayload>0 ){ 5294 if( spaceLeft==0 ){ 5295 #ifndef SQLITE_OMIT_AUTOVACUUM 5296 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ 5297 if( pBt->autoVacuum ){ 5298 do{ 5299 pgnoOvfl++; 5300 } while( 5301 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 5302 ); 5303 } 5304 #endif 5305 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0); 5306 #ifndef SQLITE_OMIT_AUTOVACUUM 5307 /* If the database supports auto-vacuum, and the second or subsequent 5308 ** overflow page is being allocated, add an entry to the pointer-map 5309 ** for that page now. 5310 ** 5311 ** If this is the first overflow page, then write a partial entry 5312 ** to the pointer-map. If we write nothing to this pointer-map slot, 5313 ** then the optimistic overflow chain processing in clearCell() 5314 ** may misinterpret the uninitialised values and delete the 5315 ** wrong pages from the database. 5316 */ 5317 if( pBt->autoVacuum && rc==SQLITE_OK ){ 5318 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); 5319 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc); 5320 if( rc ){ 5321 releasePage(pOvfl); 5322 } 5323 } 5324 #endif 5325 if( rc ){ 5326 releasePage(pToRelease); 5327 return rc; 5328 } 5329 5330 /* If pToRelease is not zero than pPrior points into the data area 5331 ** of pToRelease. Make sure pToRelease is still writeable. */ 5332 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 5333 5334 /* If pPrior is part of the data area of pPage, then make sure pPage 5335 ** is still writeable */ 5336 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize] 5337 || sqlite3PagerIswriteable(pPage->pDbPage) ); 5338 5339 put4byte(pPrior, pgnoOvfl); 5340 releasePage(pToRelease); 5341 pToRelease = pOvfl; 5342 pPrior = pOvfl->aData; 5343 put4byte(pPrior, 0); 5344 pPayload = &pOvfl->aData[4]; 5345 spaceLeft = pBt->usableSize - 4; 5346 } 5347 n = nPayload; 5348 if( n>spaceLeft ) n = spaceLeft; 5349 5350 /* If pToRelease is not zero than pPayload points into the data area 5351 ** of pToRelease. Make sure pToRelease is still writeable. */ 5352 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 5353 5354 /* If pPayload is part of the data area of pPage, then make sure pPage 5355 ** is still writeable */ 5356 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize] 5357 || sqlite3PagerIswriteable(pPage->pDbPage) ); 5358 5359 if( nSrc>0 ){ 5360 if( n>nSrc ) n = nSrc; 5361 assert( pSrc ); 5362 memcpy(pPayload, pSrc, n); 5363 }else{ 5364 memset(pPayload, 0, n); 5365 } 5366 nPayload -= n; 5367 pPayload += n; 5368 pSrc += n; 5369 nSrc -= n; 5370 spaceLeft -= n; 5371 if( nSrc==0 ){ 5372 nSrc = nData; 5373 pSrc = pData; 5374 } 5375 } 5376 releasePage(pToRelease); 5377 return SQLITE_OK; 5378 } 5379 5380 /* 5381 ** Remove the i-th cell from pPage. This routine effects pPage only. 5382 ** The cell content is not freed or deallocated. It is assumed that 5383 ** the cell content has been copied someplace else. This routine just 5384 ** removes the reference to the cell from pPage. 5385 ** 5386 ** "sz" must be the number of bytes in the cell. 5387 */ 5388 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ 5389 int i; /* Loop counter */ 5390 u32 pc; /* Offset to cell content of cell being deleted */ 5391 u8 *data; /* pPage->aData */ 5392 u8 *ptr; /* Used to move bytes around within data[] */ 5393 int rc; /* The return code */ 5394 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */ 5395 5396 if( *pRC ) return; 5397 5398 assert( idx>=0 && idx<pPage->nCell ); 5399 assert( sz==cellSize(pPage, idx) ); 5400 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5401 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5402 data = pPage->aData; 5403 ptr = &data[pPage->cellOffset + 2*idx]; 5404 pc = get2byte(ptr); 5405 hdr = pPage->hdrOffset; 5406 testcase( pc==get2byte(&data[hdr+5]) ); 5407 testcase( pc+sz==pPage->pBt->usableSize ); 5408 if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){ 5409 *pRC = SQLITE_CORRUPT_BKPT; 5410 return; 5411 } 5412 rc = freeSpace(pPage, pc, sz); 5413 if( rc ){ 5414 *pRC = rc; 5415 return; 5416 } 5417 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){ 5418 ptr[0] = ptr[2]; 5419 ptr[1] = ptr[3]; 5420 } 5421 pPage->nCell--; 5422 put2byte(&data[hdr+3], pPage->nCell); 5423 pPage->nFree += 2; 5424 } 5425 5426 /* 5427 ** Insert a new cell on pPage at cell index "i". pCell points to the 5428 ** content of the cell. 5429 ** 5430 ** If the cell content will fit on the page, then put it there. If it 5431 ** will not fit, then make a copy of the cell content into pTemp if 5432 ** pTemp is not null. Regardless of pTemp, allocate a new entry 5433 ** in pPage->aOvfl[] and make it point to the cell content (either 5434 ** in pTemp or the original pCell) and also record its index. 5435 ** Allocating a new entry in pPage->aCell[] implies that 5436 ** pPage->nOverflow is incremented. 5437 ** 5438 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the 5439 ** cell. The caller will overwrite them after this function returns. If 5440 ** nSkip is non-zero, then pCell may not point to an invalid memory location 5441 ** (but pCell+nSkip is always valid). 5442 */ 5443 static void insertCell( 5444 MemPage *pPage, /* Page into which we are copying */ 5445 int i, /* New cell becomes the i-th cell of the page */ 5446 u8 *pCell, /* Content of the new cell */ 5447 int sz, /* Bytes of content in pCell */ 5448 u8 *pTemp, /* Temp storage space for pCell, if needed */ 5449 Pgno iChild, /* If non-zero, replace first 4 bytes with this value */ 5450 int *pRC /* Read and write return code from here */ 5451 ){ 5452 int idx = 0; /* Where to write new cell content in data[] */ 5453 int j; /* Loop counter */ 5454 int end; /* First byte past the last cell pointer in data[] */ 5455 int ins; /* Index in data[] where new cell pointer is inserted */ 5456 int cellOffset; /* Address of first cell pointer in data[] */ 5457 u8 *data; /* The content of the whole page */ 5458 u8 *ptr; /* Used for moving information around in data[] */ 5459 5460 int nSkip = (iChild ? 4 : 0); 5461 5462 if( *pRC ) return; 5463 5464 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 5465 assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 ); 5466 assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) ); 5467 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5468 /* The cell should normally be sized correctly. However, when moving a 5469 ** malformed cell from a leaf page to an interior page, if the cell size 5470 ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size 5471 ** might be less than 8 (leaf-size + pointer) on the interior node. Hence 5472 ** the term after the || in the following assert(). */ 5473 assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) ); 5474 if( pPage->nOverflow || sz+2>pPage->nFree ){ 5475 if( pTemp ){ 5476 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); 5477 pCell = pTemp; 5478 } 5479 if( iChild ){ 5480 put4byte(pCell, iChild); 5481 } 5482 j = pPage->nOverflow++; 5483 assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) ); 5484 pPage->aOvfl[j].pCell = pCell; 5485 pPage->aOvfl[j].idx = (u16)i; 5486 }else{ 5487 int rc = sqlite3PagerWrite(pPage->pDbPage); 5488 if( rc!=SQLITE_OK ){ 5489 *pRC = rc; 5490 return; 5491 } 5492 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5493 data = pPage->aData; 5494 cellOffset = pPage->cellOffset; 5495 end = cellOffset + 2*pPage->nCell; 5496 ins = cellOffset + 2*i; 5497 rc = allocateSpace(pPage, sz, &idx); 5498 if( rc ){ *pRC = rc; return; } 5499 /* The allocateSpace() routine guarantees the following two properties 5500 ** if it returns success */ 5501 assert( idx >= end+2 ); 5502 assert( idx+sz <= (int)pPage->pBt->usableSize ); 5503 pPage->nCell++; 5504 pPage->nFree -= (u16)(2 + sz); 5505 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); 5506 if( iChild ){ 5507 put4byte(&data[idx], iChild); 5508 } 5509 for(j=end, ptr=&data[j]; j>ins; j-=2, ptr-=2){ 5510 ptr[0] = ptr[-2]; 5511 ptr[1] = ptr[-1]; 5512 } 5513 put2byte(&data[ins], idx); 5514 put2byte(&data[pPage->hdrOffset+3], pPage->nCell); 5515 #ifndef SQLITE_OMIT_AUTOVACUUM 5516 if( pPage->pBt->autoVacuum ){ 5517 /* The cell may contain a pointer to an overflow page. If so, write 5518 ** the entry for the overflow page into the pointer map. 5519 */ 5520 ptrmapPutOvflPtr(pPage, pCell, pRC); 5521 } 5522 #endif 5523 } 5524 } 5525 5526 /* 5527 ** Add a list of cells to a page. The page should be initially empty. 5528 ** The cells are guaranteed to fit on the page. 5529 */ 5530 static void assemblePage( 5531 MemPage *pPage, /* The page to be assemblied */ 5532 int nCell, /* The number of cells to add to this page */ 5533 u8 **apCell, /* Pointers to cell bodies */ 5534 u16 *aSize /* Sizes of the cells */ 5535 ){ 5536 int i; /* Loop counter */ 5537 u8 *pCellptr; /* Address of next cell pointer */ 5538 int cellbody; /* Address of next cell body */ 5539 u8 * const data = pPage->aData; /* Pointer to data for pPage */ 5540 const int hdr = pPage->hdrOffset; /* Offset of header on pPage */ 5541 const int nUsable = pPage->pBt->usableSize; /* Usable size of page */ 5542 5543 assert( pPage->nOverflow==0 ); 5544 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5545 assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt) 5546 && (int)MX_CELL(pPage->pBt)<=10921); 5547 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5548 5549 /* Check that the page has just been zeroed by zeroPage() */ 5550 assert( pPage->nCell==0 ); 5551 assert( get2byteNotZero(&data[hdr+5])==nUsable ); 5552 5553 pCellptr = &data[pPage->cellOffset + nCell*2]; 5554 cellbody = nUsable; 5555 for(i=nCell-1; i>=0; i--){ 5556 pCellptr -= 2; 5557 cellbody -= aSize[i]; 5558 put2byte(pCellptr, cellbody); 5559 memcpy(&data[cellbody], apCell[i], aSize[i]); 5560 } 5561 put2byte(&data[hdr+3], nCell); 5562 put2byte(&data[hdr+5], cellbody); 5563 pPage->nFree -= (nCell*2 + nUsable - cellbody); 5564 pPage->nCell = (u16)nCell; 5565 } 5566 5567 /* 5568 ** The following parameters determine how many adjacent pages get involved 5569 ** in a balancing operation. NN is the number of neighbors on either side 5570 ** of the page that participate in the balancing operation. NB is the 5571 ** total number of pages that participate, including the target page and 5572 ** NN neighbors on either side. 5573 ** 5574 ** The minimum value of NN is 1 (of course). Increasing NN above 1 5575 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 5576 ** in exchange for a larger degradation in INSERT and UPDATE performance. 5577 ** The value of NN appears to give the best results overall. 5578 */ 5579 #define NN 1 /* Number of neighbors on either side of pPage */ 5580 #define NB (NN*2+1) /* Total pages involved in the balance */ 5581 5582 5583 #ifndef SQLITE_OMIT_QUICKBALANCE 5584 /* 5585 ** This version of balance() handles the common special case where 5586 ** a new entry is being inserted on the extreme right-end of the 5587 ** tree, in other words, when the new entry will become the largest 5588 ** entry in the tree. 5589 ** 5590 ** Instead of trying to balance the 3 right-most leaf pages, just add 5591 ** a new page to the right-hand side and put the one new entry in 5592 ** that page. This leaves the right side of the tree somewhat 5593 ** unbalanced. But odds are that we will be inserting new entries 5594 ** at the end soon afterwards so the nearly empty page will quickly 5595 ** fill up. On average. 5596 ** 5597 ** pPage is the leaf page which is the right-most page in the tree. 5598 ** pParent is its parent. pPage must have a single overflow entry 5599 ** which is also the right-most entry on the page. 5600 ** 5601 ** The pSpace buffer is used to store a temporary copy of the divider 5602 ** cell that will be inserted into pParent. Such a cell consists of a 4 5603 ** byte page number followed by a variable length integer. In other 5604 ** words, at most 13 bytes. Hence the pSpace buffer must be at 5605 ** least 13 bytes in size. 5606 */ 5607 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ 5608 BtShared *const pBt = pPage->pBt; /* B-Tree Database */ 5609 MemPage *pNew; /* Newly allocated page */ 5610 int rc; /* Return Code */ 5611 Pgno pgnoNew; /* Page number of pNew */ 5612 5613 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5614 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 5615 assert( pPage->nOverflow==1 ); 5616 5617 /* This error condition is now caught prior to reaching this function */ 5618 if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT; 5619 5620 /* Allocate a new page. This page will become the right-sibling of 5621 ** pPage. Make the parent page writable, so that the new divider cell 5622 ** may be inserted. If both these operations are successful, proceed. 5623 */ 5624 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 5625 5626 if( rc==SQLITE_OK ){ 5627 5628 u8 *pOut = &pSpace[4]; 5629 u8 *pCell = pPage->aOvfl[0].pCell; 5630 u16 szCell = cellSizePtr(pPage, pCell); 5631 u8 *pStop; 5632 5633 assert( sqlite3PagerIswriteable(pNew->pDbPage) ); 5634 assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); 5635 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); 5636 assemblePage(pNew, 1, &pCell, &szCell); 5637 5638 /* If this is an auto-vacuum database, update the pointer map 5639 ** with entries for the new page, and any pointer from the 5640 ** cell on the page to an overflow page. If either of these 5641 ** operations fails, the return code is set, but the contents 5642 ** of the parent page are still manipulated by thh code below. 5643 ** That is Ok, at this point the parent page is guaranteed to 5644 ** be marked as dirty. Returning an error code will cause a 5645 ** rollback, undoing any changes made to the parent page. 5646 */ 5647 if( ISAUTOVACUUM ){ 5648 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc); 5649 if( szCell>pNew->minLocal ){ 5650 ptrmapPutOvflPtr(pNew, pCell, &rc); 5651 } 5652 } 5653 5654 /* Create a divider cell to insert into pParent. The divider cell 5655 ** consists of a 4-byte page number (the page number of pPage) and 5656 ** a variable length key value (which must be the same value as the 5657 ** largest key on pPage). 5658 ** 5659 ** To find the largest key value on pPage, first find the right-most 5660 ** cell on pPage. The first two fields of this cell are the 5661 ** record-length (a variable length integer at most 32-bits in size) 5662 ** and the key value (a variable length integer, may have any value). 5663 ** The first of the while(...) loops below skips over the record-length 5664 ** field. The second while(...) loop copies the key value from the 5665 ** cell on pPage into the pSpace buffer. 5666 */ 5667 pCell = findCell(pPage, pPage->nCell-1); 5668 pStop = &pCell[9]; 5669 while( (*(pCell++)&0x80) && pCell<pStop ); 5670 pStop = &pCell[9]; 5671 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); 5672 5673 /* Insert the new divider cell into pParent. */ 5674 insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), 5675 0, pPage->pgno, &rc); 5676 5677 /* Set the right-child pointer of pParent to point to the new page. */ 5678 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 5679 5680 /* Release the reference to the new page. */ 5681 releasePage(pNew); 5682 } 5683 5684 return rc; 5685 } 5686 #endif /* SQLITE_OMIT_QUICKBALANCE */ 5687 5688 #if 0 5689 /* 5690 ** This function does not contribute anything to the operation of SQLite. 5691 ** it is sometimes activated temporarily while debugging code responsible 5692 ** for setting pointer-map entries. 5693 */ 5694 static int ptrmapCheckPages(MemPage **apPage, int nPage){ 5695 int i, j; 5696 for(i=0; i<nPage; i++){ 5697 Pgno n; 5698 u8 e; 5699 MemPage *pPage = apPage[i]; 5700 BtShared *pBt = pPage->pBt; 5701 assert( pPage->isInit ); 5702 5703 for(j=0; j<pPage->nCell; j++){ 5704 CellInfo info; 5705 u8 *z; 5706 5707 z = findCell(pPage, j); 5708 btreeParseCellPtr(pPage, z, &info); 5709 if( info.iOverflow ){ 5710 Pgno ovfl = get4byte(&z[info.iOverflow]); 5711 ptrmapGet(pBt, ovfl, &e, &n); 5712 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); 5713 } 5714 if( !pPage->leaf ){ 5715 Pgno child = get4byte(z); 5716 ptrmapGet(pBt, child, &e, &n); 5717 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 5718 } 5719 } 5720 if( !pPage->leaf ){ 5721 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); 5722 ptrmapGet(pBt, child, &e, &n); 5723 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 5724 } 5725 } 5726 return 1; 5727 } 5728 #endif 5729 5730 /* 5731 ** This function is used to copy the contents of the b-tree node stored 5732 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then 5733 ** the pointer-map entries for each child page are updated so that the 5734 ** parent page stored in the pointer map is page pTo. If pFrom contained 5735 ** any cells with overflow page pointers, then the corresponding pointer 5736 ** map entries are also updated so that the parent page is page pTo. 5737 ** 5738 ** If pFrom is currently carrying any overflow cells (entries in the 5739 ** MemPage.aOvfl[] array), they are not copied to pTo. 5740 ** 5741 ** Before returning, page pTo is reinitialized using btreeInitPage(). 5742 ** 5743 ** The performance of this function is not critical. It is only used by 5744 ** the balance_shallower() and balance_deeper() procedures, neither of 5745 ** which are called often under normal circumstances. 5746 */ 5747 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ 5748 if( (*pRC)==SQLITE_OK ){ 5749 BtShared * const pBt = pFrom->pBt; 5750 u8 * const aFrom = pFrom->aData; 5751 u8 * const aTo = pTo->aData; 5752 int const iFromHdr = pFrom->hdrOffset; 5753 int const iToHdr = ((pTo->pgno==1) ? 100 : 0); 5754 int rc; 5755 int iData; 5756 5757 5758 assert( pFrom->isInit ); 5759 assert( pFrom->nFree>=iToHdr ); 5760 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize ); 5761 5762 /* Copy the b-tree node content from page pFrom to page pTo. */ 5763 iData = get2byte(&aFrom[iFromHdr+5]); 5764 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); 5765 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); 5766 5767 /* Reinitialize page pTo so that the contents of the MemPage structure 5768 ** match the new data. The initialization of pTo can actually fail under 5769 ** fairly obscure circumstances, even though it is a copy of initialized 5770 ** page pFrom. 5771 */ 5772 pTo->isInit = 0; 5773 rc = btreeInitPage(pTo); 5774 if( rc!=SQLITE_OK ){ 5775 *pRC = rc; 5776 return; 5777 } 5778 5779 /* If this is an auto-vacuum database, update the pointer-map entries 5780 ** for any b-tree or overflow pages that pTo now contains the pointers to. 5781 */ 5782 if( ISAUTOVACUUM ){ 5783 *pRC = setChildPtrmaps(pTo); 5784 } 5785 } 5786 } 5787 5788 /* 5789 ** This routine redistributes cells on the iParentIdx'th child of pParent 5790 ** (hereafter "the page") and up to 2 siblings so that all pages have about the 5791 ** same amount of free space. Usually a single sibling on either side of the 5792 ** page are used in the balancing, though both siblings might come from one 5793 ** side if the page is the first or last child of its parent. If the page 5794 ** has fewer than 2 siblings (something which can only happen if the page 5795 ** is a root page or a child of a root page) then all available siblings 5796 ** participate in the balancing. 5797 ** 5798 ** The number of siblings of the page might be increased or decreased by 5799 ** one or two in an effort to keep pages nearly full but not over full. 5800 ** 5801 ** Note that when this routine is called, some of the cells on the page 5802 ** might not actually be stored in MemPage.aData[]. This can happen 5803 ** if the page is overfull. This routine ensures that all cells allocated 5804 ** to the page and its siblings fit into MemPage.aData[] before returning. 5805 ** 5806 ** In the course of balancing the page and its siblings, cells may be 5807 ** inserted into or removed from the parent page (pParent). Doing so 5808 ** may cause the parent page to become overfull or underfull. If this 5809 ** happens, it is the responsibility of the caller to invoke the correct 5810 ** balancing routine to fix this problem (see the balance() routine). 5811 ** 5812 ** If this routine fails for any reason, it might leave the database 5813 ** in a corrupted state. So if this routine fails, the database should 5814 ** be rolled back. 5815 ** 5816 ** The third argument to this function, aOvflSpace, is a pointer to a 5817 ** buffer big enough to hold one page. If while inserting cells into the parent 5818 ** page (pParent) the parent page becomes overfull, this buffer is 5819 ** used to store the parent's overflow cells. Because this function inserts 5820 ** a maximum of four divider cells into the parent page, and the maximum 5821 ** size of a cell stored within an internal node is always less than 1/4 5822 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large 5823 ** enough for all overflow cells. 5824 ** 5825 ** If aOvflSpace is set to a null pointer, this function returns 5826 ** SQLITE_NOMEM. 5827 */ 5828 static int balance_nonroot( 5829 MemPage *pParent, /* Parent page of siblings being balanced */ 5830 int iParentIdx, /* Index of "the page" in pParent */ 5831 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ 5832 int isRoot /* True if pParent is a root-page */ 5833 ){ 5834 BtShared *pBt; /* The whole database */ 5835 int nCell = 0; /* Number of cells in apCell[] */ 5836 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 5837 int nNew = 0; /* Number of pages in apNew[] */ 5838 int nOld; /* Number of pages in apOld[] */ 5839 int i, j, k; /* Loop counters */ 5840 int nxDiv; /* Next divider slot in pParent->aCell[] */ 5841 int rc = SQLITE_OK; /* The return code */ 5842 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */ 5843 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 5844 int usableSpace; /* Bytes in pPage beyond the header */ 5845 int pageFlags; /* Value of pPage->aData[0] */ 5846 int subtotal; /* Subtotal of bytes in cells on one page */ 5847 int iSpace1 = 0; /* First unused byte of aSpace1[] */ 5848 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ 5849 int szScratch; /* Size of scratch memory requested */ 5850 MemPage *apOld[NB]; /* pPage and up to two siblings */ 5851 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ 5852 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 5853 u8 *pRight; /* Location in parent of right-sibling pointer */ 5854 u8 *apDiv[NB-1]; /* Divider cells in pParent */ 5855 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ 5856 int szNew[NB+2]; /* Combined size of cells place on i-th page */ 5857 u8 **apCell = 0; /* All cells begin balanced */ 5858 u16 *szCell; /* Local size of all cells in apCell[] */ 5859 u8 *aSpace1; /* Space for copies of dividers cells */ 5860 Pgno pgno; /* Temp var to store a page number in */ 5861 5862 pBt = pParent->pBt; 5863 assert( sqlite3_mutex_held(pBt->mutex) ); 5864 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 5865 5866 #if 0 5867 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); 5868 #endif 5869 5870 /* At this point pParent may have at most one overflow cell. And if 5871 ** this overflow cell is present, it must be the cell with 5872 ** index iParentIdx. This scenario comes about when this function 5873 ** is called (indirectly) from sqlite3BtreeDelete(). 5874 */ 5875 assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); 5876 assert( pParent->nOverflow==0 || pParent->aOvfl[0].idx==iParentIdx ); 5877 5878 if( !aOvflSpace ){ 5879 return SQLITE_NOMEM; 5880 } 5881 5882 /* Find the sibling pages to balance. Also locate the cells in pParent 5883 ** that divide the siblings. An attempt is made to find NN siblings on 5884 ** either side of pPage. More siblings are taken from one side, however, 5885 ** if there are fewer than NN siblings on the other side. If pParent 5886 ** has NB or fewer children then all children of pParent are taken. 5887 ** 5888 ** This loop also drops the divider cells from the parent page. This 5889 ** way, the remainder of the function does not have to deal with any 5890 ** overflow cells in the parent page, since if any existed they will 5891 ** have already been removed. 5892 */ 5893 i = pParent->nOverflow + pParent->nCell; 5894 if( i<2 ){ 5895 nxDiv = 0; 5896 nOld = i+1; 5897 }else{ 5898 nOld = 3; 5899 if( iParentIdx==0 ){ 5900 nxDiv = 0; 5901 }else if( iParentIdx==i ){ 5902 nxDiv = i-2; 5903 }else{ 5904 nxDiv = iParentIdx-1; 5905 } 5906 i = 2; 5907 } 5908 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ 5909 pRight = &pParent->aData[pParent->hdrOffset+8]; 5910 }else{ 5911 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); 5912 } 5913 pgno = get4byte(pRight); 5914 while( 1 ){ 5915 rc = getAndInitPage(pBt, pgno, &apOld[i]); 5916 if( rc ){ 5917 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 5918 goto balance_cleanup; 5919 } 5920 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; 5921 if( (i--)==0 ) break; 5922 5923 if( i+nxDiv==pParent->aOvfl[0].idx && pParent->nOverflow ){ 5924 apDiv[i] = pParent->aOvfl[0].pCell; 5925 pgno = get4byte(apDiv[i]); 5926 szNew[i] = cellSizePtr(pParent, apDiv[i]); 5927 pParent->nOverflow = 0; 5928 }else{ 5929 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); 5930 pgno = get4byte(apDiv[i]); 5931 szNew[i] = cellSizePtr(pParent, apDiv[i]); 5932 5933 /* Drop the cell from the parent page. apDiv[i] still points to 5934 ** the cell within the parent, even though it has been dropped. 5935 ** This is safe because dropping a cell only overwrites the first 5936 ** four bytes of it, and this function does not need the first 5937 ** four bytes of the divider cell. So the pointer is safe to use 5938 ** later on. 5939 ** 5940 ** Unless SQLite is compiled in secure-delete mode. In this case, 5941 ** the dropCell() routine will overwrite the entire cell with zeroes. 5942 ** In this case, temporarily copy the cell into the aOvflSpace[] 5943 ** buffer. It will be copied out again as soon as the aSpace[] buffer 5944 ** is allocated. */ 5945 if( pBt->secureDelete ){ 5946 int iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData); 5947 if( (iOff+szNew[i])>(int)pBt->usableSize ){ 5948 rc = SQLITE_CORRUPT_BKPT; 5949 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 5950 goto balance_cleanup; 5951 }else{ 5952 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]); 5953 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; 5954 } 5955 } 5956 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc); 5957 } 5958 } 5959 5960 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte 5961 ** alignment */ 5962 nMaxCells = (nMaxCells + 3)&~3; 5963 5964 /* 5965 ** Allocate space for memory structures 5966 */ 5967 k = pBt->pageSize + ROUND8(sizeof(MemPage)); 5968 szScratch = 5969 nMaxCells*sizeof(u8*) /* apCell */ 5970 + nMaxCells*sizeof(u16) /* szCell */ 5971 + pBt->pageSize /* aSpace1 */ 5972 + k*nOld; /* Page copies (apCopy) */ 5973 apCell = sqlite3ScratchMalloc( szScratch ); 5974 if( apCell==0 ){ 5975 rc = SQLITE_NOMEM; 5976 goto balance_cleanup; 5977 } 5978 szCell = (u16*)&apCell[nMaxCells]; 5979 aSpace1 = (u8*)&szCell[nMaxCells]; 5980 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); 5981 5982 /* 5983 ** Load pointers to all cells on sibling pages and the divider cells 5984 ** into the local apCell[] array. Make copies of the divider cells 5985 ** into space obtained from aSpace1[] and remove the the divider Cells 5986 ** from pParent. 5987 ** 5988 ** If the siblings are on leaf pages, then the child pointers of the 5989 ** divider cells are stripped from the cells before they are copied 5990 ** into aSpace1[]. In this way, all cells in apCell[] are without 5991 ** child pointers. If siblings are not leaves, then all cell in 5992 ** apCell[] include child pointers. Either way, all cells in apCell[] 5993 ** are alike. 5994 ** 5995 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 5996 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 5997 */ 5998 leafCorrection = apOld[0]->leaf*4; 5999 leafData = apOld[0]->hasData; 6000 for(i=0; i<nOld; i++){ 6001 int limit; 6002 6003 /* Before doing anything else, take a copy of the i'th original sibling 6004 ** The rest of this function will use data from the copies rather 6005 ** that the original pages since the original pages will be in the 6006 ** process of being overwritten. */ 6007 MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i]; 6008 memcpy(pOld, apOld[i], sizeof(MemPage)); 6009 pOld->aData = (void*)&pOld[1]; 6010 memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize); 6011 6012 limit = pOld->nCell+pOld->nOverflow; 6013 for(j=0; j<limit; j++){ 6014 assert( nCell<nMaxCells ); 6015 apCell[nCell] = findOverflowCell(pOld, j); 6016 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); 6017 nCell++; 6018 } 6019 if( i<nOld-1 && !leafData){ 6020 u16 sz = (u16)szNew[i]; 6021 u8 *pTemp; 6022 assert( nCell<nMaxCells ); 6023 szCell[nCell] = sz; 6024 pTemp = &aSpace1[iSpace1]; 6025 iSpace1 += sz; 6026 assert( sz<=pBt->maxLocal+23 ); 6027 assert( iSpace1 <= (int)pBt->pageSize ); 6028 memcpy(pTemp, apDiv[i], sz); 6029 apCell[nCell] = pTemp+leafCorrection; 6030 assert( leafCorrection==0 || leafCorrection==4 ); 6031 szCell[nCell] = szCell[nCell] - leafCorrection; 6032 if( !pOld->leaf ){ 6033 assert( leafCorrection==0 ); 6034 assert( pOld->hdrOffset==0 ); 6035 /* The right pointer of the child page pOld becomes the left 6036 ** pointer of the divider cell */ 6037 memcpy(apCell[nCell], &pOld->aData[8], 4); 6038 }else{ 6039 assert( leafCorrection==4 ); 6040 if( szCell[nCell]<4 ){ 6041 /* Do not allow any cells smaller than 4 bytes. */ 6042 szCell[nCell] = 4; 6043 } 6044 } 6045 nCell++; 6046 } 6047 } 6048 6049 /* 6050 ** Figure out the number of pages needed to hold all nCell cells. 6051 ** Store this number in "k". Also compute szNew[] which is the total 6052 ** size of all cells on the i-th page and cntNew[] which is the index 6053 ** in apCell[] of the cell that divides page i from page i+1. 6054 ** cntNew[k] should equal nCell. 6055 ** 6056 ** Values computed by this block: 6057 ** 6058 ** k: The total number of sibling pages 6059 ** szNew[i]: Spaced used on the i-th sibling page. 6060 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to 6061 ** the right of the i-th sibling page. 6062 ** usableSpace: Number of bytes of space available on each sibling. 6063 ** 6064 */ 6065 usableSpace = pBt->usableSize - 12 + leafCorrection; 6066 for(subtotal=k=i=0; i<nCell; i++){ 6067 assert( i<nMaxCells ); 6068 subtotal += szCell[i] + 2; 6069 if( subtotal > usableSpace ){ 6070 szNew[k] = subtotal - szCell[i]; 6071 cntNew[k] = i; 6072 if( leafData ){ i--; } 6073 subtotal = 0; 6074 k++; 6075 if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } 6076 } 6077 } 6078 szNew[k] = subtotal; 6079 cntNew[k] = nCell; 6080 k++; 6081 6082 /* 6083 ** The packing computed by the previous block is biased toward the siblings 6084 ** on the left side. The left siblings are always nearly full, while the 6085 ** right-most sibling might be nearly empty. This block of code attempts 6086 ** to adjust the packing of siblings to get a better balance. 6087 ** 6088 ** This adjustment is more than an optimization. The packing above might 6089 ** be so out of balance as to be illegal. For example, the right-most 6090 ** sibling might be completely empty. This adjustment is not optional. 6091 */ 6092 for(i=k-1; i>0; i--){ 6093 int szRight = szNew[i]; /* Size of sibling on the right */ 6094 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 6095 int r; /* Index of right-most cell in left sibling */ 6096 int d; /* Index of first cell to the left of right sibling */ 6097 6098 r = cntNew[i-1] - 1; 6099 d = r + 1 - leafData; 6100 assert( d<nMaxCells ); 6101 assert( r<nMaxCells ); 6102 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){ 6103 szRight += szCell[d] + 2; 6104 szLeft -= szCell[r] + 2; 6105 cntNew[i-1]--; 6106 r = cntNew[i-1] - 1; 6107 d = r + 1 - leafData; 6108 } 6109 szNew[i] = szRight; 6110 szNew[i-1] = szLeft; 6111 } 6112 6113 /* Either we found one or more cells (cntnew[0])>0) or pPage is 6114 ** a virtual root page. A virtual root page is when the real root 6115 ** page is page 1 and we are the only child of that page. 6116 */ 6117 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); 6118 6119 TRACE(("BALANCE: old: %d %d %d ", 6120 apOld[0]->pgno, 6121 nOld>=2 ? apOld[1]->pgno : 0, 6122 nOld>=3 ? apOld[2]->pgno : 0 6123 )); 6124 6125 /* 6126 ** Allocate k new pages. Reuse old pages where possible. 6127 */ 6128 if( apOld[0]->pgno<=1 ){ 6129 rc = SQLITE_CORRUPT_BKPT; 6130 goto balance_cleanup; 6131 } 6132 pageFlags = apOld[0]->aData[0]; 6133 for(i=0; i<k; i++){ 6134 MemPage *pNew; 6135 if( i<nOld ){ 6136 pNew = apNew[i] = apOld[i]; 6137 apOld[i] = 0; 6138 rc = sqlite3PagerWrite(pNew->pDbPage); 6139 nNew++; 6140 if( rc ) goto balance_cleanup; 6141 }else{ 6142 assert( i>0 ); 6143 rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0); 6144 if( rc ) goto balance_cleanup; 6145 apNew[i] = pNew; 6146 nNew++; 6147 6148 /* Set the pointer-map entry for the new sibling page. */ 6149 if( ISAUTOVACUUM ){ 6150 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); 6151 if( rc!=SQLITE_OK ){ 6152 goto balance_cleanup; 6153 } 6154 } 6155 } 6156 } 6157 6158 /* Free any old pages that were not reused as new pages. 6159 */ 6160 while( i<nOld ){ 6161 freePage(apOld[i], &rc); 6162 if( rc ) goto balance_cleanup; 6163 releasePage(apOld[i]); 6164 apOld[i] = 0; 6165 i++; 6166 } 6167 6168 /* 6169 ** Put the new pages in accending order. This helps to 6170 ** keep entries in the disk file in order so that a scan 6171 ** of the table is a linear scan through the file. That 6172 ** in turn helps the operating system to deliver pages 6173 ** from the disk more rapidly. 6174 ** 6175 ** An O(n^2) insertion sort algorithm is used, but since 6176 ** n is never more than NB (a small constant), that should 6177 ** not be a problem. 6178 ** 6179 ** When NB==3, this one optimization makes the database 6180 ** about 25% faster for large insertions and deletions. 6181 */ 6182 for(i=0; i<k-1; i++){ 6183 int minV = apNew[i]->pgno; 6184 int minI = i; 6185 for(j=i+1; j<k; j++){ 6186 if( apNew[j]->pgno<(unsigned)minV ){ 6187 minI = j; 6188 minV = apNew[j]->pgno; 6189 } 6190 } 6191 if( minI>i ){ 6192 MemPage *pT; 6193 pT = apNew[i]; 6194 apNew[i] = apNew[minI]; 6195 apNew[minI] = pT; 6196 } 6197 } 6198 TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", 6199 apNew[0]->pgno, szNew[0], 6200 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, 6201 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, 6202 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, 6203 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0)); 6204 6205 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 6206 put4byte(pRight, apNew[nNew-1]->pgno); 6207 6208 /* 6209 ** Evenly distribute the data in apCell[] across the new pages. 6210 ** Insert divider cells into pParent as necessary. 6211 */ 6212 j = 0; 6213 for(i=0; i<nNew; i++){ 6214 /* Assemble the new sibling page. */ 6215 MemPage *pNew = apNew[i]; 6216 assert( j<nMaxCells ); 6217 zeroPage(pNew, pageFlags); 6218 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); 6219 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); 6220 assert( pNew->nOverflow==0 ); 6221 6222 j = cntNew[i]; 6223 6224 /* If the sibling page assembled above was not the right-most sibling, 6225 ** insert a divider cell into the parent page. 6226 */ 6227 assert( i<nNew-1 || j==nCell ); 6228 if( j<nCell ){ 6229 u8 *pCell; 6230 u8 *pTemp; 6231 int sz; 6232 6233 assert( j<nMaxCells ); 6234 pCell = apCell[j]; 6235 sz = szCell[j] + leafCorrection; 6236 pTemp = &aOvflSpace[iOvflSpace]; 6237 if( !pNew->leaf ){ 6238 memcpy(&pNew->aData[8], pCell, 4); 6239 }else if( leafData ){ 6240 /* If the tree is a leaf-data tree, and the siblings are leaves, 6241 ** then there is no divider cell in apCell[]. Instead, the divider 6242 ** cell consists of the integer key for the right-most cell of 6243 ** the sibling-page assembled above only. 6244 */ 6245 CellInfo info; 6246 j--; 6247 btreeParseCellPtr(pNew, apCell[j], &info); 6248 pCell = pTemp; 6249 sz = 4 + putVarint(&pCell[4], info.nKey); 6250 pTemp = 0; 6251 }else{ 6252 pCell -= 4; 6253 /* Obscure case for non-leaf-data trees: If the cell at pCell was 6254 ** previously stored on a leaf node, and its reported size was 4 6255 ** bytes, then it may actually be smaller than this 6256 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of 6257 ** any cell). But it is important to pass the correct size to 6258 ** insertCell(), so reparse the cell now. 6259 ** 6260 ** Note that this can never happen in an SQLite data file, as all 6261 ** cells are at least 4 bytes. It only happens in b-trees used 6262 ** to evaluate "IN (SELECT ...)" and similar clauses. 6263 */ 6264 if( szCell[j]==4 ){ 6265 assert(leafCorrection==4); 6266 sz = cellSizePtr(pParent, pCell); 6267 } 6268 } 6269 iOvflSpace += sz; 6270 assert( sz<=pBt->maxLocal+23 ); 6271 assert( iOvflSpace <= (int)pBt->pageSize ); 6272 insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc); 6273 if( rc!=SQLITE_OK ) goto balance_cleanup; 6274 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 6275 6276 j++; 6277 nxDiv++; 6278 } 6279 } 6280 assert( j==nCell ); 6281 assert( nOld>0 ); 6282 assert( nNew>0 ); 6283 if( (pageFlags & PTF_LEAF)==0 ){ 6284 u8 *zChild = &apCopy[nOld-1]->aData[8]; 6285 memcpy(&apNew[nNew-1]->aData[8], zChild, 4); 6286 } 6287 6288 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ 6289 /* The root page of the b-tree now contains no cells. The only sibling 6290 ** page is the right-child of the parent. Copy the contents of the 6291 ** child page into the parent, decreasing the overall height of the 6292 ** b-tree structure by one. This is described as the "balance-shallower" 6293 ** sub-algorithm in some documentation. 6294 ** 6295 ** If this is an auto-vacuum database, the call to copyNodeContent() 6296 ** sets all pointer-map entries corresponding to database image pages 6297 ** for which the pointer is stored within the content being copied. 6298 ** 6299 ** The second assert below verifies that the child page is defragmented 6300 ** (it must be, as it was just reconstructed using assemblePage()). This 6301 ** is important if the parent page happens to be page 1 of the database 6302 ** image. */ 6303 assert( nNew==1 ); 6304 assert( apNew[0]->nFree == 6305 (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) 6306 ); 6307 copyNodeContent(apNew[0], pParent, &rc); 6308 freePage(apNew[0], &rc); 6309 }else if( ISAUTOVACUUM ){ 6310 /* Fix the pointer-map entries for all the cells that were shifted around. 6311 ** There are several different types of pointer-map entries that need to 6312 ** be dealt with by this routine. Some of these have been set already, but 6313 ** many have not. The following is a summary: 6314 ** 6315 ** 1) The entries associated with new sibling pages that were not 6316 ** siblings when this function was called. These have already 6317 ** been set. We don't need to worry about old siblings that were 6318 ** moved to the free-list - the freePage() code has taken care 6319 ** of those. 6320 ** 6321 ** 2) The pointer-map entries associated with the first overflow 6322 ** page in any overflow chains used by new divider cells. These 6323 ** have also already been taken care of by the insertCell() code. 6324 ** 6325 ** 3) If the sibling pages are not leaves, then the child pages of 6326 ** cells stored on the sibling pages may need to be updated. 6327 ** 6328 ** 4) If the sibling pages are not internal intkey nodes, then any 6329 ** overflow pages used by these cells may need to be updated 6330 ** (internal intkey nodes never contain pointers to overflow pages). 6331 ** 6332 ** 5) If the sibling pages are not leaves, then the pointer-map 6333 ** entries for the right-child pages of each sibling may need 6334 ** to be updated. 6335 ** 6336 ** Cases 1 and 2 are dealt with above by other code. The next 6337 ** block deals with cases 3 and 4 and the one after that, case 5. Since 6338 ** setting a pointer map entry is a relatively expensive operation, this 6339 ** code only sets pointer map entries for child or overflow pages that have 6340 ** actually moved between pages. */ 6341 MemPage *pNew = apNew[0]; 6342 MemPage *pOld = apCopy[0]; 6343 int nOverflow = pOld->nOverflow; 6344 int iNextOld = pOld->nCell + nOverflow; 6345 int iOverflow = (nOverflow ? pOld->aOvfl[0].idx : -1); 6346 j = 0; /* Current 'old' sibling page */ 6347 k = 0; /* Current 'new' sibling page */ 6348 for(i=0; i<nCell; i++){ 6349 int isDivider = 0; 6350 while( i==iNextOld ){ 6351 /* Cell i is the cell immediately following the last cell on old 6352 ** sibling page j. If the siblings are not leaf pages of an 6353 ** intkey b-tree, then cell i was a divider cell. */ 6354 pOld = apCopy[++j]; 6355 iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow; 6356 if( pOld->nOverflow ){ 6357 nOverflow = pOld->nOverflow; 6358 iOverflow = i + !leafData + pOld->aOvfl[0].idx; 6359 } 6360 isDivider = !leafData; 6361 } 6362 6363 assert(nOverflow>0 || iOverflow<i ); 6364 assert(nOverflow<2 || pOld->aOvfl[0].idx==pOld->aOvfl[1].idx-1); 6365 assert(nOverflow<3 || pOld->aOvfl[1].idx==pOld->aOvfl[2].idx-1); 6366 if( i==iOverflow ){ 6367 isDivider = 1; 6368 if( (--nOverflow)>0 ){ 6369 iOverflow++; 6370 } 6371 } 6372 6373 if( i==cntNew[k] ){ 6374 /* Cell i is the cell immediately following the last cell on new 6375 ** sibling page k. If the siblings are not leaf pages of an 6376 ** intkey b-tree, then cell i is a divider cell. */ 6377 pNew = apNew[++k]; 6378 if( !leafData ) continue; 6379 } 6380 assert( j<nOld ); 6381 assert( k<nNew ); 6382 6383 /* If the cell was originally divider cell (and is not now) or 6384 ** an overflow cell, or if the cell was located on a different sibling 6385 ** page before the balancing, then the pointer map entries associated 6386 ** with any child or overflow pages need to be updated. */ 6387 if( isDivider || pOld->pgno!=pNew->pgno ){ 6388 if( !leafCorrection ){ 6389 ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc); 6390 } 6391 if( szCell[i]>pNew->minLocal ){ 6392 ptrmapPutOvflPtr(pNew, apCell[i], &rc); 6393 } 6394 } 6395 } 6396 6397 if( !leafCorrection ){ 6398 for(i=0; i<nNew; i++){ 6399 u32 key = get4byte(&apNew[i]->aData[8]); 6400 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); 6401 } 6402 } 6403 6404 #if 0 6405 /* The ptrmapCheckPages() contains assert() statements that verify that 6406 ** all pointer map pages are set correctly. This is helpful while 6407 ** debugging. This is usually disabled because a corrupt database may 6408 ** cause an assert() statement to fail. */ 6409 ptrmapCheckPages(apNew, nNew); 6410 ptrmapCheckPages(&pParent, 1); 6411 #endif 6412 } 6413 6414 assert( pParent->isInit ); 6415 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", 6416 nOld, nNew, nCell)); 6417 6418 /* 6419 ** Cleanup before returning. 6420 */ 6421 balance_cleanup: 6422 sqlite3ScratchFree(apCell); 6423 for(i=0; i<nOld; i++){ 6424 releasePage(apOld[i]); 6425 } 6426 for(i=0; i<nNew; i++){ 6427 releasePage(apNew[i]); 6428 } 6429 6430 return rc; 6431 } 6432 6433 6434 /* 6435 ** This function is called when the root page of a b-tree structure is 6436 ** overfull (has one or more overflow pages). 6437 ** 6438 ** A new child page is allocated and the contents of the current root 6439 ** page, including overflow cells, are copied into the child. The root 6440 ** page is then overwritten to make it an empty page with the right-child 6441 ** pointer pointing to the new page. 6442 ** 6443 ** Before returning, all pointer-map entries corresponding to pages 6444 ** that the new child-page now contains pointers to are updated. The 6445 ** entry corresponding to the new right-child pointer of the root 6446 ** page is also updated. 6447 ** 6448 ** If successful, *ppChild is set to contain a reference to the child 6449 ** page and SQLITE_OK is returned. In this case the caller is required 6450 ** to call releasePage() on *ppChild exactly once. If an error occurs, 6451 ** an error code is returned and *ppChild is set to 0. 6452 */ 6453 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ 6454 int rc; /* Return value from subprocedures */ 6455 MemPage *pChild = 0; /* Pointer to a new child page */ 6456 Pgno pgnoChild = 0; /* Page number of the new child page */ 6457 BtShared *pBt = pRoot->pBt; /* The BTree */ 6458 6459 assert( pRoot->nOverflow>0 ); 6460 assert( sqlite3_mutex_held(pBt->mutex) ); 6461 6462 /* Make pRoot, the root page of the b-tree, writable. Allocate a new 6463 ** page that will become the new right-child of pPage. Copy the contents 6464 ** of the node stored on pRoot into the new child page. 6465 */ 6466 rc = sqlite3PagerWrite(pRoot->pDbPage); 6467 if( rc==SQLITE_OK ){ 6468 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); 6469 copyNodeContent(pRoot, pChild, &rc); 6470 if( ISAUTOVACUUM ){ 6471 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); 6472 } 6473 } 6474 if( rc ){ 6475 *ppChild = 0; 6476 releasePage(pChild); 6477 return rc; 6478 } 6479 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 6480 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 6481 assert( pChild->nCell==pRoot->nCell ); 6482 6483 TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno)); 6484 6485 /* Copy the overflow cells from pRoot to pChild */ 6486 memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0])); 6487 pChild->nOverflow = pRoot->nOverflow; 6488 6489 /* Zero the contents of pRoot. Then install pChild as the right-child. */ 6490 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); 6491 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); 6492 6493 *ppChild = pChild; 6494 return SQLITE_OK; 6495 } 6496 6497 /* 6498 ** The page that pCur currently points to has just been modified in 6499 ** some way. This function figures out if this modification means the 6500 ** tree needs to be balanced, and if so calls the appropriate balancing 6501 ** routine. Balancing routines are: 6502 ** 6503 ** balance_quick() 6504 ** balance_deeper() 6505 ** balance_nonroot() 6506 */ 6507 static int balance(BtCursor *pCur){ 6508 int rc = SQLITE_OK; 6509 const int nMin = pCur->pBt->usableSize * 2 / 3; 6510 u8 aBalanceQuickSpace[13]; 6511 u8 *pFree = 0; 6512 6513 TESTONLY( int balance_quick_called = 0 ); 6514 TESTONLY( int balance_deeper_called = 0 ); 6515 6516 do { 6517 int iPage = pCur->iPage; 6518 MemPage *pPage = pCur->apPage[iPage]; 6519 6520 if( iPage==0 ){ 6521 if( pPage->nOverflow ){ 6522 /* The root page of the b-tree is overfull. In this case call the 6523 ** balance_deeper() function to create a new child for the root-page 6524 ** and copy the current contents of the root-page to it. The 6525 ** next iteration of the do-loop will balance the child page. 6526 */ 6527 assert( (balance_deeper_called++)==0 ); 6528 rc = balance_deeper(pPage, &pCur->apPage[1]); 6529 if( rc==SQLITE_OK ){ 6530 pCur->iPage = 1; 6531 pCur->aiIdx[0] = 0; 6532 pCur->aiIdx[1] = 0; 6533 assert( pCur->apPage[1]->nOverflow ); 6534 } 6535 }else{ 6536 break; 6537 } 6538 }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){ 6539 break; 6540 }else{ 6541 MemPage * const pParent = pCur->apPage[iPage-1]; 6542 int const iIdx = pCur->aiIdx[iPage-1]; 6543 6544 rc = sqlite3PagerWrite(pParent->pDbPage); 6545 if( rc==SQLITE_OK ){ 6546 #ifndef SQLITE_OMIT_QUICKBALANCE 6547 if( pPage->hasData 6548 && pPage->nOverflow==1 6549 && pPage->aOvfl[0].idx==pPage->nCell 6550 && pParent->pgno!=1 6551 && pParent->nCell==iIdx 6552 ){ 6553 /* Call balance_quick() to create a new sibling of pPage on which 6554 ** to store the overflow cell. balance_quick() inserts a new cell 6555 ** into pParent, which may cause pParent overflow. If this 6556 ** happens, the next interation of the do-loop will balance pParent 6557 ** use either balance_nonroot() or balance_deeper(). Until this 6558 ** happens, the overflow cell is stored in the aBalanceQuickSpace[] 6559 ** buffer. 6560 ** 6561 ** The purpose of the following assert() is to check that only a 6562 ** single call to balance_quick() is made for each call to this 6563 ** function. If this were not verified, a subtle bug involving reuse 6564 ** of the aBalanceQuickSpace[] might sneak in. 6565 */ 6566 assert( (balance_quick_called++)==0 ); 6567 rc = balance_quick(pParent, pPage, aBalanceQuickSpace); 6568 }else 6569 #endif 6570 { 6571 /* In this case, call balance_nonroot() to redistribute cells 6572 ** between pPage and up to 2 of its sibling pages. This involves 6573 ** modifying the contents of pParent, which may cause pParent to 6574 ** become overfull or underfull. The next iteration of the do-loop 6575 ** will balance the parent page to correct this. 6576 ** 6577 ** If the parent page becomes overfull, the overflow cell or cells 6578 ** are stored in the pSpace buffer allocated immediately below. 6579 ** A subsequent iteration of the do-loop will deal with this by 6580 ** calling balance_nonroot() (balance_deeper() may be called first, 6581 ** but it doesn't deal with overflow cells - just moves them to a 6582 ** different page). Once this subsequent call to balance_nonroot() 6583 ** has completed, it is safe to release the pSpace buffer used by 6584 ** the previous call, as the overflow cell data will have been 6585 ** copied either into the body of a database page or into the new 6586 ** pSpace buffer passed to the latter call to balance_nonroot(). 6587 */ 6588 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); 6589 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1); 6590 if( pFree ){ 6591 /* If pFree is not NULL, it points to the pSpace buffer used 6592 ** by a previous call to balance_nonroot(). Its contents are 6593 ** now stored either on real database pages or within the 6594 ** new pSpace buffer, so it may be safely freed here. */ 6595 sqlite3PageFree(pFree); 6596 } 6597 6598 /* The pSpace buffer will be freed after the next call to 6599 ** balance_nonroot(), or just before this function returns, whichever 6600 ** comes first. */ 6601 pFree = pSpace; 6602 } 6603 } 6604 6605 pPage->nOverflow = 0; 6606 6607 /* The next iteration of the do-loop balances the parent page. */ 6608 releasePage(pPage); 6609 pCur->iPage--; 6610 } 6611 }while( rc==SQLITE_OK ); 6612 6613 if( pFree ){ 6614 sqlite3PageFree(pFree); 6615 } 6616 return rc; 6617 } 6618 6619 6620 /* 6621 ** Insert a new record into the BTree. The key is given by (pKey,nKey) 6622 ** and the data is given by (pData,nData). The cursor is used only to 6623 ** define what table the record should be inserted into. The cursor 6624 ** is left pointing at a random location. 6625 ** 6626 ** For an INTKEY table, only the nKey value of the key is used. pKey is 6627 ** ignored. For a ZERODATA table, the pData and nData are both ignored. 6628 ** 6629 ** If the seekResult parameter is non-zero, then a successful call to 6630 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already 6631 ** been performed. seekResult is the search result returned (a negative 6632 ** number if pCur points at an entry that is smaller than (pKey, nKey), or 6633 ** a positive value if pCur points at an etry that is larger than 6634 ** (pKey, nKey)). 6635 ** 6636 ** If the seekResult parameter is non-zero, then the caller guarantees that 6637 ** cursor pCur is pointing at the existing copy of a row that is to be 6638 ** overwritten. If the seekResult parameter is 0, then cursor pCur may 6639 ** point to any entry or to no entry at all and so this function has to seek 6640 ** the cursor before the new key can be inserted. 6641 */ 6642 int sqlite3BtreeInsert( 6643 BtCursor *pCur, /* Insert data into the table of this cursor */ 6644 const void *pKey, i64 nKey, /* The key of the new record */ 6645 const void *pData, int nData, /* The data of the new record */ 6646 int nZero, /* Number of extra 0 bytes to append to data */ 6647 int appendBias, /* True if this is likely an append */ 6648 int seekResult /* Result of prior MovetoUnpacked() call */ 6649 ){ 6650 int rc; 6651 int loc = seekResult; /* -1: before desired location +1: after */ 6652 int szNew = 0; 6653 int idx; 6654 MemPage *pPage; 6655 Btree *p = pCur->pBtree; 6656 BtShared *pBt = p->pBt; 6657 unsigned char *oldCell; 6658 unsigned char *newCell = 0; 6659 6660 if( pCur->eState==CURSOR_FAULT ){ 6661 assert( pCur->skipNext!=SQLITE_OK ); 6662 return pCur->skipNext; 6663 } 6664 6665 assert( cursorHoldsMutex(pCur) ); 6666 assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE && !pBt->readOnly ); 6667 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 6668 6669 /* Assert that the caller has been consistent. If this cursor was opened 6670 ** expecting an index b-tree, then the caller should be inserting blob 6671 ** keys with no associated data. If the cursor was opened expecting an 6672 ** intkey table, the caller should be inserting integer keys with a 6673 ** blob of associated data. */ 6674 assert( (pKey==0)==(pCur->pKeyInfo==0) ); 6675 6676 /* If this is an insert into a table b-tree, invalidate any incrblob 6677 ** cursors open on the row being replaced (assuming this is a replace 6678 ** operation - if it is not, the following is a no-op). */ 6679 if( pCur->pKeyInfo==0 ){ 6680 invalidateIncrblobCursors(p, nKey, 0); 6681 } 6682 6683 /* Save the positions of any other cursors open on this table. 6684 ** 6685 ** In some cases, the call to btreeMoveto() below is a no-op. For 6686 ** example, when inserting data into a table with auto-generated integer 6687 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 6688 ** integer key to use. It then calls this function to actually insert the 6689 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes 6690 ** that the cursor is already where it needs to be and returns without 6691 ** doing any work. To avoid thwarting these optimizations, it is important 6692 ** not to clear the cursor here. 6693 */ 6694 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 6695 if( rc ) return rc; 6696 if( !loc ){ 6697 rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc); 6698 if( rc ) return rc; 6699 } 6700 assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) ); 6701 6702 pPage = pCur->apPage[pCur->iPage]; 6703 assert( pPage->intKey || nKey>=0 ); 6704 assert( pPage->leaf || !pPage->intKey ); 6705 6706 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", 6707 pCur->pgnoRoot, nKey, nData, pPage->pgno, 6708 loc==0 ? "overwrite" : "new entry")); 6709 assert( pPage->isInit ); 6710 allocateTempSpace(pBt); 6711 newCell = pBt->pTmpSpace; 6712 if( newCell==0 ) return SQLITE_NOMEM; 6713 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); 6714 if( rc ) goto end_insert; 6715 assert( szNew==cellSizePtr(pPage, newCell) ); 6716 assert( szNew <= MX_CELL_SIZE(pBt) ); 6717 idx = pCur->aiIdx[pCur->iPage]; 6718 if( loc==0 ){ 6719 u16 szOld; 6720 assert( idx<pPage->nCell ); 6721 rc = sqlite3PagerWrite(pPage->pDbPage); 6722 if( rc ){ 6723 goto end_insert; 6724 } 6725 oldCell = findCell(pPage, idx); 6726 if( !pPage->leaf ){ 6727 memcpy(newCell, oldCell, 4); 6728 } 6729 szOld = cellSizePtr(pPage, oldCell); 6730 rc = clearCell(pPage, oldCell); 6731 dropCell(pPage, idx, szOld, &rc); 6732 if( rc ) goto end_insert; 6733 }else if( loc<0 && pPage->nCell>0 ){ 6734 assert( pPage->leaf ); 6735 idx = ++pCur->aiIdx[pCur->iPage]; 6736 }else{ 6737 assert( pPage->leaf ); 6738 } 6739 insertCell(pPage, idx, newCell, szNew, 0, 0, &rc); 6740 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); 6741 6742 /* If no error has occured and pPage has an overflow cell, call balance() 6743 ** to redistribute the cells within the tree. Since balance() may move 6744 ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey 6745 ** variables. 6746 ** 6747 ** Previous versions of SQLite called moveToRoot() to move the cursor 6748 ** back to the root page as balance() used to invalidate the contents 6749 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that, 6750 ** set the cursor state to "invalid". This makes common insert operations 6751 ** slightly faster. 6752 ** 6753 ** There is a subtle but important optimization here too. When inserting 6754 ** multiple records into an intkey b-tree using a single cursor (as can 6755 ** happen while processing an "INSERT INTO ... SELECT" statement), it 6756 ** is advantageous to leave the cursor pointing to the last entry in 6757 ** the b-tree if possible. If the cursor is left pointing to the last 6758 ** entry in the table, and the next row inserted has an integer key 6759 ** larger than the largest existing key, it is possible to insert the 6760 ** row without seeking the cursor. This can be a big performance boost. 6761 */ 6762 pCur->info.nSize = 0; 6763 pCur->validNKey = 0; 6764 if( rc==SQLITE_OK && pPage->nOverflow ){ 6765 rc = balance(pCur); 6766 6767 /* Must make sure nOverflow is reset to zero even if the balance() 6768 ** fails. Internal data structure corruption will result otherwise. 6769 ** Also, set the cursor state to invalid. This stops saveCursorPosition() 6770 ** from trying to save the current position of the cursor. */ 6771 pCur->apPage[pCur->iPage]->nOverflow = 0; 6772 pCur->eState = CURSOR_INVALID; 6773 } 6774 assert( pCur->apPage[pCur->iPage]->nOverflow==0 ); 6775 6776 end_insert: 6777 return rc; 6778 } 6779 6780 /* 6781 ** Delete the entry that the cursor is pointing to. The cursor 6782 ** is left pointing at a arbitrary location. 6783 */ 6784 int sqlite3BtreeDelete(BtCursor *pCur){ 6785 Btree *p = pCur->pBtree; 6786 BtShared *pBt = p->pBt; 6787 int rc; /* Return code */ 6788 MemPage *pPage; /* Page to delete cell from */ 6789 unsigned char *pCell; /* Pointer to cell to delete */ 6790 int iCellIdx; /* Index of cell to delete */ 6791 int iCellDepth; /* Depth of node containing pCell */ 6792 6793 assert( cursorHoldsMutex(pCur) ); 6794 assert( pBt->inTransaction==TRANS_WRITE ); 6795 assert( !pBt->readOnly ); 6796 assert( pCur->wrFlag ); 6797 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 6798 assert( !hasReadConflicts(p, pCur->pgnoRoot) ); 6799 6800 if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell) 6801 || NEVER(pCur->eState!=CURSOR_VALID) 6802 ){ 6803 return SQLITE_ERROR; /* Something has gone awry. */ 6804 } 6805 6806 /* If this is a delete operation to remove a row from a table b-tree, 6807 ** invalidate any incrblob cursors open on the row being deleted. */ 6808 if( pCur->pKeyInfo==0 ){ 6809 invalidateIncrblobCursors(p, pCur->info.nKey, 0); 6810 } 6811 6812 iCellDepth = pCur->iPage; 6813 iCellIdx = pCur->aiIdx[iCellDepth]; 6814 pPage = pCur->apPage[iCellDepth]; 6815 pCell = findCell(pPage, iCellIdx); 6816 6817 /* If the page containing the entry to delete is not a leaf page, move 6818 ** the cursor to the largest entry in the tree that is smaller than 6819 ** the entry being deleted. This cell will replace the cell being deleted 6820 ** from the internal node. The 'previous' entry is used for this instead 6821 ** of the 'next' entry, as the previous entry is always a part of the 6822 ** sub-tree headed by the child page of the cell being deleted. This makes 6823 ** balancing the tree following the delete operation easier. */ 6824 if( !pPage->leaf ){ 6825 int notUsed; 6826 rc = sqlite3BtreePrevious(pCur, ¬Used); 6827 if( rc ) return rc; 6828 } 6829 6830 /* Save the positions of any other cursors open on this table before 6831 ** making any modifications. Make the page containing the entry to be 6832 ** deleted writable. Then free any overflow pages associated with the 6833 ** entry and finally remove the cell itself from within the page. 6834 */ 6835 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 6836 if( rc ) return rc; 6837 rc = sqlite3PagerWrite(pPage->pDbPage); 6838 if( rc ) return rc; 6839 rc = clearCell(pPage, pCell); 6840 dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc); 6841 if( rc ) return rc; 6842 6843 /* If the cell deleted was not located on a leaf page, then the cursor 6844 ** is currently pointing to the largest entry in the sub-tree headed 6845 ** by the child-page of the cell that was just deleted from an internal 6846 ** node. The cell from the leaf node needs to be moved to the internal 6847 ** node to replace the deleted cell. */ 6848 if( !pPage->leaf ){ 6849 MemPage *pLeaf = pCur->apPage[pCur->iPage]; 6850 int nCell; 6851 Pgno n = pCur->apPage[iCellDepth+1]->pgno; 6852 unsigned char *pTmp; 6853 6854 pCell = findCell(pLeaf, pLeaf->nCell-1); 6855 nCell = cellSizePtr(pLeaf, pCell); 6856 assert( MX_CELL_SIZE(pBt) >= nCell ); 6857 6858 allocateTempSpace(pBt); 6859 pTmp = pBt->pTmpSpace; 6860 6861 rc = sqlite3PagerWrite(pLeaf->pDbPage); 6862 insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc); 6863 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); 6864 if( rc ) return rc; 6865 } 6866 6867 /* Balance the tree. If the entry deleted was located on a leaf page, 6868 ** then the cursor still points to that page. In this case the first 6869 ** call to balance() repairs the tree, and the if(...) condition is 6870 ** never true. 6871 ** 6872 ** Otherwise, if the entry deleted was on an internal node page, then 6873 ** pCur is pointing to the leaf page from which a cell was removed to 6874 ** replace the cell deleted from the internal node. This is slightly 6875 ** tricky as the leaf node may be underfull, and the internal node may 6876 ** be either under or overfull. In this case run the balancing algorithm 6877 ** on the leaf node first. If the balance proceeds far enough up the 6878 ** tree that we can be sure that any problem in the internal node has 6879 ** been corrected, so be it. Otherwise, after balancing the leaf node, 6880 ** walk the cursor up the tree to the internal node and balance it as 6881 ** well. */ 6882 rc = balance(pCur); 6883 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ 6884 while( pCur->iPage>iCellDepth ){ 6885 releasePage(pCur->apPage[pCur->iPage--]); 6886 } 6887 rc = balance(pCur); 6888 } 6889 6890 if( rc==SQLITE_OK ){ 6891 moveToRoot(pCur); 6892 } 6893 return rc; 6894 } 6895 6896 /* 6897 ** Create a new BTree table. Write into *piTable the page 6898 ** number for the root page of the new table. 6899 ** 6900 ** The type of type is determined by the flags parameter. Only the 6901 ** following values of flags are currently in use. Other values for 6902 ** flags might not work: 6903 ** 6904 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 6905 ** BTREE_ZERODATA Used for SQL indices 6906 */ 6907 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){ 6908 BtShared *pBt = p->pBt; 6909 MemPage *pRoot; 6910 Pgno pgnoRoot; 6911 int rc; 6912 int ptfFlags; /* Page-type flage for the root page of new table */ 6913 6914 assert( sqlite3BtreeHoldsMutex(p) ); 6915 assert( pBt->inTransaction==TRANS_WRITE ); 6916 assert( !pBt->readOnly ); 6917 6918 #ifdef SQLITE_OMIT_AUTOVACUUM 6919 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 6920 if( rc ){ 6921 return rc; 6922 } 6923 #else 6924 if( pBt->autoVacuum ){ 6925 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 6926 MemPage *pPageMove; /* The page to move to. */ 6927 6928 /* Creating a new table may probably require moving an existing database 6929 ** to make room for the new tables root page. In case this page turns 6930 ** out to be an overflow page, delete all overflow page-map caches 6931 ** held by open cursors. 6932 */ 6933 invalidateAllOverflowCache(pBt); 6934 6935 /* Read the value of meta[3] from the database to determine where the 6936 ** root page of the new table should go. meta[3] is the largest root-page 6937 ** created so far, so the new root-page is (meta[3]+1). 6938 */ 6939 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); 6940 pgnoRoot++; 6941 6942 /* The new root-page may not be allocated on a pointer-map page, or the 6943 ** PENDING_BYTE page. 6944 */ 6945 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 6946 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 6947 pgnoRoot++; 6948 } 6949 assert( pgnoRoot>=3 ); 6950 6951 /* Allocate a page. The page that currently resides at pgnoRoot will 6952 ** be moved to the allocated page (unless the allocated page happens 6953 ** to reside at pgnoRoot). 6954 */ 6955 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1); 6956 if( rc!=SQLITE_OK ){ 6957 return rc; 6958 } 6959 6960 if( pgnoMove!=pgnoRoot ){ 6961 /* pgnoRoot is the page that will be used for the root-page of 6962 ** the new table (assuming an error did not occur). But we were 6963 ** allocated pgnoMove. If required (i.e. if it was not allocated 6964 ** by extending the file), the current page at position pgnoMove 6965 ** is already journaled. 6966 */ 6967 u8 eType = 0; 6968 Pgno iPtrPage = 0; 6969 6970 releasePage(pPageMove); 6971 6972 /* Move the page currently at pgnoRoot to pgnoMove. */ 6973 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 6974 if( rc!=SQLITE_OK ){ 6975 return rc; 6976 } 6977 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 6978 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 6979 rc = SQLITE_CORRUPT_BKPT; 6980 } 6981 if( rc!=SQLITE_OK ){ 6982 releasePage(pRoot); 6983 return rc; 6984 } 6985 assert( eType!=PTRMAP_ROOTPAGE ); 6986 assert( eType!=PTRMAP_FREEPAGE ); 6987 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); 6988 releasePage(pRoot); 6989 6990 /* Obtain the page at pgnoRoot */ 6991 if( rc!=SQLITE_OK ){ 6992 return rc; 6993 } 6994 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 6995 if( rc!=SQLITE_OK ){ 6996 return rc; 6997 } 6998 rc = sqlite3PagerWrite(pRoot->pDbPage); 6999 if( rc!=SQLITE_OK ){ 7000 releasePage(pRoot); 7001 return rc; 7002 } 7003 }else{ 7004 pRoot = pPageMove; 7005 } 7006 7007 /* Update the pointer-map and meta-data with the new root-page number. */ 7008 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); 7009 if( rc ){ 7010 releasePage(pRoot); 7011 return rc; 7012 } 7013 7014 /* When the new root page was allocated, page 1 was made writable in 7015 ** order either to increase the database filesize, or to decrement the 7016 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail. 7017 */ 7018 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) ); 7019 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 7020 if( NEVER(rc) ){ 7021 releasePage(pRoot); 7022 return rc; 7023 } 7024 7025 }else{ 7026 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 7027 if( rc ) return rc; 7028 } 7029 #endif 7030 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 7031 if( createTabFlags & BTREE_INTKEY ){ 7032 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF; 7033 }else{ 7034 ptfFlags = PTF_ZERODATA | PTF_LEAF; 7035 } 7036 zeroPage(pRoot, ptfFlags); 7037 sqlite3PagerUnref(pRoot->pDbPage); 7038 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 ); 7039 *piTable = (int)pgnoRoot; 7040 return SQLITE_OK; 7041 } 7042 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ 7043 int rc; 7044 sqlite3BtreeEnter(p); 7045 rc = btreeCreateTable(p, piTable, flags); 7046 sqlite3BtreeLeave(p); 7047 return rc; 7048 } 7049 7050 /* 7051 ** Erase the given database page and all its children. Return 7052 ** the page to the freelist. 7053 */ 7054 static int clearDatabasePage( 7055 BtShared *pBt, /* The BTree that contains the table */ 7056 Pgno pgno, /* Page number to clear */ 7057 int freePageFlag, /* Deallocate page if true */ 7058 int *pnChange /* Add number of Cells freed to this counter */ 7059 ){ 7060 MemPage *pPage; 7061 int rc; 7062 unsigned char *pCell; 7063 int i; 7064 7065 assert( sqlite3_mutex_held(pBt->mutex) ); 7066 if( pgno>btreePagecount(pBt) ){ 7067 return SQLITE_CORRUPT_BKPT; 7068 } 7069 7070 rc = getAndInitPage(pBt, pgno, &pPage); 7071 if( rc ) return rc; 7072 for(i=0; i<pPage->nCell; i++){ 7073 pCell = findCell(pPage, i); 7074 if( !pPage->leaf ){ 7075 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); 7076 if( rc ) goto cleardatabasepage_out; 7077 } 7078 rc = clearCell(pPage, pCell); 7079 if( rc ) goto cleardatabasepage_out; 7080 } 7081 if( !pPage->leaf ){ 7082 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange); 7083 if( rc ) goto cleardatabasepage_out; 7084 }else if( pnChange ){ 7085 assert( pPage->intKey ); 7086 *pnChange += pPage->nCell; 7087 } 7088 if( freePageFlag ){ 7089 freePage(pPage, &rc); 7090 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 7091 zeroPage(pPage, pPage->aData[0] | PTF_LEAF); 7092 } 7093 7094 cleardatabasepage_out: 7095 releasePage(pPage); 7096 return rc; 7097 } 7098 7099 /* 7100 ** Delete all information from a single table in the database. iTable is 7101 ** the page number of the root of the table. After this routine returns, 7102 ** the root page is empty, but still exists. 7103 ** 7104 ** This routine will fail with SQLITE_LOCKED if there are any open 7105 ** read cursors on the table. Open write cursors are moved to the 7106 ** root of the table. 7107 ** 7108 ** If pnChange is not NULL, then table iTable must be an intkey table. The 7109 ** integer value pointed to by pnChange is incremented by the number of 7110 ** entries in the table. 7111 */ 7112 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){ 7113 int rc; 7114 BtShared *pBt = p->pBt; 7115 sqlite3BtreeEnter(p); 7116 assert( p->inTrans==TRANS_WRITE ); 7117 7118 /* Invalidate all incrblob cursors open on table iTable (assuming iTable 7119 ** is the root of a table b-tree - if it is not, the following call is 7120 ** a no-op). */ 7121 invalidateIncrblobCursors(p, 0, 1); 7122 7123 rc = saveAllCursors(pBt, (Pgno)iTable, 0); 7124 if( SQLITE_OK==rc ){ 7125 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); 7126 } 7127 sqlite3BtreeLeave(p); 7128 return rc; 7129 } 7130 7131 /* 7132 ** Erase all information in a table and add the root of the table to 7133 ** the freelist. Except, the root of the principle table (the one on 7134 ** page 1) is never added to the freelist. 7135 ** 7136 ** This routine will fail with SQLITE_LOCKED if there are any open 7137 ** cursors on the table. 7138 ** 7139 ** If AUTOVACUUM is enabled and the page at iTable is not the last 7140 ** root page in the database file, then the last root page 7141 ** in the database file is moved into the slot formerly occupied by 7142 ** iTable and that last slot formerly occupied by the last root page 7143 ** is added to the freelist instead of iTable. In this say, all 7144 ** root pages are kept at the beginning of the database file, which 7145 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 7146 ** page number that used to be the last root page in the file before 7147 ** the move. If no page gets moved, *piMoved is set to 0. 7148 ** The last root page is recorded in meta[3] and the value of 7149 ** meta[3] is updated by this procedure. 7150 */ 7151 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ 7152 int rc; 7153 MemPage *pPage = 0; 7154 BtShared *pBt = p->pBt; 7155 7156 assert( sqlite3BtreeHoldsMutex(p) ); 7157 assert( p->inTrans==TRANS_WRITE ); 7158 7159 /* It is illegal to drop a table if any cursors are open on the 7160 ** database. This is because in auto-vacuum mode the backend may 7161 ** need to move another root-page to fill a gap left by the deleted 7162 ** root page. If an open cursor was using this page a problem would 7163 ** occur. 7164 ** 7165 ** This error is caught long before control reaches this point. 7166 */ 7167 if( NEVER(pBt->pCursor) ){ 7168 sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db); 7169 return SQLITE_LOCKED_SHAREDCACHE; 7170 } 7171 7172 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 7173 if( rc ) return rc; 7174 rc = sqlite3BtreeClearTable(p, iTable, 0); 7175 if( rc ){ 7176 releasePage(pPage); 7177 return rc; 7178 } 7179 7180 *piMoved = 0; 7181 7182 if( iTable>1 ){ 7183 #ifdef SQLITE_OMIT_AUTOVACUUM 7184 freePage(pPage, &rc); 7185 releasePage(pPage); 7186 #else 7187 if( pBt->autoVacuum ){ 7188 Pgno maxRootPgno; 7189 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno); 7190 7191 if( iTable==maxRootPgno ){ 7192 /* If the table being dropped is the table with the largest root-page 7193 ** number in the database, put the root page on the free list. 7194 */ 7195 freePage(pPage, &rc); 7196 releasePage(pPage); 7197 if( rc!=SQLITE_OK ){ 7198 return rc; 7199 } 7200 }else{ 7201 /* The table being dropped does not have the largest root-page 7202 ** number in the database. So move the page that does into the 7203 ** gap left by the deleted root-page. 7204 */ 7205 MemPage *pMove; 7206 releasePage(pPage); 7207 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 7208 if( rc!=SQLITE_OK ){ 7209 return rc; 7210 } 7211 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); 7212 releasePage(pMove); 7213 if( rc!=SQLITE_OK ){ 7214 return rc; 7215 } 7216 pMove = 0; 7217 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 7218 freePage(pMove, &rc); 7219 releasePage(pMove); 7220 if( rc!=SQLITE_OK ){ 7221 return rc; 7222 } 7223 *piMoved = maxRootPgno; 7224 } 7225 7226 /* Set the new 'max-root-page' value in the database header. This 7227 ** is the old value less one, less one more if that happens to 7228 ** be a root-page number, less one again if that is the 7229 ** PENDING_BYTE_PAGE. 7230 */ 7231 maxRootPgno--; 7232 while( maxRootPgno==PENDING_BYTE_PAGE(pBt) 7233 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){ 7234 maxRootPgno--; 7235 } 7236 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 7237 7238 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 7239 }else{ 7240 freePage(pPage, &rc); 7241 releasePage(pPage); 7242 } 7243 #endif 7244 }else{ 7245 /* If sqlite3BtreeDropTable was called on page 1. 7246 ** This really never should happen except in a corrupt 7247 ** database. 7248 */ 7249 zeroPage(pPage, PTF_INTKEY|PTF_LEAF ); 7250 releasePage(pPage); 7251 } 7252 return rc; 7253 } 7254 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 7255 int rc; 7256 sqlite3BtreeEnter(p); 7257 rc = btreeDropTable(p, iTable, piMoved); 7258 sqlite3BtreeLeave(p); 7259 return rc; 7260 } 7261 7262 7263 /* 7264 ** This function may only be called if the b-tree connection already 7265 ** has a read or write transaction open on the database. 7266 ** 7267 ** Read the meta-information out of a database file. Meta[0] 7268 ** is the number of free pages currently in the database. Meta[1] 7269 ** through meta[15] are available for use by higher layers. Meta[0] 7270 ** is read-only, the others are read/write. 7271 ** 7272 ** The schema layer numbers meta values differently. At the schema 7273 ** layer (and the SetCookie and ReadCookie opcodes) the number of 7274 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 7275 */ 7276 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 7277 BtShared *pBt = p->pBt; 7278 7279 sqlite3BtreeEnter(p); 7280 assert( p->inTrans>TRANS_NONE ); 7281 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) ); 7282 assert( pBt->pPage1 ); 7283 assert( idx>=0 && idx<=15 ); 7284 7285 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); 7286 7287 /* If auto-vacuum is disabled in this build and this is an auto-vacuum 7288 ** database, mark the database as read-only. */ 7289 #ifdef SQLITE_OMIT_AUTOVACUUM 7290 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ) pBt->readOnly = 1; 7291 #endif 7292 7293 sqlite3BtreeLeave(p); 7294 } 7295 7296 /* 7297 ** Write meta-information back into the database. Meta[0] is 7298 ** read-only and may not be written. 7299 */ 7300 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 7301 BtShared *pBt = p->pBt; 7302 unsigned char *pP1; 7303 int rc; 7304 assert( idx>=1 && idx<=15 ); 7305 sqlite3BtreeEnter(p); 7306 assert( p->inTrans==TRANS_WRITE ); 7307 assert( pBt->pPage1!=0 ); 7308 pP1 = pBt->pPage1->aData; 7309 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 7310 if( rc==SQLITE_OK ){ 7311 put4byte(&pP1[36 + idx*4], iMeta); 7312 #ifndef SQLITE_OMIT_AUTOVACUUM 7313 if( idx==BTREE_INCR_VACUUM ){ 7314 assert( pBt->autoVacuum || iMeta==0 ); 7315 assert( iMeta==0 || iMeta==1 ); 7316 pBt->incrVacuum = (u8)iMeta; 7317 } 7318 #endif 7319 } 7320 sqlite3BtreeLeave(p); 7321 return rc; 7322 } 7323 7324 #ifndef SQLITE_OMIT_BTREECOUNT 7325 /* 7326 ** The first argument, pCur, is a cursor opened on some b-tree. Count the 7327 ** number of entries in the b-tree and write the result to *pnEntry. 7328 ** 7329 ** SQLITE_OK is returned if the operation is successfully executed. 7330 ** Otherwise, if an error is encountered (i.e. an IO error or database 7331 ** corruption) an SQLite error code is returned. 7332 */ 7333 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){ 7334 i64 nEntry = 0; /* Value to return in *pnEntry */ 7335 int rc; /* Return code */ 7336 rc = moveToRoot(pCur); 7337 7338 /* Unless an error occurs, the following loop runs one iteration for each 7339 ** page in the B-Tree structure (not including overflow pages). 7340 */ 7341 while( rc==SQLITE_OK ){ 7342 int iIdx; /* Index of child node in parent */ 7343 MemPage *pPage; /* Current page of the b-tree */ 7344 7345 /* If this is a leaf page or the tree is not an int-key tree, then 7346 ** this page contains countable entries. Increment the entry counter 7347 ** accordingly. 7348 */ 7349 pPage = pCur->apPage[pCur->iPage]; 7350 if( pPage->leaf || !pPage->intKey ){ 7351 nEntry += pPage->nCell; 7352 } 7353 7354 /* pPage is a leaf node. This loop navigates the cursor so that it 7355 ** points to the first interior cell that it points to the parent of 7356 ** the next page in the tree that has not yet been visited. The 7357 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell 7358 ** of the page, or to the number of cells in the page if the next page 7359 ** to visit is the right-child of its parent. 7360 ** 7361 ** If all pages in the tree have been visited, return SQLITE_OK to the 7362 ** caller. 7363 */ 7364 if( pPage->leaf ){ 7365 do { 7366 if( pCur->iPage==0 ){ 7367 /* All pages of the b-tree have been visited. Return successfully. */ 7368 *pnEntry = nEntry; 7369 return SQLITE_OK; 7370 } 7371 moveToParent(pCur); 7372 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell ); 7373 7374 pCur->aiIdx[pCur->iPage]++; 7375 pPage = pCur->apPage[pCur->iPage]; 7376 } 7377 7378 /* Descend to the child node of the cell that the cursor currently 7379 ** points at. This is the right-child if (iIdx==pPage->nCell). 7380 */ 7381 iIdx = pCur->aiIdx[pCur->iPage]; 7382 if( iIdx==pPage->nCell ){ 7383 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 7384 }else{ 7385 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); 7386 } 7387 } 7388 7389 /* An error has occurred. Return an error code. */ 7390 return rc; 7391 } 7392 #endif 7393 7394 /* 7395 ** Return the pager associated with a BTree. This routine is used for 7396 ** testing and debugging only. 7397 */ 7398 Pager *sqlite3BtreePager(Btree *p){ 7399 return p->pBt->pPager; 7400 } 7401 7402 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7403 /* 7404 ** Append a message to the error message string. 7405 */ 7406 static void checkAppendMsg( 7407 IntegrityCk *pCheck, 7408 char *zMsg1, 7409 const char *zFormat, 7410 ... 7411 ){ 7412 va_list ap; 7413 if( !pCheck->mxErr ) return; 7414 pCheck->mxErr--; 7415 pCheck->nErr++; 7416 va_start(ap, zFormat); 7417 if( pCheck->errMsg.nChar ){ 7418 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1); 7419 } 7420 if( zMsg1 ){ 7421 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1); 7422 } 7423 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap); 7424 va_end(ap); 7425 if( pCheck->errMsg.mallocFailed ){ 7426 pCheck->mallocFailed = 1; 7427 } 7428 } 7429 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7430 7431 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7432 /* 7433 ** Add 1 to the reference count for page iPage. If this is the second 7434 ** reference to the page, add an error message to pCheck->zErrMsg. 7435 ** Return 1 if there are 2 ore more references to the page and 0 if 7436 ** if this is the first reference to the page. 7437 ** 7438 ** Also check that the page number is in bounds. 7439 */ 7440 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){ 7441 if( iPage==0 ) return 1; 7442 if( iPage>pCheck->nPage ){ 7443 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage); 7444 return 1; 7445 } 7446 if( pCheck->anRef[iPage]==1 ){ 7447 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage); 7448 return 1; 7449 } 7450 return (pCheck->anRef[iPage]++)>1; 7451 } 7452 7453 #ifndef SQLITE_OMIT_AUTOVACUUM 7454 /* 7455 ** Check that the entry in the pointer-map for page iChild maps to 7456 ** page iParent, pointer type ptrType. If not, append an error message 7457 ** to pCheck. 7458 */ 7459 static void checkPtrmap( 7460 IntegrityCk *pCheck, /* Integrity check context */ 7461 Pgno iChild, /* Child page number */ 7462 u8 eType, /* Expected pointer map type */ 7463 Pgno iParent, /* Expected pointer map parent page number */ 7464 char *zContext /* Context description (used for error msg) */ 7465 ){ 7466 int rc; 7467 u8 ePtrmapType; 7468 Pgno iPtrmapParent; 7469 7470 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 7471 if( rc!=SQLITE_OK ){ 7472 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1; 7473 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild); 7474 return; 7475 } 7476 7477 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 7478 checkAppendMsg(pCheck, zContext, 7479 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 7480 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 7481 } 7482 } 7483 #endif 7484 7485 /* 7486 ** Check the integrity of the freelist or of an overflow page list. 7487 ** Verify that the number of pages on the list is N. 7488 */ 7489 static void checkList( 7490 IntegrityCk *pCheck, /* Integrity checking context */ 7491 int isFreeList, /* True for a freelist. False for overflow page list */ 7492 int iPage, /* Page number for first page in the list */ 7493 int N, /* Expected number of pages in the list */ 7494 char *zContext /* Context for error messages */ 7495 ){ 7496 int i; 7497 int expected = N; 7498 int iFirst = iPage; 7499 while( N-- > 0 && pCheck->mxErr ){ 7500 DbPage *pOvflPage; 7501 unsigned char *pOvflData; 7502 if( iPage<1 ){ 7503 checkAppendMsg(pCheck, zContext, 7504 "%d of %d pages missing from overflow list starting at %d", 7505 N+1, expected, iFirst); 7506 break; 7507 } 7508 if( checkRef(pCheck, iPage, zContext) ) break; 7509 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){ 7510 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage); 7511 break; 7512 } 7513 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 7514 if( isFreeList ){ 7515 int n = get4byte(&pOvflData[4]); 7516 #ifndef SQLITE_OMIT_AUTOVACUUM 7517 if( pCheck->pBt->autoVacuum ){ 7518 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext); 7519 } 7520 #endif 7521 if( n>(int)pCheck->pBt->usableSize/4-2 ){ 7522 checkAppendMsg(pCheck, zContext, 7523 "freelist leaf count too big on page %d", iPage); 7524 N--; 7525 }else{ 7526 for(i=0; i<n; i++){ 7527 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 7528 #ifndef SQLITE_OMIT_AUTOVACUUM 7529 if( pCheck->pBt->autoVacuum ){ 7530 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext); 7531 } 7532 #endif 7533 checkRef(pCheck, iFreePage, zContext); 7534 } 7535 N -= n; 7536 } 7537 } 7538 #ifndef SQLITE_OMIT_AUTOVACUUM 7539 else{ 7540 /* If this database supports auto-vacuum and iPage is not the last 7541 ** page in this overflow list, check that the pointer-map entry for 7542 ** the following page matches iPage. 7543 */ 7544 if( pCheck->pBt->autoVacuum && N>0 ){ 7545 i = get4byte(pOvflData); 7546 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext); 7547 } 7548 } 7549 #endif 7550 iPage = get4byte(pOvflData); 7551 sqlite3PagerUnref(pOvflPage); 7552 } 7553 } 7554 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7555 7556 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7557 /* 7558 ** Do various sanity checks on a single page of a tree. Return 7559 ** the tree depth. Root pages return 0. Parents of root pages 7560 ** return 1, and so forth. 7561 ** 7562 ** These checks are done: 7563 ** 7564 ** 1. Make sure that cells and freeblocks do not overlap 7565 ** but combine to completely cover the page. 7566 ** NO 2. Make sure cell keys are in order. 7567 ** NO 3. Make sure no key is less than or equal to zLowerBound. 7568 ** NO 4. Make sure no key is greater than or equal to zUpperBound. 7569 ** 5. Check the integrity of overflow pages. 7570 ** 6. Recursively call checkTreePage on all children. 7571 ** 7. Verify that the depth of all children is the same. 7572 ** 8. Make sure this page is at least 33% full or else it is 7573 ** the root of the tree. 7574 */ 7575 static int checkTreePage( 7576 IntegrityCk *pCheck, /* Context for the sanity check */ 7577 int iPage, /* Page number of the page to check */ 7578 char *zParentContext, /* Parent context */ 7579 i64 *pnParentMinKey, 7580 i64 *pnParentMaxKey 7581 ){ 7582 MemPage *pPage; 7583 int i, rc, depth, d2, pgno, cnt; 7584 int hdr, cellStart; 7585 int nCell; 7586 u8 *data; 7587 BtShared *pBt; 7588 int usableSize; 7589 char zContext[100]; 7590 char *hit = 0; 7591 i64 nMinKey = 0; 7592 i64 nMaxKey = 0; 7593 7594 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage); 7595 7596 /* Check that the page exists 7597 */ 7598 pBt = pCheck->pBt; 7599 usableSize = pBt->usableSize; 7600 if( iPage==0 ) return 0; 7601 if( checkRef(pCheck, iPage, zParentContext) ) return 0; 7602 if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ 7603 checkAppendMsg(pCheck, zContext, 7604 "unable to get the page. error code=%d", rc); 7605 return 0; 7606 } 7607 7608 /* Clear MemPage.isInit to make sure the corruption detection code in 7609 ** btreeInitPage() is executed. */ 7610 pPage->isInit = 0; 7611 if( (rc = btreeInitPage(pPage))!=0 ){ 7612 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ 7613 checkAppendMsg(pCheck, zContext, 7614 "btreeInitPage() returns error code %d", rc); 7615 releasePage(pPage); 7616 return 0; 7617 } 7618 7619 /* Check out all the cells. 7620 */ 7621 depth = 0; 7622 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){ 7623 u8 *pCell; 7624 u32 sz; 7625 CellInfo info; 7626 7627 /* Check payload overflow pages 7628 */ 7629 sqlite3_snprintf(sizeof(zContext), zContext, 7630 "On tree page %d cell %d: ", iPage, i); 7631 pCell = findCell(pPage,i); 7632 btreeParseCellPtr(pPage, pCell, &info); 7633 sz = info.nData; 7634 if( !pPage->intKey ) sz += (int)info.nKey; 7635 /* For intKey pages, check that the keys are in order. 7636 */ 7637 else if( i==0 ) nMinKey = nMaxKey = info.nKey; 7638 else{ 7639 if( info.nKey <= nMaxKey ){ 7640 checkAppendMsg(pCheck, zContext, 7641 "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey); 7642 } 7643 nMaxKey = info.nKey; 7644 } 7645 assert( sz==info.nPayload ); 7646 if( (sz>info.nLocal) 7647 && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize]) 7648 ){ 7649 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); 7650 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); 7651 #ifndef SQLITE_OMIT_AUTOVACUUM 7652 if( pBt->autoVacuum ){ 7653 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext); 7654 } 7655 #endif 7656 checkList(pCheck, 0, pgnoOvfl, nPage, zContext); 7657 } 7658 7659 /* Check sanity of left child page. 7660 */ 7661 if( !pPage->leaf ){ 7662 pgno = get4byte(pCell); 7663 #ifndef SQLITE_OMIT_AUTOVACUUM 7664 if( pBt->autoVacuum ){ 7665 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); 7666 } 7667 #endif 7668 d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey); 7669 if( i>0 && d2!=depth ){ 7670 checkAppendMsg(pCheck, zContext, "Child page depth differs"); 7671 } 7672 depth = d2; 7673 } 7674 } 7675 7676 if( !pPage->leaf ){ 7677 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 7678 sqlite3_snprintf(sizeof(zContext), zContext, 7679 "On page %d at right child: ", iPage); 7680 #ifndef SQLITE_OMIT_AUTOVACUUM 7681 if( pBt->autoVacuum ){ 7682 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); 7683 } 7684 #endif 7685 checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey); 7686 } 7687 7688 /* For intKey leaf pages, check that the min/max keys are in order 7689 ** with any left/parent/right pages. 7690 */ 7691 if( pPage->leaf && pPage->intKey ){ 7692 /* if we are a left child page */ 7693 if( pnParentMinKey ){ 7694 /* if we are the left most child page */ 7695 if( !pnParentMaxKey ){ 7696 if( nMaxKey > *pnParentMinKey ){ 7697 checkAppendMsg(pCheck, zContext, 7698 "Rowid %lld out of order (max larger than parent min of %lld)", 7699 nMaxKey, *pnParentMinKey); 7700 } 7701 }else{ 7702 if( nMinKey <= *pnParentMinKey ){ 7703 checkAppendMsg(pCheck, zContext, 7704 "Rowid %lld out of order (min less than parent min of %lld)", 7705 nMinKey, *pnParentMinKey); 7706 } 7707 if( nMaxKey > *pnParentMaxKey ){ 7708 checkAppendMsg(pCheck, zContext, 7709 "Rowid %lld out of order (max larger than parent max of %lld)", 7710 nMaxKey, *pnParentMaxKey); 7711 } 7712 *pnParentMinKey = nMaxKey; 7713 } 7714 /* else if we're a right child page */ 7715 } else if( pnParentMaxKey ){ 7716 if( nMinKey <= *pnParentMaxKey ){ 7717 checkAppendMsg(pCheck, zContext, 7718 "Rowid %lld out of order (min less than parent max of %lld)", 7719 nMinKey, *pnParentMaxKey); 7720 } 7721 } 7722 } 7723 7724 /* Check for complete coverage of the page 7725 */ 7726 data = pPage->aData; 7727 hdr = pPage->hdrOffset; 7728 hit = sqlite3PageMalloc( pBt->pageSize ); 7729 if( hit==0 ){ 7730 pCheck->mallocFailed = 1; 7731 }else{ 7732 int contentOffset = get2byteNotZero(&data[hdr+5]); 7733 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ 7734 memset(hit+contentOffset, 0, usableSize-contentOffset); 7735 memset(hit, 1, contentOffset); 7736 nCell = get2byte(&data[hdr+3]); 7737 cellStart = hdr + 12 - 4*pPage->leaf; 7738 for(i=0; i<nCell; i++){ 7739 int pc = get2byte(&data[cellStart+i*2]); 7740 u32 size = 65536; 7741 int j; 7742 if( pc<=usableSize-4 ){ 7743 size = cellSizePtr(pPage, &data[pc]); 7744 } 7745 if( (int)(pc+size-1)>=usableSize ){ 7746 checkAppendMsg(pCheck, 0, 7747 "Corruption detected in cell %d on page %d",i,iPage); 7748 }else{ 7749 for(j=pc+size-1; j>=pc; j--) hit[j]++; 7750 } 7751 } 7752 i = get2byte(&data[hdr+1]); 7753 while( i>0 ){ 7754 int size, j; 7755 assert( i<=usableSize-4 ); /* Enforced by btreeInitPage() */ 7756 size = get2byte(&data[i+2]); 7757 assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */ 7758 for(j=i+size-1; j>=i; j--) hit[j]++; 7759 j = get2byte(&data[i]); 7760 assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */ 7761 assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */ 7762 i = j; 7763 } 7764 for(i=cnt=0; i<usableSize; i++){ 7765 if( hit[i]==0 ){ 7766 cnt++; 7767 }else if( hit[i]>1 ){ 7768 checkAppendMsg(pCheck, 0, 7769 "Multiple uses for byte %d of page %d", i, iPage); 7770 break; 7771 } 7772 } 7773 if( cnt!=data[hdr+7] ){ 7774 checkAppendMsg(pCheck, 0, 7775 "Fragmentation of %d bytes reported as %d on page %d", 7776 cnt, data[hdr+7], iPage); 7777 } 7778 } 7779 sqlite3PageFree(hit); 7780 releasePage(pPage); 7781 return depth+1; 7782 } 7783 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7784 7785 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7786 /* 7787 ** This routine does a complete check of the given BTree file. aRoot[] is 7788 ** an array of pages numbers were each page number is the root page of 7789 ** a table. nRoot is the number of entries in aRoot. 7790 ** 7791 ** A read-only or read-write transaction must be opened before calling 7792 ** this function. 7793 ** 7794 ** Write the number of error seen in *pnErr. Except for some memory 7795 ** allocation errors, an error message held in memory obtained from 7796 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is 7797 ** returned. If a memory allocation error occurs, NULL is returned. 7798 */ 7799 char *sqlite3BtreeIntegrityCheck( 7800 Btree *p, /* The btree to be checked */ 7801 int *aRoot, /* An array of root pages numbers for individual trees */ 7802 int nRoot, /* Number of entries in aRoot[] */ 7803 int mxErr, /* Stop reporting errors after this many */ 7804 int *pnErr /* Write number of errors seen to this variable */ 7805 ){ 7806 Pgno i; 7807 int nRef; 7808 IntegrityCk sCheck; 7809 BtShared *pBt = p->pBt; 7810 char zErr[100]; 7811 7812 sqlite3BtreeEnter(p); 7813 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); 7814 nRef = sqlite3PagerRefcount(pBt->pPager); 7815 sCheck.pBt = pBt; 7816 sCheck.pPager = pBt->pPager; 7817 sCheck.nPage = btreePagecount(sCheck.pBt); 7818 sCheck.mxErr = mxErr; 7819 sCheck.nErr = 0; 7820 sCheck.mallocFailed = 0; 7821 *pnErr = 0; 7822 if( sCheck.nPage==0 ){ 7823 sqlite3BtreeLeave(p); 7824 return 0; 7825 } 7826 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); 7827 if( !sCheck.anRef ){ 7828 *pnErr = 1; 7829 sqlite3BtreeLeave(p); 7830 return 0; 7831 } 7832 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } 7833 i = PENDING_BYTE_PAGE(pBt); 7834 if( i<=sCheck.nPage ){ 7835 sCheck.anRef[i] = 1; 7836 } 7837 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000); 7838 sCheck.errMsg.useMalloc = 2; 7839 7840 /* Check the integrity of the freelist 7841 */ 7842 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 7843 get4byte(&pBt->pPage1->aData[36]), "Main freelist: "); 7844 7845 /* Check all the tables. 7846 */ 7847 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ 7848 if( aRoot[i]==0 ) continue; 7849 #ifndef SQLITE_OMIT_AUTOVACUUM 7850 if( pBt->autoVacuum && aRoot[i]>1 ){ 7851 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0); 7852 } 7853 #endif 7854 checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL); 7855 } 7856 7857 /* Make sure every page in the file is referenced 7858 */ 7859 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ 7860 #ifdef SQLITE_OMIT_AUTOVACUUM 7861 if( sCheck.anRef[i]==0 ){ 7862 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 7863 } 7864 #else 7865 /* If the database supports auto-vacuum, make sure no tables contain 7866 ** references to pointer-map pages. 7867 */ 7868 if( sCheck.anRef[i]==0 && 7869 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 7870 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 7871 } 7872 if( sCheck.anRef[i]!=0 && 7873 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 7874 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i); 7875 } 7876 #endif 7877 } 7878 7879 /* Make sure this analysis did not leave any unref() pages. 7880 ** This is an internal consistency check; an integrity check 7881 ** of the integrity check. 7882 */ 7883 if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){ 7884 checkAppendMsg(&sCheck, 0, 7885 "Outstanding page count goes from %d to %d during this analysis", 7886 nRef, sqlite3PagerRefcount(pBt->pPager) 7887 ); 7888 } 7889 7890 /* Clean up and report errors. 7891 */ 7892 sqlite3BtreeLeave(p); 7893 sqlite3_free(sCheck.anRef); 7894 if( sCheck.mallocFailed ){ 7895 sqlite3StrAccumReset(&sCheck.errMsg); 7896 *pnErr = sCheck.nErr+1; 7897 return 0; 7898 } 7899 *pnErr = sCheck.nErr; 7900 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg); 7901 return sqlite3StrAccumFinish(&sCheck.errMsg); 7902 } 7903 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7904 7905 /* 7906 ** Return the full pathname of the underlying database file. 7907 ** 7908 ** The pager filename is invariant as long as the pager is 7909 ** open so it is safe to access without the BtShared mutex. 7910 */ 7911 const char *sqlite3BtreeGetFilename(Btree *p){ 7912 assert( p->pBt->pPager!=0 ); 7913 return sqlite3PagerFilename(p->pBt->pPager); 7914 } 7915 7916 /* 7917 ** Return the pathname of the journal file for this database. The return 7918 ** value of this routine is the same regardless of whether the journal file 7919 ** has been created or not. 7920 ** 7921 ** The pager journal filename is invariant as long as the pager is 7922 ** open so it is safe to access without the BtShared mutex. 7923 */ 7924 const char *sqlite3BtreeGetJournalname(Btree *p){ 7925 assert( p->pBt->pPager!=0 ); 7926 return sqlite3PagerJournalname(p->pBt->pPager); 7927 } 7928 7929 /* 7930 ** Return non-zero if a transaction is active. 7931 */ 7932 int sqlite3BtreeIsInTrans(Btree *p){ 7933 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 7934 return (p && (p->inTrans==TRANS_WRITE)); 7935 } 7936 7937 #ifndef SQLITE_OMIT_WAL 7938 /* 7939 ** Run a checkpoint on the Btree passed as the first argument. 7940 ** 7941 ** Return SQLITE_LOCKED if this or any other connection has an open 7942 ** transaction on the shared-cache the argument Btree is connected to. 7943 ** 7944 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART. 7945 */ 7946 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){ 7947 int rc = SQLITE_OK; 7948 if( p ){ 7949 BtShared *pBt = p->pBt; 7950 sqlite3BtreeEnter(p); 7951 if( pBt->inTransaction!=TRANS_NONE ){ 7952 rc = SQLITE_LOCKED; 7953 }else{ 7954 rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt); 7955 } 7956 sqlite3BtreeLeave(p); 7957 } 7958 return rc; 7959 } 7960 #endif 7961 7962 /* 7963 ** Return non-zero if a read (or write) transaction is active. 7964 */ 7965 int sqlite3BtreeIsInReadTrans(Btree *p){ 7966 assert( p ); 7967 assert( sqlite3_mutex_held(p->db->mutex) ); 7968 return p->inTrans!=TRANS_NONE; 7969 } 7970 7971 int sqlite3BtreeIsInBackup(Btree *p){ 7972 assert( p ); 7973 assert( sqlite3_mutex_held(p->db->mutex) ); 7974 return p->nBackup!=0; 7975 } 7976 7977 /* 7978 ** This function returns a pointer to a blob of memory associated with 7979 ** a single shared-btree. The memory is used by client code for its own 7980 ** purposes (for example, to store a high-level schema associated with 7981 ** the shared-btree). The btree layer manages reference counting issues. 7982 ** 7983 ** The first time this is called on a shared-btree, nBytes bytes of memory 7984 ** are allocated, zeroed, and returned to the caller. For each subsequent 7985 ** call the nBytes parameter is ignored and a pointer to the same blob 7986 ** of memory returned. 7987 ** 7988 ** If the nBytes parameter is 0 and the blob of memory has not yet been 7989 ** allocated, a null pointer is returned. If the blob has already been 7990 ** allocated, it is returned as normal. 7991 ** 7992 ** Just before the shared-btree is closed, the function passed as the 7993 ** xFree argument when the memory allocation was made is invoked on the 7994 ** blob of allocated memory. The xFree function should not call sqlite3_free() 7995 ** on the memory, the btree layer does that. 7996 */ 7997 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 7998 BtShared *pBt = p->pBt; 7999 sqlite3BtreeEnter(p); 8000 if( !pBt->pSchema && nBytes ){ 8001 pBt->pSchema = sqlite3DbMallocZero(0, nBytes); 8002 pBt->xFreeSchema = xFree; 8003 } 8004 sqlite3BtreeLeave(p); 8005 return pBt->pSchema; 8006 } 8007 8008 /* 8009 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 8010 ** btree as the argument handle holds an exclusive lock on the 8011 ** sqlite_master table. Otherwise SQLITE_OK. 8012 */ 8013 int sqlite3BtreeSchemaLocked(Btree *p){ 8014 int rc; 8015 assert( sqlite3_mutex_held(p->db->mutex) ); 8016 sqlite3BtreeEnter(p); 8017 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); 8018 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); 8019 sqlite3BtreeLeave(p); 8020 return rc; 8021 } 8022 8023 8024 #ifndef SQLITE_OMIT_SHARED_CACHE 8025 /* 8026 ** Obtain a lock on the table whose root page is iTab. The 8027 ** lock is a write lock if isWritelock is true or a read lock 8028 ** if it is false. 8029 */ 8030 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 8031 int rc = SQLITE_OK; 8032 assert( p->inTrans!=TRANS_NONE ); 8033 if( p->sharable ){ 8034 u8 lockType = READ_LOCK + isWriteLock; 8035 assert( READ_LOCK+1==WRITE_LOCK ); 8036 assert( isWriteLock==0 || isWriteLock==1 ); 8037 8038 sqlite3BtreeEnter(p); 8039 rc = querySharedCacheTableLock(p, iTab, lockType); 8040 if( rc==SQLITE_OK ){ 8041 rc = setSharedCacheTableLock(p, iTab, lockType); 8042 } 8043 sqlite3BtreeLeave(p); 8044 } 8045 return rc; 8046 } 8047 #endif 8048 8049 #ifndef SQLITE_OMIT_INCRBLOB 8050 /* 8051 ** Argument pCsr must be a cursor opened for writing on an 8052 ** INTKEY table currently pointing at a valid table entry. 8053 ** This function modifies the data stored as part of that entry. 8054 ** 8055 ** Only the data content may only be modified, it is not possible to 8056 ** change the length of the data stored. If this function is called with 8057 ** parameters that attempt to write past the end of the existing data, 8058 ** no modifications are made and SQLITE_CORRUPT is returned. 8059 */ 8060 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 8061 int rc; 8062 assert( cursorHoldsMutex(pCsr) ); 8063 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 8064 assert( pCsr->isIncrblobHandle ); 8065 8066 rc = restoreCursorPosition(pCsr); 8067 if( rc!=SQLITE_OK ){ 8068 return rc; 8069 } 8070 assert( pCsr->eState!=CURSOR_REQUIRESEEK ); 8071 if( pCsr->eState!=CURSOR_VALID ){ 8072 return SQLITE_ABORT; 8073 } 8074 8075 /* Check some assumptions: 8076 ** (a) the cursor is open for writing, 8077 ** (b) there is a read/write transaction open, 8078 ** (c) the connection holds a write-lock on the table (if required), 8079 ** (d) there are no conflicting read-locks, and 8080 ** (e) the cursor points at a valid row of an intKey table. 8081 */ 8082 if( !pCsr->wrFlag ){ 8083 return SQLITE_READONLY; 8084 } 8085 assert( !pCsr->pBt->readOnly && pCsr->pBt->inTransaction==TRANS_WRITE ); 8086 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) ); 8087 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) ); 8088 assert( pCsr->apPage[pCsr->iPage]->intKey ); 8089 8090 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); 8091 } 8092 8093 /* 8094 ** Set a flag on this cursor to cache the locations of pages from the 8095 ** overflow list for the current row. This is used by cursors opened 8096 ** for incremental blob IO only. 8097 ** 8098 ** This function sets a flag only. The actual page location cache 8099 ** (stored in BtCursor.aOverflow[]) is allocated and used by function 8100 ** accessPayload() (the worker function for sqlite3BtreeData() and 8101 ** sqlite3BtreePutData()). 8102 */ 8103 void sqlite3BtreeCacheOverflow(BtCursor *pCur){ 8104 assert( cursorHoldsMutex(pCur) ); 8105 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 8106 invalidateOverflowCache(pCur); 8107 pCur->isIncrblobHandle = 1; 8108 } 8109 #endif 8110 8111 /* 8112 ** Set both the "read version" (single byte at byte offset 18) and 8113 ** "write version" (single byte at byte offset 19) fields in the database 8114 ** header to iVersion. 8115 */ 8116 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ 8117 BtShared *pBt = pBtree->pBt; 8118 int rc; /* Return code */ 8119 8120 assert( pBtree->inTrans==TRANS_NONE ); 8121 assert( iVersion==1 || iVersion==2 ); 8122 8123 /* If setting the version fields to 1, do not automatically open the 8124 ** WAL connection, even if the version fields are currently set to 2. 8125 */ 8126 pBt->doNotUseWAL = (u8)(iVersion==1); 8127 8128 rc = sqlite3BtreeBeginTrans(pBtree, 0); 8129 if( rc==SQLITE_OK ){ 8130 u8 *aData = pBt->pPage1->aData; 8131 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){ 8132 rc = sqlite3BtreeBeginTrans(pBtree, 2); 8133 if( rc==SQLITE_OK ){ 8134 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 8135 if( rc==SQLITE_OK ){ 8136 aData[18] = (u8)iVersion; 8137 aData[19] = (u8)iVersion; 8138 } 8139 } 8140 } 8141 } 8142 8143 pBt->doNotUseWAL = 0; 8144 return rc; 8145 } 8146