xref: /sqlite-3.40.0/src/pager.c (revision eb4ac06f)
1 /*
2 ** 2001 September 15
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This is the implementation of the page cache subsystem or "pager".
13 **
14 ** The pager is used to access a database disk file.  It implements
15 ** atomic commit and rollback through the use of a journal file that
16 ** is separate from the database file.  The pager also implements file
17 ** locking to prevent two processes from writing the same database
18 ** file simultaneously, or one process from reading the database while
19 ** another is writing.
20 **
21 ** @(#) $Id: pager.c,v 1.585 2009/04/30 16:41:00 danielk1977 Exp $
22 */
23 #ifndef SQLITE_OMIT_DISKIO
24 #include "sqliteInt.h"
25 
26 /*
27 ** Macros for troubleshooting.  Normally turned off: the "#if 0" branch
** below is compiled out, so PAGERTRACE() expands to nothing.
**
** When the first branch is enabled, X must be a complete, parenthesized
** printf-style argument list, e.g. PAGERTRACE(("fmt %d\n", value)),
** because the macro pastes X directly after sqlite3DebugPrintf.  The
** enabled form expands to a bare "if" statement, so take care when it is
** used as the unbraced body of an if/else.
28 */
29 #if 0
30 int sqlite3PagerTrace=1;  /* True to enable tracing */
31 #define sqlite3DebugPrintf printf
32 #define PAGERTRACE(X)     if( sqlite3PagerTrace ){ sqlite3DebugPrintf X; }
33 #else
34 #define PAGERTRACE(X)
35 #endif
36 
/*
** The following two macros are used within the PAGERTRACE() macros above
** to print out file-descriptors.
**
** PAGERID(p) takes a pointer to a Pager struct as its argument and
** evaluates to the Pager.fd member converted to int.  FILEHANDLEID(fd)
** takes a file handle as its argument and evaluates to that handle
** converted to int.
**
** Every macro argument is wrapped in parentheses so that an argument
** which is itself an expression (for example "&pager" or "a+b") is
** converted as a whole rather than having the cast or the "->" operator
** bind to only part of it.
*/
#define PAGERID(p) ((int)((p)->fd))
#define FILEHANDLEID(fd) ((int)(fd))
47 
48 /*
49 ** The page cache as a whole is always in one of the following
50 ** states:
51 **
52 **   PAGER_UNLOCK        The page cache is not currently reading or
53 **                       writing the database file.  There is no
54 **                       data held in memory.  This is the initial
55 **                       state.
56 **
57 **   PAGER_SHARED        The page cache is reading the database.
58 **                       Writing is not permitted.  There can be
59 **                       multiple readers accessing the same database
60 **                       file at the same time.
61 **
62 **   PAGER_RESERVED      This process has reserved the database for writing
63 **                       but has not yet made any changes.  Only one process
64 **                       at a time can reserve the database.  The original
65 **                       database file has not been modified so other
66 **                       processes may still be reading the on-disk
67 **                       database file.
68 **
69 **   PAGER_EXCLUSIVE     The page cache is writing the database.
70 **                       Access is exclusive.  No other processes or
71 **                       threads can be reading or writing while one
72 **                       process is writing.
73 **
74 **   PAGER_SYNCED        The pager moves to this state from PAGER_EXCLUSIVE
75 **                       after all dirty pages have been written to the
76 **                       database file and the file has been synced to
77 **                       disk. All that remains to do is to remove or
78 **                       truncate the journal file and the transaction
79 **                       will be committed.
80 **
81 ** The page cache comes up in PAGER_UNLOCK.  The first time a
82 ** sqlite3PagerGet() occurs, the state transitions to PAGER_SHARED.
83 ** After all pages have been released using sqlite3PagerUnref(),
84 ** the state transitions back to PAGER_UNLOCK.  The first time
85 ** that sqlite3PagerWrite() is called, the state transitions to
86 ** PAGER_RESERVED.  (Note that sqlite3PagerWrite() can only be
87 ** called on an outstanding page which means that the pager must
88 ** be in PAGER_SHARED before it transitions to PAGER_RESERVED.)
89 ** PAGER_RESERVED means that there is an open rollback journal.
90 ** The transition to PAGER_EXCLUSIVE occurs before any changes
91 ** are made to the database file, though writes to the rollback
92 ** journal occur with just PAGER_RESERVED.  After an sqlite3PagerRollback()
93 ** or sqlite3PagerCommitPhaseTwo(), the state can go back to PAGER_SHARED,
94 ** or it can stay at PAGER_EXCLUSIVE if we are in exclusive access mode.
95 */
96 #define PAGER_UNLOCK      0   /* not reading or writing; the initial state */
97 #define PAGER_SHARED      1   /* same as SHARED_LOCK */
98 #define PAGER_RESERVED    2   /* same as RESERVED_LOCK */
99 #define PAGER_EXCLUSIVE   4   /* same as EXCLUSIVE_LOCK */
100 #define PAGER_SYNCED      5   /* db written and synced; journal not yet finalized */
101 
/*
** Macros used for invoking the codec if there is one.
**
** P is a pointer to the Pager, D the data pointer handed to the codec,
** and N and X are passed through to Pager.xCodec unchanged.
**
** CODEC1() is a statement: it calls P->xCodec when one is installed and
** discards the result.  It expands to a bare "if" statement (or to
** nothing when SQLITE_HAS_CODEC is not defined).
**
** CODEC2() is an expression of type char*: the value returned by
** P->xCodec when one is installed, otherwise D itself.
**
** All macro arguments are parenthesized in the expansions so that
** expression arguments such as "pBuf+1" are cast and passed as a whole.
*/
#ifdef SQLITE_HAS_CODEC
# define CODEC1(P,D,N,X) \
    if( (P)->xCodec!=0 ){ (P)->xCodec((P)->pCodecArg,(D),(N),(X)); }
# define CODEC2(P,D,N,X) \
    ((char*)((P)->xCodec!=0?(P)->xCodec((P)->pCodecArg,(D),(N),(X)):(D)))
#else
# define CODEC1(P,D,N,X) /* NO-OP */
# define CODEC2(P,D,N,X) ((char*)(D))
#endif
112 
113 /*
114 ** The maximum allowed sector size: 0x0100000 bytes, i.e. 1MB. If the
115 ** xSectorSize() method returns a value larger than this, then
** MAX_SECTOR_SIZE is used instead.
116 ** This could conceivably cause corruption following a power failure on
117 ** such a system. This is currently an undocumented limit.
**
** NOTE(review): this comment previously said "16MB", which would be
** 0x1000000; the constant defined below is 1MB.  Confirm which value was
** intended.
118 */
119 #define MAX_SECTOR_SIZE 0x0100000
120 
121 /*
122 ** An instance of the following structure is allocated for each active
123 ** savepoint and statement transaction in the system. All such structures
124 ** are stored in the Pager.aSavepoint[] array, which is allocated and
125 ** resized using sqlite3Realloc().
126 **
127 ** When a savepoint is created, the PagerSavepoint.iHdrOffset field is
128 ** set to 0. If a journal-header is written into the main journal while
129 ** the savepoint is active, then iHdrOffset is set to the byte offset
130 ** immediately following the last journal record written into the main
131 ** journal before the journal-header. This is required during savepoint
132 ** rollback (see pagerPlaybackSavepoint()).
133 */
/* One element of the Pager.aSavepoint[] array. */
134 typedef struct PagerSavepoint PagerSavepoint;
135 struct PagerSavepoint {
136   i64 iOffset;                 /* Starting offset in main journal */
137   i64 iHdrOffset;              /* 0, or offset just past last record before a jrnl hdr */
138   Bitvec *pInSavepoint;        /* Set of pages in this savepoint */
139   Pgno nOrig;                  /* Original number of pages in file */
140   Pgno iSubRec;                /* Index of first record in sub-journal */
141 };
142 
143 /*
144 ** An open page cache is an instance of the following structure.
145 **
146 ** errCode
147 **
148 **   Pager.errCode may be set to SQLITE_IOERR, SQLITE_CORRUPT,
149 **   or SQLITE_FULL. Once one of the first two errors occurs, it persists
150 **   and is returned as the result of every major pager API call.  The
151 **   SQLITE_FULL return code is slightly different. It persists only until the
152 **   next successful rollback is performed on the pager cache. Also,
153 **   SQLITE_FULL does not affect the sqlite3PagerGet() and sqlite3PagerLookup()
154 **   APIs, they may still be used successfully.
155 **
156 ** dbSizeValid, dbSize, dbOrigSize, dbFileSize
157 **
158 **   Managing the size of the database file in pages is a little complicated.
159 **   The variable Pager.dbSize contains the number of pages that the database
160 **   image currently contains. As the database image grows or shrinks this
161 **   variable is updated. The variable Pager.dbFileSize contains the number
162 **   of pages in the database file. This may be different from Pager.dbSize
163 **   if some pages have been appended to the database image but not yet written
164 **   out from the cache to the actual file on disk. Or if the image has been
165 **   truncated by an incremental-vacuum operation. The Pager.dbOrigSize variable
166 **   contains the number of pages in the database image when the current
167 **   transaction was opened. The contents of all three of these variables is
168 **   only guaranteed to be correct if the boolean Pager.dbSizeValid is true.
169 **
170 **   TODO: Under what conditions is dbSizeValid set? Cleared?
171 **
172 ** changeCountDone
173 **
174 **   This boolean variable is used to make sure that the change-counter
175 **   (the 4-byte header field at byte offset 24 of the database file) is
176 **   not updated more often than necessary.
177 **
178 **   It is set to true when the change-counter field is updated, which
179 **   can only happen if an exclusive lock is held on the database file.
180 **   It is cleared (set to false) whenever an exclusive lock is
181 **   relinquished on the database file. Each time a transaction is committed,
182 **   The changeCountDone flag is inspected. If it is true, the work of
183 **   updating the change-counter is omitted for the current transaction.
184 **
185 **   This mechanism means that when running in exclusive mode, a connection
186 **   need only update the change-counter once, for the first transaction
187 **   committed.
188 **
189 ** dbModified
190 **
191 **   The dbModified flag is set whenever a database page is dirtied.
192 **   It is cleared at the end of each transaction.
193 **
194 **   It is used when committing or otherwise ending a transaction. If
195 **   the dbModified flag is clear then less work has to be done.
196 **
197 ** journalStarted
198 **
199 **   This flag is set whenever the main journal is synced.
200 **
201 **   The point of this flag is that it must be set after the
202 **   first journal header in a journal file has been synced to disk.
203 **   After this has happened, new pages appended to the database
204 **   do not need the PGHDR_NEED_SYNC flag set, as they do not need
205 **   to wait for a journal sync before they can be written out to
206 **   the database file (see function pager_write()).
207 **
208 ** setMaster
209 **
210 **   This variable is used to ensure that the master journal file name
211 **   (if any) is only written into the journal file once.
212 **
213 **   When committing a transaction, the master journal file name (if any)
214 **   may be written into the journal file while the pager is still in
215 **   PAGER_RESERVED state (see CommitPhaseOne() for the action). It
216 **   then attempts to upgrade to an exclusive lock. If this attempt
217 **   fails, then SQLITE_BUSY may be returned to the user and the user
218 **   may attempt to commit the transaction again later (calling
219 **   CommitPhaseOne() again). This flag is used to ensure that the
220 **   master journal name is only written to the journal file the first
221 **   time CommitPhaseOne() is called.
222 **
223 ** doNotSync
224 **
225 **   This variable is set and cleared by sqlite3PagerWrite().
226 **
227 ** needSync
228 **
229 **   TODO: It might be easier to set this variable in writeJournalHdr()
230 **   and writeMasterJournal() only. Change its meaning to "unsynced data
231 **   has been written to the journal".
232 **
233 ** subjInMemory
234 **
235 **   This is a boolean variable. If true, then any required sub-journal
236 **   is opened as an in-memory journal file. If false, then in-memory
237 **   sub-journals are only used for in-memory pager files.
238 */
239 struct Pager {
240   sqlite3_vfs *pVfs;          /* OS functions to use for IO */
241   u8 exclusiveMode;           /* Boolean. True if locking_mode==EXCLUSIVE */
242   u8 journalMode;             /* One of the PAGER_JOURNALMODE_* values */
243   u8 useJournal;              /* Use a rollback journal on this file */
244   u8 noReadlock;              /* Do not bother to obtain readlocks */
245   u8 noSync;                  /* Do not sync the journal if true */
246   u8 fullSync;                /* Do extra syncs of the journal for robustness */
247   u8 sync_flags;              /* One of SYNC_NORMAL or SYNC_FULL */
248   u8 tempFile;                /* zFilename is a temporary file */
249   u8 readOnly;                /* True for a read-only database */
250   u8 memDb;                   /* True to inhibit all file I/O */
251 
252   /* The following block contains those class members that are dynamically
253   ** modified during normal operations. The other variables in this structure
254   ** are either constant throughout the lifetime of the pager, or else
255   ** used to store configuration parameters that affect the way the pager
256   ** operates.
257   **
258   ** The 'state' variable is described in more detail along with the
259   ** descriptions of the values it may take - PAGER_UNLOCK etc. Many of the
260   ** other variables in this block are described in the comment directly
261   ** above this class definition.
262   */
263   u8 state;                   /* PAGER_UNLOCK, _SHARED, _RESERVED, etc. */
264   u8 dbModified;              /* True if there are any changes to the Db */
265   u8 needSync;                /* True if an fsync() is needed on the journal */
266   u8 journalStarted;          /* True if header of journal is synced */
267   u8 changeCountDone;         /* Set after incrementing the change-counter */
268   u8 setMaster;               /* True if a m-j name has been written to jrnl */
269   u8 doNotSync;               /* Boolean. While true, do not spill the cache */
270   u8 dbSizeValid;             /* Set when dbSize is correct */
271   u8 subjInMemory;            /* True to use in-memory sub-journals */
272   Pgno dbSize;                /* Number of pages in the database */
273   Pgno dbOrigSize;            /* dbSize before the current transaction */
274   Pgno dbFileSize;            /* Number of pages in the database file */
275   int errCode;                /* One of several kinds of errors */
276   int nRec;                   /* Pages journalled since last j-header written */
277   u32 cksumInit;              /* Quasi-random value added to every checksum */
278   u32 nSubRec;                /* Number of records written to sub-journal */
279   Bitvec *pInJournal;         /* One bit for each page in the database file */
280   sqlite3_file *fd;           /* File descriptor for database */
281   sqlite3_file *jfd;          /* File descriptor for main journal */
282   sqlite3_file *sjfd;         /* File descriptor for sub-journal */
283   i64 journalOff;             /* Current write offset in the journal file */
284   i64 journalHdr;             /* Byte offset to previous journal header */
285   PagerSavepoint *aSavepoint; /* Array of active savepoints */
286   int nSavepoint;             /* Number of elements in aSavepoint[] */
287   char dbFileVers[16];        /* Changes whenever database file changes */
288   u32 sectorSize;             /* Assumed sector size during rollback */
289 
290   int nExtra;                 /* Add this many bytes to each in-memory page */
291   u32 vfsFlags;               /* Flags for sqlite3_vfs.xOpen() */
292   int pageSize;               /* Number of bytes in a page */
293   Pgno mxPgno;                /* Maximum allowed size of the database */
294   char *zFilename;            /* Name of the database file */
295   char *zJournal;             /* Name of the journal file */
296   int (*xBusyHandler)(void*); /* Function to call when busy */
297   void *pBusyHandlerArg;      /* Context argument for xBusyHandler */
298 #ifdef SQLITE_TEST
299   int nHit, nMiss;            /* Cache hits and misses */
300   int nRead, nWrite;          /* Database pages read/written */
301 #endif
302   void (*xReiniter)(DbPage*); /* Call this routine when reloading pages */
303 #ifdef SQLITE_HAS_CODEC
304   void *(*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
305   void *pCodecArg;            /* First argument to xCodec() */
306 #endif
307   char *pTmpSpace;            /* Pager.pageSize bytes of space for tmp use */
308   i64 journalSizeLimit;       /* Size limit for persistent journal files */
309   PCache *pPCache;            /* Pointer to page cache object */
310   sqlite3_backup *pBackup;    /* Pointer to list of ongoing backup processes */
311 };
312 
/*
** The following global variables hold counters used for
** testing purposes only.  These variables do not exist in
** a non-testing build.  These variables are not thread-safe.
**
** PAGER_INCR(v) post-increments the lvalue v in a build where SQLITE_TEST
** is defined and expands to nothing otherwise.  The argument is
** parenthesized so that an lvalue expression such as "*pCounter" is
** incremented itself (rather than the pointer it is reached through).
*/
#ifdef SQLITE_TEST
int sqlite3_pager_readdb_count = 0;    /* Number of full pages read from DB */
int sqlite3_pager_writedb_count = 0;   /* Number of full pages written to DB */
int sqlite3_pager_writej_count = 0;    /* Number of pages written to journal */
# define PAGER_INCR(v)  ((v)++)
#else
# define PAGER_INCR(v)
#endif
326 
327 
328 
329 /*
330 ** Journal files begin with the following magic string.  The data
331 ** was obtained from /dev/random.  It is used only as a sanity check.
332 **
333 ** Since version 2.8.0, the journal format contains additional sanity
334 ** checking information.  If the power fails while the journal is being
335 ** written, semi-random garbage data might appear in the journal
336 ** file after power is restored.  If an attempt is then made
337 ** to roll the journal back, the database could be corrupted.  The additional
338 ** sanity checking data is an attempt to discover the garbage in the
339 ** journal and ignore it.
340 **
341 ** The sanity checking information for the new journal format consists
342 ** of a 32-bit checksum on each page of data.  The checksum covers both
343 ** the page number and the pPager->pageSize bytes of data for the page.
344 ** This cksum is initialized to a 32-bit random value that appears in the
345 ** journal file right after the header.  The random initializer is important,
346 ** because garbage data that appears at the end of a journal is likely
347 ** data that was once in other files that have now been deleted.  If the
348 ** garbage data came from an obsolete journal file, the checksums might
349 ** be correct.  But by initializing the checksum to random value which
350 ** is different for every journal, we minimize that risk.
351 */
static const unsigned char aJournalMagic[] = {
  0xd9, 0xd5, 0x05, 0xf9,     /* magic bytes 0..3 */
  0x20, 0xa1, 0x63, 0xd7      /* magic bytes 4..7 */
};
355 
/*
** The size of each page record in the journal is given by the
** following macro: the pager's page size plus 8 bytes.
**
** The argument is a pointer to a Pager.  It is parenthesized in the
** expansion so that any pointer-valued expression (e.g. "&pager") may be
** passed.
*/
#define JOURNAL_PG_SZ(pPager)  (((pPager)->pageSize) + 8)
361 
/*
** The journal header size for this pager. This is usually the same
** size as a single disk sector. See also setSectorSize().
**
** The macro evaluates to Pager.sectorSize.  The argument is a pointer to
** a Pager and is parenthesized in the expansion so that any
** pointer-valued expression (e.g. "&pager") may be passed.
*/
#define JOURNAL_HDR_SZ(pPager) ((pPager)->sectorSize)
367 
368 /*
369 ** The macro MEMDB is true if we are dealing with an in-memory database.
370 ** We do this as a macro so that if the SQLITE_OMIT_MEMORYDB macro is set,
371 ** the value of MEMDB will be a constant and the compiler will optimize
372 ** out code that would never execute.
**
** Note that when SQLITE_OMIT_MEMORYDB is not defined, MEMDB expands to
** "pPager->memDb", so it can only be used where a variable named pPager
** is in scope.
373 */
374 #ifdef SQLITE_OMIT_MEMORYDB
375 # define MEMDB 0
376 #else
377 # define MEMDB pPager->memDb
378 #endif
379 
380 /*
381 ** The maximum legal page number is (2^31 - 1), i.e. 0x7fffffff.
382 */
383 #define PAGER_MAX_PGNO 2147483647
384 
385 #ifndef NDEBUG
386 /*
387 ** Usage:
388 **
389 **   assert( assert_pager_state(pPager) );
390 */
391 static int assert_pager_state(Pager *pPager){
392 
393   /* A temp-file is always in PAGER_EXCLUSIVE or PAGER_SYNCED state. */
394   assert( pPager->tempFile==0 || pPager->state>=PAGER_EXCLUSIVE );
395 
396   /* The changeCountDone flag is always set for temp-files */
397   assert( pPager->tempFile==0 || pPager->changeCountDone );
398 
399   return 1;
400 }
401 #endif
402 
403 /*
404 ** Return true if it is necessary to write page *pPg into the sub-journal.
405 ** A page needs to be written into the sub-journal if there exists one
406 ** or more open savepoints for which:
407 **
408 **   * The page-number is less than or equal to PagerSavepoint.nOrig, and
409 **   * The bit corresponding to the page-number is not set in
410 **     PagerSavepoint.pInSavepoint.
411 */
412 static int subjRequiresPage(PgHdr *pPg){
413   Pgno pgno = pPg->pgno;
414   Pager *pPager = pPg->pPager;
415   int i;
416   for(i=0; i<pPager->nSavepoint; i++){
417     PagerSavepoint *p = &pPager->aSavepoint[i];
418     if( p->nOrig>=pgno && 0==sqlite3BitvecTest(p->pInSavepoint, pgno) ){
419       return 1;
420     }
421   }
422   return 0;
423 }
424 
425 /*
426 ** Return true if the page is already in the journal file.
427 */
428 static int pageInJournal(PgHdr *pPg){
429   return sqlite3BitvecTest(pPg->pPager->pInJournal, pPg->pgno);
430 }
431 
432 /*
433 ** Read a 32-bit integer from the given file descriptor.  Store the integer
434 ** that is read in *pRes.  Return SQLITE_OK if everything worked, or an
435 ** error code is something goes wrong.
436 **
437 ** All values are stored on disk as big-endian.
438 */
439 static int read32bits(sqlite3_file *fd, i64 offset, u32 *pRes){
440   unsigned char ac[4];
441   int rc = sqlite3OsRead(fd, ac, sizeof(ac), offset);
442   if( rc==SQLITE_OK ){
443     *pRes = sqlite3Get4byte(ac);
444   }
445   return rc;
446 }
447 
/*
** Write a 32-bit integer into a string buffer in big-endian byte order.
**
** A is the destination buffer and B the value to store; the work is done
** by sqlite3Put4byte().  A is parenthesized before being cast to u8* so
** that a pointer expression such as "pBuf+1" is converted as a whole.
*/
#define put32bits(A,B)  sqlite3Put4byte((u8*)(A),(B))
452 
453 /*
454 ** Write a 32-bit integer into the given file descriptor.  Return SQLITE_OK
455 ** on success or an error code is something goes wrong.
456 */
457 static int write32bits(sqlite3_file *fd, i64 offset, u32 val){
458   char ac[4];
459   put32bits(ac, val);
460   return sqlite3OsWrite(fd, ac, 4, offset);
461 }
462 
463 /*
464 ** The argument to this macro is a file descriptor (type sqlite3_file*).
465 ** Return 0 if it is not open, or non-zero (but not 1) if it is.
**
** The value of the macro is the descriptor's pMethods pointer, which is
** why the "open" result is not 1: test it for truth, do not compare it
** against 1.
466 **
467 ** This is so that expressions can be written as:
468 **
469 **   if( isOpen(pPager->jfd) ){ ...
470 **
471 ** instead of
472 **
473 **   if( pPager->jfd->pMethods ){ ...
474 */
475 #define isOpen(pFd) ((pFd)->pMethods)
476 
477 /*
478 ** If file pFd is open, call sqlite3OsUnlock() on it.
479 */
480 static int osUnlock(sqlite3_file *pFd, int eLock){
481   if( !isOpen(pFd) ){
482     return SQLITE_OK;
483   }
484   return sqlite3OsUnlock(pFd, eLock);
485 }
486 
/*
** This function determines whether or not the atomic-write optimization
** can be used with this pager. The optimization can be used if:
**
**  (a) the value returned by OsDeviceCharacteristics() indicates that
**      a database page may be written atomically, and
**  (b) the value returned by OsSectorSize() is less than or equal
**      to the page size.
**
** The optimization is also always enabled for temporary files. It is
** an error to call this function if pPager is opened on an in-memory
** database.
**
** If the optimization cannot be used, 0 is returned. If it can be used,
** then the value returned is the size of the journal file when it
** contains rollback data for exactly one page.
**
** Implementation note: the rejection test is written as
** "!(a) || !(b)".  Previously the closing parenthesis of the
** device-characteristics test was misplaced so that the expression read
** "0==((dc&mask) || nSector>szPage)", which enabled the optimization
** whenever the sector size exceeded the page size, contradicting
** condition (b) above.
*/
#ifdef SQLITE_ENABLE_ATOMIC_WRITE
static int jrnlBufferSize(Pager *pPager){
  assert( !MEMDB );
  if( !pPager->tempFile ){
    int dc;                           /* Device characteristics */
    int nSector;                      /* Sector size */
    int szPage;                       /* Page size */

    assert( isOpen(pPager->fd) );
    dc = sqlite3OsDeviceCharacteristics(pPager->fd);
    nSector = pPager->sectorSize;
    szPage = pPager->pageSize;

    /* The SQLITE_IOCAP_ATOMICnnn bits are laid out so that (szPage>>8)
    ** is the capability bit for atomic writes of szPage bytes. */
    assert(SQLITE_IOCAP_ATOMIC512==(512>>8));
    assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8));
    if( 0==(dc&(SQLITE_IOCAP_ATOMIC|(szPage>>8))) || nSector>szPage ){
      return 0;
    }
  }

  return JOURNAL_HDR_SZ(pPager) + JOURNAL_PG_SZ(pPager);
}
#endif
527 
528 /*
529 ** If SQLITE_CHECK_PAGES is defined then we do some sanity checking
530 ** on the cache using a hash function.  This is used for testing
531 ** and debugging only.
532 */
533 #ifdef SQLITE_CHECK_PAGES
/*
** Return a 32-bit hash of the nByte bytes of data starting at pData.
** The hash starts at zero and, for each byte in order, is multiplied by
** 1039 and then has the byte value added to it.
*/
static u32 pager_datahash(int nByte, unsigned char *pData){
  u32 h = 0;
  int k = 0;
  while( k<nByte ){
    h = h*1039 + pData[k];
    k++;
  }
  return h;
}
545 static u32 pager_pagehash(PgHdr *pPage){
546   return pager_datahash(pPage->pPager->pageSize, (unsigned char *)pPage->pData);
547 }
548 static void pager_set_pagehash(PgHdr *pPage){
549   pPage->pageHash = pager_pagehash(pPage);
550 }
551 
552 /*
553 ** The CHECK_PAGE macro takes a PgHdr* as an argument. If SQLITE_CHECK_PAGES
554 ** is defined, and NDEBUG is not defined, an assert() statement checks
555 ** that the page is either dirty or still matches the calculated page-hash.
556 */
557 #define CHECK_PAGE(x) checkPage(x)
558 static void checkPage(PgHdr *pPg){
559   Pager *pPager = pPg->pPager;
560   assert( !pPg->pageHash || pPager->errCode
561       || (pPg->flags&PGHDR_DIRTY) || pPg->pageHash==pager_pagehash(pPg) );
562 }
563 
564 #else
/*
** SQLITE_CHECK_PAGES is not defined: both hash routines evaluate to the
** constant 0 and CHECK_PAGE() expands to nothing.
**
** NOTE(review): no fallback is defined here for pager_set_pagehash();
** presumably every use of it is itself guarded by SQLITE_CHECK_PAGES -
** confirm against the callers.
*/
565 #define pager_datahash(X,Y)  0
566 #define pager_pagehash(X)  0
567 #define CHECK_PAGE(x)
568 #endif  /* SQLITE_CHECK_PAGES */
569 
570 /*
571 ** When this is called the journal file for pager pPager must be open.
572 ** This function attempts to read a master journal file name from the
573 ** end of the file and, if successful, copies it into memory supplied
574 ** by the caller. See comments above writeMasterJournal() for the format
575 ** used to store a master journal file name at the end of a journal file.
576 **
577 ** zMaster must point to a buffer of at least nMaster bytes allocated by
578 ** the caller. This should be sqlite3_vfs.mxPathname+1 (to ensure there is
579 ** enough space to write the master journal name). If the master journal
580 ** name in the journal is longer than nMaster bytes (including a
581 ** nul-terminator), then this is handled as if no master journal name
582 ** were present in the journal.
583 **
584 ** If a master journal file name is present at the end of the journal
585 ** file, then it is copied into the buffer pointed to by zMaster. A
586 ** nul-terminator byte is appended to the buffer following the master
587 ** journal file name.
588 **
589 ** If it is determined that no master journal file name is present
590 ** zMaster[0] is set to 0 and SQLITE_OK returned.
591 **
592 ** If an error occurs while reading from the journal file, an SQLite
593 ** error code is returned.
594 */
595 static int readMasterJournal(sqlite3_file *pJrnl, char *zMaster, u32 nMaster){
596   int rc;                    /* Return code */
597   u32 len;                   /* Length in bytes of master journal name */
598   i64 szJ;                   /* Total size in bytes of journal file pJrnl */
599   u32 cksum;                 /* MJ checksum value read from journal */
600   u32 u;                     /* Unsigned loop counter */
601   unsigned char aMagic[8];   /* A buffer to hold the magic header */
602   zMaster[0] = '\0';
603 
604   if( SQLITE_OK!=(rc = sqlite3OsFileSize(pJrnl, &szJ))
605    || szJ<16
606    || SQLITE_OK!=(rc = read32bits(pJrnl, szJ-16, &len))
607    || len>=nMaster
608    || SQLITE_OK!=(rc = read32bits(pJrnl, szJ-12, &cksum))
609    || SQLITE_OK!=(rc = sqlite3OsRead(pJrnl, aMagic, 8, szJ-8))
610    || memcmp(aMagic, aJournalMagic, 8)
611    || SQLITE_OK!=(rc = sqlite3OsRead(pJrnl, zMaster, len, szJ-16-len))
612   ){
613     return rc;
614   }
615 
616   /* See if the checksum matches the master journal name */
617   for(u=0; u<len; u++){
618     cksum -= zMaster[u];
619   }
620   if( cksum ){
621     /* If the checksum doesn't add up, then one or more of the disk sectors
622     ** containing the master journal filename is corrupted. This means
623     ** definitely roll back, so just return SQLITE_OK and report a (nul)
624     ** master-journal filename.
625     */
626     len = 0;
627   }
628   zMaster[len] = '\0';
629 
630   return SQLITE_OK;
631 }
632 
633 /*
634 ** Return the offset of the sector boundary at or immediately
635 ** following the value in pPager->journalOff, assuming a sector
636 ** size of pPager->sectorSize bytes.
637 **
638 ** i.e for a sector size of 512:
639 **
640 **   Pager.journalOff          Return value
641 **   ---------------------------------------
642 **   0                         0
643 **   512                       512
644 **   100                       512
645 **   2000                      2048
646 **
647 */
648 static i64 journalHdrOffset(Pager *pPager){
649   i64 offset = 0;
650   i64 c = pPager->journalOff;
651   if( c ){
652     offset = ((c-1)/JOURNAL_HDR_SZ(pPager) + 1) * JOURNAL_HDR_SZ(pPager);
653   }
654   assert( offset%JOURNAL_HDR_SZ(pPager)==0 );
655   assert( offset>=c );
656   assert( (offset-c)<JOURNAL_HDR_SZ(pPager) );
657   return offset;
658 }
659 
660 /*
661 ** The journal file must be open when this function is called.
662 **
663 ** This function is a no-op if the journal file has not been written to
664 ** within the current transaction (i.e. if Pager.journalOff==0).
665 **
666 ** If doTruncate is non-zero or the Pager.journalSizeLimit variable is
667 ** set to 0, then truncate the journal file to zero bytes in size. Otherwise,
668 ** zero the 28-byte header at the start of the journal file. In either case,
669 ** if the pager is not in no-sync mode, sync the journal file immediately
670 ** after writing or truncating it.
671 **
672 ** If Pager.journalSizeLimit is set to a positive, non-zero value, and
673 ** following the truncation or zeroing described above the size of the
674 ** journal file in bytes is larger than this value, then truncate the
675 ** journal file to Pager.journalSizeLimit bytes. The journal file does
676 ** not need to be synced following this operation.
677 **
678 ** If an IO error occurs, abandon processing and return the IO error code.
679 ** Otherwise, return SQLITE_OK.
680 */
681 static int zeroJournalHdr(Pager *pPager, int doTruncate){
682   int rc = SQLITE_OK;                               /* Return code */
683   assert( isOpen(pPager->jfd) );
684   if( pPager->journalOff ){
685     const i64 iLimit = pPager->journalSizeLimit;    /* Local cache of jsl */
686 
687     IOTRACE(("JZEROHDR %p\n", pPager))
688     if( doTruncate || iLimit==0 ){
689       rc = sqlite3OsTruncate(pPager->jfd, 0);
690     }else{
691       static const char zeroHdr[28] = {0};
692       rc = sqlite3OsWrite(pPager->jfd, zeroHdr, sizeof(zeroHdr), 0);
693     }
694     if( rc==SQLITE_OK && !pPager->noSync ){
695       rc = sqlite3OsSync(pPager->jfd, SQLITE_SYNC_DATAONLY|pPager->sync_flags);
696     }
697 
698     /* At this point the transaction is committed but the write lock
699     ** is still held on the file. If there is a size limit configured for
700     ** the persistent journal and the journal file currently consumes more
701     ** space than that limit allows for, truncate it now. There is no need
702     ** to sync the file following this operation.
703     */
704     if( rc==SQLITE_OK && iLimit>0 ){
705       i64 sz;
706       rc = sqlite3OsFileSize(pPager->jfd, &sz);
707       if( rc==SQLITE_OK && sz>iLimit ){
708         rc = sqlite3OsTruncate(pPager->jfd, iLimit);
709       }
710     }
711   }
712   return rc;
713 }
714 
715 /*
716 ** The journal file must be open when this routine is called. A journal
717 ** header (JOURNAL_HDR_SZ bytes) is written into the journal file at the
718 ** current location.
719 **
720 ** The format for the journal header is as follows:
721 ** - 8 bytes: Magic identifying journal format.
722 ** - 4 bytes: Number of records in journal, or -1 no-sync mode is on.
723 ** - 4 bytes: Random number used for page hash.
724 ** - 4 bytes: Initial database page count.
725 ** - 4 bytes: Sector size used by the process that wrote this journal.
726 ** - 4 bytes: Database page size.
727 **
728 ** Followed by (JOURNAL_HDR_SZ - 28) bytes of unused space.
729 */
730 static int writeJournalHdr(Pager *pPager){
731   int rc = SQLITE_OK;                 /* Return code */
732   char *zHeader = pPager->pTmpSpace;  /* Temporary space used to build header */
733   u32 nHeader = pPager->pageSize;     /* Size of buffer pointed to by zHeader */
734   u32 nWrite;                         /* Bytes of header sector written */
735   int ii;                             /* Loop counter */
736 
737   assert( isOpen(pPager->jfd) );      /* Journal file must be open. */
738 
739   if( nHeader>JOURNAL_HDR_SZ(pPager) ){
740     nHeader = JOURNAL_HDR_SZ(pPager);
741   }
742 
743   /* If there are active savepoints and any of them were created
744   ** since the most recent journal header was written, update the
745   ** PagerSavepoint.iHdrOffset fields now.
746   */
747   for(ii=0; ii<pPager->nSavepoint; ii++){
748     if( pPager->aSavepoint[ii].iHdrOffset==0 ){
749       pPager->aSavepoint[ii].iHdrOffset = pPager->journalOff;
750     }
751   }
752 
753   pPager->journalHdr = pPager->journalOff = journalHdrOffset(pPager);
754   memcpy(zHeader, aJournalMagic, sizeof(aJournalMagic));
755 
756   /*
757   ** Write the nRec Field - the number of page records that follow this
758   ** journal header. Normally, zero is written to this value at this time.
759   ** After the records are added to the journal (and the journal synced,
760   ** if in full-sync mode), the zero is overwritten with the true number
761   ** of records (see syncJournal()).
762   **
763   ** A faster alternative is to write 0xFFFFFFFF to the nRec field. When
764   ** reading the journal this value tells SQLite to assume that the
765   ** rest of the journal file contains valid page records. This assumption
766   ** is dangerous, as if a failure occurred whilst writing to the journal
767   ** file it may contain some garbage data. There are two scenarios
768   ** where this risk can be ignored:
769   **
770   **   * When the pager is in no-sync mode. Corruption can follow a
771   **     power failure in this case anyway.
772   **
773   **   * When the SQLITE_IOCAP_SAFE_APPEND flag is set. This guarantees
774   **     that garbage data is never appended to the journal file.
775   */
776   assert( isOpen(pPager->fd) || pPager->noSync );
777   if( (pPager->noSync) || (pPager->journalMode==PAGER_JOURNALMODE_MEMORY)
778    || (sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND)
779   ){
780     put32bits(&zHeader[sizeof(aJournalMagic)], 0xffffffff);
781   }else{
782     put32bits(&zHeader[sizeof(aJournalMagic)], 0);
783   }
784 
785   /* The random check-hash initialiser */
786   sqlite3_randomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
787   put32bits(&zHeader[sizeof(aJournalMagic)+4], pPager->cksumInit);
788   /* The initial database size */
789   put32bits(&zHeader[sizeof(aJournalMagic)+8], pPager->dbOrigSize);
790   /* The assumed sector size for this process */
791   put32bits(&zHeader[sizeof(aJournalMagic)+12], pPager->sectorSize);
792 
793   /* The page size */
794   put32bits(&zHeader[sizeof(aJournalMagic)+16], pPager->pageSize);
795 
796   /* Initializing the tail of the buffer is not necessary.  Everything
797   ** works find if the following memset() is omitted.  But initializing
798   ** the memory prevents valgrind from complaining, so we are willing to
799   ** take the performance hit.
800   */
801   memset(&zHeader[sizeof(aJournalMagic)+20], 0,
802          nHeader-(sizeof(aJournalMagic)+20));
803 
804   /* In theory, it is only necessary to write the 28 bytes that the
805   ** journal header consumes to the journal file here. Then increment the
806   ** Pager.journalOff variable by JOURNAL_HDR_SZ so that the next
807   ** record is written to the following sector (leaving a gap in the file
808   ** that will be implicitly filled in by the OS).
809   **
810   ** However it has been discovered that on some systems this pattern can
811   ** be significantly slower than contiguously writing data to the file,
812   ** even if that means explicitly writing data to the block of
813   ** (JOURNAL_HDR_SZ - 28) bytes that will not be used. So that is what
814   ** is done.
815   **
816   ** The loop is required here in case the sector-size is larger than the
817   ** database page size. Since the zHeader buffer is only Pager.pageSize
818   ** bytes in size, more than one call to sqlite3OsWrite() may be required
819   ** to populate the entire journal header sector.
820   */
821   for(nWrite=0; rc==SQLITE_OK&&nWrite<JOURNAL_HDR_SZ(pPager); nWrite+=nHeader){
822     IOTRACE(("JHDR %p %lld %d\n", pPager, pPager->journalHdr, nHeader))
823     rc = sqlite3OsWrite(pPager->jfd, zHeader, nHeader, pPager->journalOff);
824     pPager->journalOff += nHeader;
825   }
826 
827   return rc;
828 }
829 
830 /*
831 ** The journal file must be open when this is called. A journal header file
832 ** (JOURNAL_HDR_SZ bytes) is read from the current location in the journal
833 ** file. The current location in the journal file is given by
834 ** pPager->journalOff. See comments above function writeJournalHdr() for
835 ** a description of the journal header format.
836 **
837 ** If the header is read successfully, *pNRec is set to the number of
838 ** page records following this header and *pDbSize is set to the size of the
839 ** database before the transaction began, in pages. Also, pPager->cksumInit
840 ** is set to the value read from the journal header. SQLITE_OK is returned
841 ** in this case.
842 **
843 ** If the journal header file appears to be corrupted, SQLITE_DONE is
844 ** returned and *pNRec and *PDbSize are undefined.  If JOURNAL_HDR_SZ bytes
845 ** cannot be read from the journal file an error code is returned.
846 */
847 static int readJournalHdr(
848   Pager *pPager,               /* Pager object */
849   i64 journalSize,             /* Size of the open journal file in bytes */
850   u32 *pNRec,                  /* OUT: Value read from the nRec field */
851   u32 *pDbSize                 /* OUT: Value of original database size field */
852 ){
853   int rc;                      /* Return code */
854   unsigned char aMagic[8];     /* A buffer to hold the magic header */
855   i64 iHdrOff;                 /* Offset of journal header being read */
856 
857   assert( isOpen(pPager->jfd) );      /* Journal file must be open. */
858 
859   /* Advance Pager.journalOff to the start of the next sector. If the
860   ** journal file is too small for there to be a header stored at this
861   ** point, return SQLITE_DONE.
862   */
863   pPager->journalOff = journalHdrOffset(pPager);
864   if( pPager->journalOff+JOURNAL_HDR_SZ(pPager) > journalSize ){
865     return SQLITE_DONE;
866   }
867   iHdrOff = pPager->journalOff;
868 
869   /* Read in the first 8 bytes of the journal header. If they do not match
870   ** the  magic string found at the start of each journal header, return
871   ** SQLITE_DONE. If an IO error occurs, return an error code. Otherwise,
872   ** proceed.
873   */
874   rc = sqlite3OsRead(pPager->jfd, aMagic, sizeof(aMagic), iHdrOff);
875   if( rc ){
876     return rc;
877   }
878   if( memcmp(aMagic, aJournalMagic, sizeof(aMagic))!=0 ){
879     return SQLITE_DONE;
880   }
881 
882   /* Read the first three 32-bit fields of the journal header: The nRec
883   ** field, the checksum-initializer and the database size at the start
884   ** of the transaction. Return an error code if anything goes wrong.
885   */
886   if( SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+8, pNRec))
887    || SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+12, &pPager->cksumInit))
888    || SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+16, pDbSize))
889   ){
890     return rc;
891   }
892 
893   if( pPager->journalOff==0 ){
894     u32 iPageSize;               /* Page-size field of journal header */
895     u32 iSectorSize;             /* Sector-size field of journal header */
896     u16 iPageSize16;             /* Copy of iPageSize in 16-bit variable */
897 
898     /* Read the page-size and sector-size journal header fields. */
899     if( SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+20, &iSectorSize))
900      || SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+24, &iPageSize))
901     ){
902       return rc;
903     }
904 
905     /* Check that the values read from the page-size and sector-size fields
906     ** are within range. To be 'in range', both values need to be a power
907     ** of two greater than or equal to 512, and not greater than their
908     ** respective compile time maximum limits.
909     */
910     if( iPageSize<512                  || iSectorSize<512
911      || iPageSize>SQLITE_MAX_PAGE_SIZE || iSectorSize>MAX_SECTOR_SIZE
912      || ((iPageSize-1)&iPageSize)!=0   || ((iSectorSize-1)&iSectorSize)!=0
913     ){
914       /* If the either the page-size or sector-size in the journal-header is
915       ** invalid, then the process that wrote the journal-header must have
916       ** crashed before the header was synced. In this case stop reading
917       ** the journal file here.
918       */
919       return SQLITE_DONE;
920     }
921 
922     /* Update the page-size to match the value read from the journal.
923     ** Use a testcase() macro to make sure that malloc failure within
924     ** PagerSetPagesize() is tested.
925     */
926     iPageSize16 = (u16)iPageSize;
927     rc = sqlite3PagerSetPagesize(pPager, &iPageSize16);
928     testcase( rc!=SQLITE_OK );
929     assert( rc!=SQLITE_OK || iPageSize16==(u16)iPageSize );
930 
931     /* Update the assumed sector-size to match the value used by
932     ** the process that created this journal. If this journal was
933     ** created by a process other than this one, then this routine
934     ** is being called from within pager_playback(). The local value
935     ** of Pager.sectorSize is restored at the end of that routine.
936     */
937     pPager->sectorSize = iSectorSize;
938   }
939 
940   pPager->journalOff += JOURNAL_HDR_SZ(pPager);
941   return rc;
942 }
943 
944 
945 /*
946 ** Write the supplied master journal name into the journal file for pager
947 ** pPager at the current location. The master journal name must be the last
948 ** thing written to a journal file. If the pager is in full-sync mode, the
949 ** journal file descriptor is advanced to the next sector boundary before
950 ** anything is written. The format is:
951 **
952 **   + 4 bytes: PAGER_MJ_PGNO.
953 **   + N bytes: Master journal filename in utf-8.
954 **   + 4 bytes: N (length of master journal name in bytes, no nul-terminator).
955 **   + 4 bytes: Master journal name checksum.
956 **   + 8 bytes: aJournalMagic[].
957 **
958 ** The master journal page checksum is the sum of the bytes in the master
959 ** journal name, where each byte is interpreted as a signed 8-bit integer.
960 **
961 ** If zMaster is a NULL pointer (occurs for a single database transaction),
962 ** this call is a no-op.
963 */
964 static int writeMasterJournal(Pager *pPager, const char *zMaster){
965   int rc;                          /* Return code */
966   int nMaster;                     /* Length of string zMaster */
967   i64 iHdrOff;                     /* Offset of header in journal file */
968   i64 jrnlSize;                    /* Size of journal file on disk */
969   u32 cksum = 0;                   /* Checksum of string zMaster */
970 
971   if( !zMaster || pPager->setMaster
972    || pPager->journalMode==PAGER_JOURNALMODE_MEMORY
973    || pPager->journalMode==PAGER_JOURNALMODE_OFF
974   ){
975     return SQLITE_OK;
976   }
977   pPager->setMaster = 1;
978   assert( isOpen(pPager->jfd) );
979 
980   /* Calculate the length in bytes and the checksum of zMaster */
981   for(nMaster=0; zMaster[nMaster]; nMaster++){
982     cksum += zMaster[nMaster];
983   }
984 
985   /* If in full-sync mode, advance to the next disk sector before writing
986   ** the master journal name. This is in case the previous page written to
987   ** the journal has already been synced.
988   */
989   if( pPager->fullSync ){
990     pPager->journalOff = journalHdrOffset(pPager);
991   }
992   iHdrOff = pPager->journalOff;
993 
994   /* Write the master journal data to the end of the journal file. If
995   ** an error occurs, return the error code to the caller.
996   */
997   if( (0 != (rc = write32bits(pPager->jfd, iHdrOff, PAGER_MJ_PGNO(pPager))))
998    || (0 != (rc = sqlite3OsWrite(pPager->jfd, zMaster, nMaster, iHdrOff+4)))
999    || (0 != (rc = write32bits(pPager->jfd, iHdrOff+4+nMaster, nMaster)))
1000    || (0 != (rc = write32bits(pPager->jfd, iHdrOff+4+nMaster+4, cksum)))
1001    || (0 != (rc = sqlite3OsWrite(pPager->jfd, aJournalMagic, 8, iHdrOff+4+nMaster+8)))
1002   ){
1003     return rc;
1004   }
1005   pPager->journalOff += (nMaster+20);
1006   pPager->needSync = !pPager->noSync;
1007 
1008   /* If the pager is in peristent-journal mode, then the physical
1009   ** journal-file may extend past the end of the master-journal name
1010   ** and 8 bytes of magic data just written to the file. This is
1011   ** dangerous because the code to rollback a hot-journal file
1012   ** will not be able to find the master-journal name to determine
1013   ** whether or not the journal is hot.
1014   **
1015   ** Easiest thing to do in this scenario is to truncate the journal
1016   ** file to the required size.
1017   */
1018   if( SQLITE_OK==(rc = sqlite3OsFileSize(pPager->jfd, &jrnlSize))
1019    && jrnlSize>pPager->journalOff
1020   ){
1021     rc = sqlite3OsTruncate(pPager->jfd, pPager->journalOff);
1022   }
1023   return rc;
1024 }
1025 
1026 /*
1027 ** Find a page in the hash table given its page number. Return
1028 ** a pointer to the page or NULL if the requested page is not
1029 ** already in memory.
1030 */
1031 static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
1032   PgHdr *p;                         /* Return value */
1033 
1034   /* It is not possible for a call to PcacheFetch() with createFlag==0 to
1035   ** fail, since no attempt to allocate dynamic memory will be made.
1036   */
1037   (void)sqlite3PcacheFetch(pPager->pPCache, pgno, 0, &p);
1038   return p;
1039 }
1040 
1041 /*
1042 ** Unless the pager is in error-state, discard all in-memory pages. If
1043 ** the pager is in error-state, then this call is a no-op.
1044 **
1045 ** TODO: Why can we not reset the pager while in error state?
1046 */
1047 static void pager_reset(Pager *pPager){
1048   if( SQLITE_OK==pPager->errCode ){
1049     sqlite3BackupRestart(pPager->pBackup);
1050     sqlite3PcacheClear(pPager->pPCache);
1051     pPager->dbSizeValid = 0;
1052   }
1053 }
1054 
1055 /*
1056 ** Free all structures in the Pager.aSavepoint[] array and set both
1057 ** Pager.aSavepoint and Pager.nSavepoint to zero. Close the sub-journal
1058 ** if it is open and the pager is not in exclusive mode.
1059 */
1060 static void releaseAllSavepoints(Pager *pPager){
1061   int ii;               /* Iterator for looping through Pager.aSavepoint */
1062   for(ii=0; ii<pPager->nSavepoint; ii++){
1063     sqlite3BitvecDestroy(pPager->aSavepoint[ii].pInSavepoint);
1064   }
1065   if( !pPager->exclusiveMode || sqlite3IsMemJournal(pPager->sjfd) ){
1066     sqlite3OsClose(pPager->sjfd);
1067   }
1068   sqlite3_free(pPager->aSavepoint);
1069   pPager->aSavepoint = 0;
1070   pPager->nSavepoint = 0;
1071   pPager->nSubRec = 0;
1072 }
1073 
1074 /*
1075 ** Set the bit number pgno in the PagerSavepoint.pInSavepoint
1076 ** bitvecs of all open savepoints. Return SQLITE_OK if successful
1077 ** or SQLITE_NOMEM if a malloc failure occurs.
1078 */
1079 static int addToSavepointBitvecs(Pager *pPager, Pgno pgno){
1080   int ii;                   /* Loop counter */
1081   int rc = SQLITE_OK;       /* Result code */
1082 
1083   for(ii=0; ii<pPager->nSavepoint; ii++){
1084     PagerSavepoint *p = &pPager->aSavepoint[ii];
1085     if( pgno<=p->nOrig ){
1086       rc |= sqlite3BitvecSet(p->pInSavepoint, pgno);
1087       testcase( rc==SQLITE_NOMEM );
1088       assert( rc==SQLITE_OK || rc==SQLITE_NOMEM );
1089     }
1090   }
1091   return rc;
1092 }
1093 
1094 /*
1095 ** Unlock the database file. This function is a no-op if the pager
1096 ** is in exclusive mode.
1097 **
1098 ** If the pager is currently in error state, discard the contents of
1099 ** the cache and reset the Pager structure internal state. If there is
1100 ** an open journal-file, then the next time a shared-lock is obtained
1101 ** on the pager file (by this or any other process), it will be
1102 ** treated as a hot-journal and rolled back.
1103 */
1104 static void pager_unlock(Pager *pPager){
1105   if( !pPager->exclusiveMode ){
1106     int rc;                      /* Return code */
1107 
1108     /* Always close the journal file when dropping the database lock.
1109     ** Otherwise, another connection with journal_mode=delete might
1110     ** delete the file out from under us.
1111     */
1112     sqlite3OsClose(pPager->jfd);
1113     sqlite3BitvecDestroy(pPager->pInJournal);
1114     pPager->pInJournal = 0;
1115     releaseAllSavepoints(pPager);
1116 
1117     /* If the file is unlocked, somebody else might change it. The
1118     ** values stored in Pager.dbSize etc. might become invalid if
1119     ** this happens. TODO: Really, this doesn't need to be cleared
1120     ** until the change-counter check fails in pagerSharedLock().
1121     */
1122     pPager->dbSizeValid = 0;
1123 
1124     rc = osUnlock(pPager->fd, NO_LOCK);
1125     if( rc ){
1126       pPager->errCode = rc;
1127     }
1128     IOTRACE(("UNLOCK %p\n", pPager))
1129 
1130     /* If Pager.errCode is set, the contents of the pager cache cannot be
1131     ** trusted. Now that the pager file is unlocked, the contents of the
1132     ** cache can be discarded and the error code safely cleared.
1133     */
1134     if( pPager->errCode ){
1135       if( rc==SQLITE_OK ){
1136         pPager->errCode = SQLITE_OK;
1137       }
1138       pager_reset(pPager);
1139     }
1140 
1141     pPager->changeCountDone = 0;
1142     pPager->state = PAGER_UNLOCK;
1143   }
1144 }
1145 
1146 /*
1147 ** This function should be called when an IOERR, CORRUPT or FULL error
1148 ** may have occurred. The first argument is a pointer to the pager
1149 ** structure, the second the error-code about to be returned by a pager
1150 ** API function. The value returned is a copy of the second argument
1151 ** to this function.
1152 **
1153 ** If the second argument is SQLITE_IOERR, SQLITE_CORRUPT, or SQLITE_FULL
1154 ** the error becomes persistent. Until the persisten error is cleared,
1155 ** subsequent API calls on this Pager will immediately return the same
1156 ** error code.
1157 **
1158 ** A persistent error indicates that the contents of the pager-cache
1159 ** cannot be trusted. This state can be cleared by completely discarding
1160 ** the contents of the pager-cache. If a transaction was active when
1161 ** the persistent error occurred, then the rollback journal may need
1162 ** to be replayed to restore the contents of the database file (as if
1163 ** it were a hot-journal).
1164 */
1165 static int pager_error(Pager *pPager, int rc){
1166   int rc2 = rc & 0xff;
1167   assert(
1168        pPager->errCode==SQLITE_FULL ||
1169        pPager->errCode==SQLITE_OK ||
1170        (pPager->errCode & 0xff)==SQLITE_IOERR
1171   );
1172   if(
1173     rc2==SQLITE_FULL ||
1174     rc2==SQLITE_IOERR ||
1175     rc2==SQLITE_CORRUPT
1176   ){
1177     pPager->errCode = rc;
1178     if( pPager->state==PAGER_UNLOCK
1179      && sqlite3PcacheRefCount(pPager->pPCache)==0
1180     ){
1181       /* If the pager is already unlocked, call pager_unlock() now to
1182       ** clear the error state and ensure that the pager-cache is
1183       ** completely empty.
1184       */
1185       pager_unlock(pPager);
1186     }
1187   }
1188   return rc;
1189 }
1190 
1191 /*
1192 ** Execute a rollback if a transaction is active and unlock the
1193 ** database file.
1194 **
1195 ** If the pager has already entered the error state, do not attempt
1196 ** the rollback at this time. Instead, pager_unlock() is called. The
1197 ** call to pager_unlock() will discard all in-memory pages, unlock
1198 ** the database file and clear the error state. If this means that
1199 ** there is a hot-journal left in the file-system, the next connection
1200 ** to obtain a shared lock on the pager (which may be this one) will
1201 ** roll it back.
1202 **
1203 ** If the pager has not already entered the error state, but an IO or
1204 ** malloc error occurs during a rollback, then this will itself cause
1205 ** the pager to enter the error state. Which will be cleared by the
1206 ** call to pager_unlock(), as described above.
1207 */
1208 static void pagerUnlockAndRollback(Pager *pPager){
1209   if( pPager->errCode==SQLITE_OK && pPager->state>=PAGER_RESERVED ){
1210     sqlite3BeginBenignMalloc();
1211     sqlite3PagerRollback(pPager);
1212     sqlite3EndBenignMalloc();
1213   }
1214   pager_unlock(pPager);
1215 }
1216 
1217 /*
1218 ** This routine ends a transaction. A transaction is usually ended by
1219 ** either a COMMIT or a ROLLBACK operation. This routine may be called
1220 ** after rollback of a hot-journal, or if an error occurs while opening
1221 ** the journal file or writing the very first journal-header of a
1222 ** database transaction.
1223 **
1224 ** If the pager is in PAGER_SHARED or PAGER_UNLOCK state when this
1225 ** routine is called, it is a no-op (returns SQLITE_OK).
1226 **
1227 ** Otherwise, any active savepoints are released.
1228 **
1229 ** If the journal file is open, then it is "finalized". Once a journal
1230 ** file has been finalized it is not possible to use it to roll back a
1231 ** transaction. Nor will it be considered to be a hot-journal by this
1232 ** or any other database connection. Exactly how a journal is finalized
1233 ** depends on whether or not the pager is running in exclusive mode and
1234 ** the current journal-mode (Pager.journalMode value), as follows:
1235 **
1236 **   journalMode==MEMORY
1237 **     Journal file descriptor is simply closed. This destroys an
1238 **     in-memory journal.
1239 **
1240 **   journalMode==TRUNCATE
1241 **     Journal file is truncated to zero bytes in size.
1242 **
1243 **   journalMode==PERSIST
1244 **     The first 28 bytes of the journal file are zeroed. This invalidates
1245 **     the first journal header in the file, and hence the entire journal
1246 **     file. An invalid journal file cannot be rolled back.
1247 **
1248 **   journalMode==DELETE
1249 **     The journal file is closed and deleted using sqlite3OsDelete().
1250 **
1251 **     If the pager is running in exclusive mode, this method of finalizing
1252 **     the journal file is never used. Instead, if the journalMode is
1253 **     DELETE and the pager is in exclusive mode, the method described under
1254 **     journalMode==PERSIST is used instead.
1255 **
1256 ** After the journal is finalized, if running in non-exclusive mode, the
1257 ** pager moves to PAGER_SHARED state (and downgrades the lock on the
1258 ** database file accordingly).
1259 **
1260 ** If the pager is running in exclusive mode and is in PAGER_SYNCED state,
1261 ** it moves to PAGER_EXCLUSIVE. No locks are downgraded when running in
1262 ** exclusive mode.
1263 **
1264 ** SQLITE_OK is returned if no error occurs. If an error occurs during
1265 ** any of the IO operations to finalize the journal file or unlock the
1266 ** database then the IO error code is returned to the user. If the
1267 ** operation to finalize the journal file fails, then the code still
1268 ** tries to unlock the database file if not in exclusive mode. If the
1269 ** unlock operation fails as well, then the first error code related
1270 ** to the first error encountered (the journal finalization one) is
1271 ** returned.
1272 */
1273 static int pager_end_transaction(Pager *pPager, int hasMaster){
1274   int rc = SQLITE_OK;      /* Error code from journal finalization operation */
1275   int rc2 = SQLITE_OK;     /* Error code from db file unlock operation */
1276 
1277   if( pPager->state<PAGER_RESERVED ){
1278     return SQLITE_OK;
1279   }
1280   releaseAllSavepoints(pPager);
1281 
1282   assert( isOpen(pPager->jfd) || pPager->pInJournal==0 );
1283   if( isOpen(pPager->jfd) ){
1284 
1285     /* TODO: There's a problem here if a journal-file was opened in MEMORY
1286     ** mode and then the journal-mode is changed to TRUNCATE or PERSIST
1287     ** during the transaction. This code should be changed to assume
1288     ** that the journal mode has not changed since the transaction was
1289     ** started. And the sqlite3PagerJournalMode() function should be
1290     ** changed to make sure that this is the case too.
1291     */
1292 
1293     /* Finalize the journal file. */
1294     if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY ){
1295       int isMemoryJournal = sqlite3IsMemJournal(pPager->jfd);
1296       sqlite3OsClose(pPager->jfd);
1297       if( !isMemoryJournal ){
1298         rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0);
1299       }
1300     }else if( pPager->journalMode==PAGER_JOURNALMODE_TRUNCATE ){
1301       if( pPager->journalOff==0 ){
1302         rc = SQLITE_OK;
1303       }else{
1304         rc = sqlite3OsTruncate(pPager->jfd, 0);
1305       }
1306       pPager->journalOff = 0;
1307       pPager->journalStarted = 0;
1308     }else if( pPager->exclusiveMode
1309      || pPager->journalMode==PAGER_JOURNALMODE_PERSIST
1310     ){
1311       rc = zeroJournalHdr(pPager, hasMaster);
1312       pager_error(pPager, rc);
1313       pPager->journalOff = 0;
1314       pPager->journalStarted = 0;
1315     }else{
1316       assert( pPager->journalMode==PAGER_JOURNALMODE_DELETE || rc );
1317       sqlite3OsClose(pPager->jfd);
1318       if( rc==SQLITE_OK && !pPager->tempFile ){
1319         rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0);
1320       }
1321     }
1322 
1323 #ifdef SQLITE_CHECK_PAGES
1324     sqlite3PcacheIterateDirty(pPager->pPCache, pager_set_pagehash);
1325 #endif
1326 
1327     sqlite3PcacheCleanAll(pPager->pPCache);
1328     sqlite3BitvecDestroy(pPager->pInJournal);
1329     pPager->pInJournal = 0;
1330     pPager->nRec = 0;
1331   }
1332 
1333   if( !pPager->exclusiveMode ){
1334     rc2 = osUnlock(pPager->fd, SHARED_LOCK);
1335     pPager->state = PAGER_SHARED;
1336     pPager->changeCountDone = 0;
1337   }else if( pPager->state==PAGER_SYNCED ){
1338     pPager->state = PAGER_EXCLUSIVE;
1339   }
1340   pPager->setMaster = 0;
1341   pPager->needSync = 0;
1342   pPager->dbModified = 0;
1343 
1344   /* TODO: Is this optimal? Why is the db size invalidated here
1345   ** when the database file is not unlocked? */
1346   pPager->dbOrigSize = 0;
1347   sqlite3PcacheTruncate(pPager->pPCache, pPager->dbSize);
1348   if( !MEMDB ){
1349     pPager->dbSizeValid = 0;
1350   }
1351 
1352   return (rc==SQLITE_OK?rc2:rc);
1353 }
1354 
1355 /*
1356 ** Parameter aData must point to a buffer of pPager->pageSize bytes
1357 ** of data. Compute and return a checksum based ont the contents of the
1358 ** page of data and the current value of pPager->cksumInit.
1359 **
1360 ** This is not a real checksum. It is really just the sum of the
1361 ** random initial value (pPager->cksumInit) and every 200th byte
1362 ** of the page data, starting with byte offset (pPager->pageSize%200).
1363 ** Each byte is interpreted as an 8-bit unsigned integer.
1364 **
1365 ** Changing the formula used to compute this checksum results in an
1366 ** incompatible journal file format.
1367 **
1368 ** If journal corruption occurs due to a power failure, the most likely
1369 ** scenario is that one end or the other of the record will be changed.
1370 ** It is much less likely that the two ends of the journal record will be
1371 ** correct and the middle be corrupt.  Thus, this "checksum" scheme,
1372 ** though fast and simple, catches the mostly likely kind of corruption.
1373 */
1374 static u32 pager_cksum(Pager *pPager, const u8 *aData){
1375   u32 cksum = pPager->cksumInit;         /* Checksum value to return */
1376   int i = pPager->pageSize-200;          /* Loop counter */
1377   while( i>0 ){
1378     cksum += aData[i];
1379     i -= 200;
1380   }
1381   return cksum;
1382 }
1383 
1384 /*
1385 ** Read a single page from either the journal file (if isMainJrnl==1) or
1386 ** from the sub-journal (if isMainJrnl==0) and playback that page.
1387 ** The page begins at offset *pOffset into the file. The *pOffset
1388 ** value is increased to the start of the next page in the journal.
1389 **
1390 ** The isMainJrnl flag is true if this is the main rollback journal and
1391 ** false for the statement journal.  The main rollback journal uses
1392 ** checksums - the statement journal does not.
1393 **
1394 ** If the page number of the page record read from the (sub-)journal file
1395 ** is greater than the current value of Pager.dbSize, then playback is
1396 ** skipped and SQLITE_OK is returned.
1397 **
1398 ** If pDone is not NULL, then it is a record of pages that have already
1399 ** been played back.  If the page at *pOffset has already been played back
1400 ** (if the corresponding pDone bit is set) then skip the playback.
1401 ** Make sure the pDone bit corresponding to the *pOffset page is set
1402 ** prior to returning.
1403 **
1404 ** If the page record is successfully read from the (sub-)journal file
1405 ** and played back, then SQLITE_OK is returned. If an IO error occurs
1406 ** while reading the record from the (sub-)journal file or while writing
1407 ** to the database file, then the IO error code is returned. If data
1408 ** is successfully read from the (sub-)journal file but appears to be
1409 ** corrupted, SQLITE_DONE is returned. Data is considered corrupted in
1410 ** two circumstances:
1411 **
1412 **   * If the record page-number is illegal (0 or PAGER_MJ_PGNO), or
1413 **   * If the record is being rolled back from the main journal file
1414 **     and the checksum field does not match the record content.
1415 **
1416 ** Neither of these two scenarios is possible during a savepoint rollback.
1417 **
1418 ** If this is a savepoint rollback, then memory may have to be dynamically
1419 ** allocated by this function. If this is the case and an allocation fails,
1420 ** SQLITE_NOMEM is returned.
1421 */
1422 static int pager_playback_one_page(
1423   Pager *pPager,                /* The pager being played back */
1424   int isMainJrnl,               /* 1 -> main journal. 0 -> sub-journal. */
1425   int isUnsync,                 /* True if reading from unsynced main journal */
1426   i64 *pOffset,                 /* Offset of record to playback */
1427   int isSavepnt,                /* True for a savepoint rollback */
1428   Bitvec *pDone                 /* Bitvec of pages already played back */
1429 ){
1430   int rc;
1431   PgHdr *pPg;                   /* An existing page in the cache */
1432   Pgno pgno;                    /* The page number of a page in journal */
1433   u32 cksum;                    /* Checksum used for sanity checking */
1434   u8 *aData;                    /* Temporary storage for the page */
1435   sqlite3_file *jfd;            /* The file descriptor for the journal file */
1436 
1437   assert( (isMainJrnl&~1)==0 );      /* isMainJrnl is 0 or 1 */
1438   assert( (isSavepnt&~1)==0 );       /* isSavepnt is 0 or 1 */
1439   assert( isMainJrnl || pDone );     /* pDone always used on sub-journals */
1440   assert( isSavepnt || pDone==0 );   /* pDone never used on non-savepoint */
1441 
1442   aData = (u8*)pPager->pTmpSpace;
1443   assert( aData );         /* Temp storage must have already been allocated */
1444 
1445   /* Read the page number and page data from the journal or sub-journal
1446   ** file. Return an error code to the caller if an IO error occurs.
1447   */
1448   jfd = isMainJrnl ? pPager->jfd : pPager->sjfd;
1449   rc = read32bits(jfd, *pOffset, &pgno);
1450   if( rc!=SQLITE_OK ) return rc;
1451   rc = sqlite3OsRead(jfd, aData, pPager->pageSize, (*pOffset)+4);
1452   if( rc!=SQLITE_OK ) return rc;
1453   *pOffset += pPager->pageSize + 4 + isMainJrnl*4;
1454 
1455   /* Sanity checking on the page.  This is more important that I originally
1456   ** thought.  If a power failure occurs while the journal is being written,
1457   ** it could cause invalid data to be written into the journal.  We need to
1458   ** detect this invalid data (with high probability) and ignore it.
1459   */
1460   if( pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){
1461     assert( !isSavepnt );
1462     return SQLITE_DONE;
1463   }
1464   if( pgno>(Pgno)pPager->dbSize || sqlite3BitvecTest(pDone, pgno) ){
1465     return SQLITE_OK;
1466   }
1467   if( isMainJrnl ){
1468     rc = read32bits(jfd, (*pOffset)-4, &cksum);
1469     if( rc ) return rc;
1470     if( !isSavepnt && pager_cksum(pPager, aData)!=cksum ){
1471       return SQLITE_DONE;
1472     }
1473   }
1474 
1475   if( pDone && (rc = sqlite3BitvecSet(pDone, pgno))!=SQLITE_OK ){
1476     return rc;
1477   }
1478 
1479   assert( pPager->state==PAGER_RESERVED || pPager->state>=PAGER_EXCLUSIVE );
1480 
1481   /* If the pager is in RESERVED state, then there must be a copy of this
1482   ** page in the pager cache. In this case just update the pager cache,
1483   ** not the database file. The page is left marked dirty in this case.
1484   **
1485   ** An exception to the above rule: If the database is in no-sync mode
1486   ** and a page is moved during an incremental vacuum then the page may
1487   ** not be in the pager cache. Later: if a malloc() or IO error occurs
1488   ** during a Movepage() call, then the page may not be in the cache
1489   ** either. So the condition described in the above paragraph is not
1490   ** assert()able.
1491   **
1492   ** If in EXCLUSIVE state, then we update the pager cache if it exists
1493   ** and the main file. The page is then marked not dirty.
1494   **
1495   ** Ticket #1171:  The statement journal might contain page content that is
1496   ** different from the page content at the start of the transaction.
1497   ** This occurs when a page is changed prior to the start of a statement
1498   ** then changed again within the statement.  When rolling back such a
1499   ** statement we must not write to the original database unless we know
1500   ** for certain that original page contents are synced into the main rollback
1501   ** journal.  Otherwise, a power loss might leave modified data in the
1502   ** database file without an entry in the rollback journal that can
1503   ** restore the database to its original form.  Two conditions must be
1504   ** met before writing to the database files. (1) the database must be
1505   ** locked.  (2) we know that the original page content is fully synced
1506   ** in the main journal either because the page is not in cache or else
1507   ** the page is marked as needSync==0.
1508   **
1509   ** 2008-04-14:  When attempting to vacuum a corrupt database file, it
1510   ** is possible to fail a statement on a database that does not yet exist.
1511   ** Do not attempt to write if database file has never been opened.
1512   */
1513   pPg = pager_lookup(pPager, pgno);
1514   assert( pPg || !MEMDB );
1515   PAGERTRACE(("PLAYBACK %d page %d hash(%08x) %s\n",
1516                PAGERID(pPager), pgno, pager_datahash(pPager->pageSize, aData),
1517                (isMainJrnl?"main-journal":"sub-journal")
1518   ));
1519   if( (pPager->state>=PAGER_EXCLUSIVE)
1520    && (pPg==0 || 0==(pPg->flags&PGHDR_NEED_SYNC))
1521    && isOpen(pPager->fd)
1522    && !isUnsync
1523   ){
1524     i64 ofst = (pgno-1)*(i64)pPager->pageSize;
1525     rc = sqlite3OsWrite(pPager->fd, aData, pPager->pageSize, ofst);
1526     if( pgno>pPager->dbFileSize ){
1527       pPager->dbFileSize = pgno;
1528     }
1529     sqlite3BackupUpdate(pPager->pBackup, pgno, aData);
1530   }else if( !isMainJrnl && pPg==0 ){
1531     /* If this is a rollback of a savepoint and data was not written to
1532     ** the database and the page is not in-memory, there is a potential
1533     ** problem. When the page is next fetched by the b-tree layer, it
1534     ** will be read from the database file, which may or may not be
1535     ** current.
1536     **
1537     ** There are a couple of different ways this can happen. All are quite
1538     ** obscure. When running in synchronous mode, this can only happen
1539     ** if the page is on the free-list at the start of the transaction, then
1540     ** populated, then moved using sqlite3PagerMovepage().
1541     **
1542     ** The solution is to add an in-memory page to the cache containing
1543     ** the data just read from the sub-journal. Mark the page as dirty
1544     ** and if the pager requires a journal-sync, then mark the page as
1545     ** requiring a journal-sync before it is written.
1546     */
1547     assert( isSavepnt );
1548     if( (rc = sqlite3PagerAcquire(pPager, pgno, &pPg, 1))!=SQLITE_OK ){
1549       return rc;
1550     }
1551     pPg->flags &= ~PGHDR_NEED_READ;
1552     sqlite3PcacheMakeDirty(pPg);
1553   }
1554   if( pPg ){
1555     /* No page should ever be explicitly rolled back that is in use, except
1556     ** for page 1 which is held in use in order to keep the lock on the
1557     ** database active. However such a page may be rolled back as a result
1558     ** of an internal error resulting in an automatic call to
1559     ** sqlite3PagerRollback().
1560     */
1561     void *pData;
1562     pData = pPg->pData;
1563     memcpy(pData, aData, pPager->pageSize);
1564     if( pPager->xReiniter ){
1565       pPager->xReiniter(pPg);
1566     }
1567     if( isMainJrnl && (!isSavepnt || *pOffset<=pPager->journalHdr) ){
1568       /* If the contents of this page were just restored from the main
1569       ** journal file, then its content must be as they were when the
1570       ** transaction was first opened. In this case we can mark the page
1571       ** as clean, since there will be no need to write it out to the.
1572       **
1573       ** There is one exception to this rule. If the page is being rolled
1574       ** back as part of a savepoint (or statement) rollback from an
1575       ** unsynced portion of the main journal file, then it is not safe
1576       ** to mark the page as clean. This is because marking the page as
1577       ** clean will clear the PGHDR_NEED_SYNC flag. Since the page is
1578       ** already in the journal file (recorded in Pager.pInJournal) and
1579       ** the PGHDR_NEED_SYNC flag is cleared, if the page is written to
1580       ** again within this transaction, it will be marked as dirty but
1581       ** the PGHDR_NEED_SYNC flag will not be set. It could then potentially
1582       ** be written out into the database file before its journal file
1583       ** segment is synced. If a crash occurs during or following this,
1584       ** database corruption may ensue.
1585       */
1586       sqlite3PcacheMakeClean(pPg);
1587     }
1588 #ifdef SQLITE_CHECK_PAGES
1589     pPg->pageHash = pager_pagehash(pPg);
1590 #endif
1591     /* If this was page 1, then restore the value of Pager.dbFileVers.
1592     ** Do this before any decoding. */
1593     if( pgno==1 ){
1594       memcpy(&pPager->dbFileVers, &((u8*)pData)[24],sizeof(pPager->dbFileVers));
1595     }
1596 
1597     /* Decode the page just read from disk */
1598     CODEC1(pPager, pData, pPg->pgno, 3);
1599     sqlite3PcacheRelease(pPg);
1600   }
1601   return rc;
1602 }
1603 
1604 #if !defined(NDEBUG) || defined(SQLITE_COVERAGE_TEST)
1605 /*
1606 ** This routine looks ahead into the main journal file and determines
1607 ** whether or not the next record (the record that begins at file
1608 ** offset pPager->journalOff) is a well-formed page record consisting
1609 ** of a valid page number, pPage->pageSize bytes of content, followed
1610 ** by a valid checksum.
1611 **
1612 ** The pager never needs to know this in order to do its job.   This
1613 ** routine is only used from within assert() and testcase() macros.
1614 */
1615 static int pagerNextJournalPageIsValid(Pager *pPager){
1616   Pgno pgno;           /* The page number of the page */
1617   u32 cksum;           /* The page checksum */
1618   int rc;              /* Return code from read operations */
1619   sqlite3_file *fd;    /* The file descriptor from which we are reading */
1620   u8 *aData;           /* Content of the page */
1621 
1622   /* Read the page number header */
1623   fd = pPager->jfd;
1624   rc = read32bits(fd, pPager->journalOff, &pgno);
1625   if( rc!=SQLITE_OK ){ return 0; }                                  /*NO_TEST*/
1626   if( pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){ return 0; }         /*NO_TEST*/
1627   if( pgno>(Pgno)pPager->dbSize ){ return 0; }                      /*NO_TEST*/
1628 
1629   /* Read the checksum */
1630   rc = read32bits(fd, pPager->journalOff+pPager->pageSize+4, &cksum);
1631   if( rc!=SQLITE_OK ){ return 0; }                                  /*NO_TEST*/
1632 
1633   /* Read the data and verify the checksum */
1634   aData = (u8*)pPager->pTmpSpace;
1635   rc = sqlite3OsRead(fd, aData, pPager->pageSize, pPager->journalOff+4);
1636   if( rc!=SQLITE_OK ){ return 0; }                                  /*NO_TEST*/
1637   if( pager_cksum(pPager, aData)!=cksum ){ return 0; }              /*NO_TEST*/
1638 
1639   /* Reach this point only if the page is valid */
1640   return 1;
1641 }
1642 #endif /* !defined(NDEBUG) || defined(SQLITE_COVERAGE_TEST) */
1643 
1644 /*
1645 ** Parameter zMaster is the name of a master journal file. A single journal
1646 ** file that referred to the master journal file has just been rolled back.
1647 ** This routine checks if it is possible to delete the master journal file,
1648 ** and does so if it is.
1649 **
1650 ** Argument zMaster may point to Pager.pTmpSpace. So that buffer is not
1651 ** available for use within this function.
1652 **
1653 ** When a master journal file is created, it is populated with the names
1654 ** of all of its child journals, one after another, formatted as utf-8
1655 ** encoded text. The end of each child journal file is marked with a
1656 ** nul-terminator byte (0x00). i.e. the entire contents of a master journal
1657 ** file for a transaction involving two databases might be:
1658 **
1659 **   "/home/bill/a.db-journal\x00/home/bill/b.db-journal\x00"
1660 **
1661 ** A master journal file may only be deleted once all of its child
1662 ** journals have been rolled back.
1663 **
1664 ** This function reads the contents of the master-journal file into
1665 ** memory and loops through each of the child journal names. For
1666 ** each child journal, it checks if:
1667 **
1668 **   * if the child journal exists, and if so
1669 **   * if the child journal contains a reference to master journal
1670 **     file zMaster
1671 **
1672 ** If a child journal can be found that matches both of the criteria
1673 ** above, this function returns without doing anything. Otherwise, if
1674 ** no such child journal can be found, file zMaster is deleted from
1675 ** the file-system using sqlite3OsDelete().
1676 **
1677 ** If an IO error occurs within this function, an error code is returned. This
1678 ** function allocates memory by calling sqlite3Malloc(). If an allocation
1679 ** fails, SQLITE_NOMEM is returned. Otherwise, if no IO or malloc errors
1680 ** occur, SQLITE_OK is returned.
1681 **
1682 ** TODO: This function allocates a single block of memory to load
1683 ** the entire contents of the master journal file. This could be
1684 ** a couple of kilobytes or so - potentially larger than the page
1685 ** size.
1686 */
1687 static int pager_delmaster(Pager *pPager, const char *zMaster){
1688   sqlite3_vfs *pVfs = pPager->pVfs;
1689   int rc;                   /* Return code */
1690   sqlite3_file *pMaster;    /* Malloc'd master-journal file descriptor */
1691   sqlite3_file *pJournal;   /* Malloc'd child-journal file descriptor */
1692   char *zMasterJournal = 0; /* Contents of master journal file */
1693   i64 nMasterJournal;       /* Size of master journal file */
1694 
1695   /* Allocate space for both the pJournal and pMaster file descriptors.
1696   ** If successful, open the master journal file for reading.
1697   */
1698   pMaster = (sqlite3_file *)sqlite3MallocZero(pVfs->szOsFile * 2);
1699   pJournal = (sqlite3_file *)(((u8 *)pMaster) + pVfs->szOsFile);
1700   if( !pMaster ){
1701     rc = SQLITE_NOMEM;
1702   }else{
1703     const int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MASTER_JOURNAL);
1704     rc = sqlite3OsOpen(pVfs, zMaster, pMaster, flags, 0);
1705   }
1706   if( rc!=SQLITE_OK ) goto delmaster_out;
1707 
1708   rc = sqlite3OsFileSize(pMaster, &nMasterJournal);
1709   if( rc!=SQLITE_OK ) goto delmaster_out;
1710 
1711   if( nMasterJournal>0 ){
1712     char *zJournal;
1713     char *zMasterPtr = 0;
1714     int nMasterPtr = pVfs->mxPathname+1;
1715 
1716     /* Load the entire master journal file into space obtained from
1717     ** sqlite3_malloc() and pointed to by zMasterJournal.
1718     */
1719     zMasterJournal = (char *)sqlite3Malloc((int)nMasterJournal + nMasterPtr);
1720     if( !zMasterJournal ){
1721       rc = SQLITE_NOMEM;
1722       goto delmaster_out;
1723     }
1724     zMasterPtr = &zMasterJournal[nMasterJournal];
1725     rc = sqlite3OsRead(pMaster, zMasterJournal, (int)nMasterJournal, 0);
1726     if( rc!=SQLITE_OK ) goto delmaster_out;
1727 
1728     zJournal = zMasterJournal;
1729     while( (zJournal-zMasterJournal)<nMasterJournal ){
1730       int exists;
1731       rc = sqlite3OsAccess(pVfs, zJournal, SQLITE_ACCESS_EXISTS, &exists);
1732       if( rc!=SQLITE_OK ){
1733         goto delmaster_out;
1734       }
1735       if( exists ){
1736         /* One of the journals pointed to by the master journal exists.
1737         ** Open it and check if it points at the master journal. If
1738         ** so, return without deleting the master journal file.
1739         */
1740         int c;
1741         int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MAIN_JOURNAL);
1742         rc = sqlite3OsOpen(pVfs, zJournal, pJournal, flags, 0);
1743         if( rc!=SQLITE_OK ){
1744           goto delmaster_out;
1745         }
1746 
1747         rc = readMasterJournal(pJournal, zMasterPtr, nMasterPtr);
1748         sqlite3OsClose(pJournal);
1749         if( rc!=SQLITE_OK ){
1750           goto delmaster_out;
1751         }
1752 
1753         c = zMasterPtr[0]!=0 && strcmp(zMasterPtr, zMaster)==0;
1754         if( c ){
1755           /* We have a match. Do not delete the master journal file. */
1756           goto delmaster_out;
1757         }
1758       }
1759       zJournal += (sqlite3Strlen30(zJournal)+1);
1760     }
1761   }
1762 
1763   rc = sqlite3OsDelete(pVfs, zMaster, 0);
1764 
1765 delmaster_out:
1766   if( zMasterJournal ){
1767     sqlite3_free(zMasterJournal);
1768   }
1769   if( pMaster ){
1770     sqlite3OsClose(pMaster);
1771     assert( !isOpen(pJournal) );
1772   }
1773   sqlite3_free(pMaster);
1774   return rc;
1775 }
1776 
1777 
1778 /*
1779 ** This function is used to change the actual size of the database
1780 ** file in the file-system. This only happens when committing a transaction,
1781 ** or rolling back a transaction (including rolling back a hot-journal).
1782 **
1783 ** If the main database file is not open, or an exclusive lock is not
1784 ** held, this function is a no-op. Otherwise, the size of the file is
1785 ** changed to nPage pages (nPage*pPager->pageSize bytes). If the file
1786 ** on disk is currently larger than nPage pages, then use the VFS
1787 ** xTruncate() method to truncate it.
1788 **
1789 ** Or, it might be the case that the file on disk is smaller than
1790 ** nPage pages. Some operating system implementations can get confused if
1791 ** you try to truncate a file to some size that is larger than it
1792 ** currently is, so detect this case and write a single zero byte to
1793 ** the end of the new file instead.
1794 **
1795 ** If successful, return SQLITE_OK. If an IO error occurs while modifying
1796 ** the database file, return the error code to the caller.
1797 */
1798 static int pager_truncate(Pager *pPager, Pgno nPage){
1799   int rc = SQLITE_OK;
1800   if( pPager->state>=PAGER_EXCLUSIVE && isOpen(pPager->fd) ){
1801     i64 currentSize, newSize;
1802     /* TODO: Is it safe to use Pager.dbFileSize here? */
1803     rc = sqlite3OsFileSize(pPager->fd, &currentSize);
1804     newSize = pPager->pageSize*(i64)nPage;
1805     if( rc==SQLITE_OK && currentSize!=newSize ){
1806       if( currentSize>newSize ){
1807         rc = sqlite3OsTruncate(pPager->fd, newSize);
1808       }else{
1809         rc = sqlite3OsWrite(pPager->fd, "", 1, newSize-1);
1810       }
1811       if( rc==SQLITE_OK ){
1812         pPager->dbFileSize = nPage;
1813       }
1814     }
1815   }
1816   return rc;
1817 }
1818 
1819 /*
1820 ** Set the value of the Pager.sectorSize variable for the given
1821 ** pager based on the value returned by the xSectorSize method
1822 ** of the open database file. The sector size will be used
1823 ** to determine the size and alignment of journal header and
1824 ** master journal pointers within created journal files.
1825 **
1826 ** For temporary files the effective sector size is always 512 bytes.
1827 **
1828 ** Otherwise, for non-temporary files, the effective sector size is
1829 ** the value returned by the xSectorSize() method rounded up to 512 if
1830 ** it is less than 512, or rounded down to MAX_SECTOR_SIZE if it
1831 ** is greater than MAX_SECTOR_SIZE.
1832 */
1833 static void setSectorSize(Pager *pPager){
1834   assert( isOpen(pPager->fd) || pPager->tempFile );
1835 
1836   if( !pPager->tempFile ){
1837     /* Sector size doesn't matter for temporary files. Also, the file
1838     ** may not have been opened yet, in which case the OsSectorSize()
1839     ** call will segfault.
1840     */
1841     pPager->sectorSize = sqlite3OsSectorSize(pPager->fd);
1842   }
1843   if( pPager->sectorSize<512 ){
1844     pPager->sectorSize = 512;
1845   }
1846   if( pPager->sectorSize>MAX_SECTOR_SIZE ){
1847     assert( MAX_SECTOR_SIZE>=512 );
1848     pPager->sectorSize = MAX_SECTOR_SIZE;
1849   }
1850 }
1851 
1852 /*
1853 ** Playback the journal and thus restore the database file to
1854 ** the state it was in before we started making changes.
1855 **
1856 ** The journal file format is as follows:
1857 **
1858 **  (1)  8 byte prefix.  A copy of aJournalMagic[].
1859 **  (2)  4 byte big-endian integer which is the number of valid page records
1860 **       in the journal.  If this value is 0xffffffff, then compute the
1861 **       number of page records from the journal size.
1862 **  (3)  4 byte big-endian integer which is the initial value for the
1863 **       sanity checksum.
1864 **  (4)  4 byte integer which is the number of pages to truncate the
1865 **       database to during a rollback.
1866 **  (5)  4 byte big-endian integer which is the sector size.  The header
1867 **       is this many bytes in size.
1868 **  (6)  4 byte big-endian integer which is the page size.
1869 **  (7)  4 byte integer which is the number of bytes in the master journal
1870 **       name.  The value may be zero (indicate that there is no master
1871 **       journal.)
1872 **  (8)  N bytes of the master journal name.  The name will be nul-terminated
1873 **       and might be shorter than the value read from (7).  If the first byte
1874 **       of the name is \000 then there is no master journal.  The master
1875 **       journal name is stored in UTF-8.
1876 **  (9)  Zero or more pages instances, each as follows:
1877 **        +  4 byte page number.
1878 **        +  pPager->pageSize bytes of data.
1879 **        +  4 byte checksum
1880 **
1881 ** When we speak of the journal header, we mean the first 8 items above.
1882 ** Each entry in the journal is an instance of the 9th item.
1883 **
1884 ** Call the value from the second bullet "nRec".  nRec is the number of
1885 ** valid page entries in the journal.  In most cases, you can compute the
1886 ** value of nRec from the size of the journal file.  But if a power
1887 ** failure occurred while the journal was being written, it could be the
1888 ** case that the size of the journal file had already been increased but
1889 ** the extra entries had not yet made it safely to disk.  In such a case,
1890 ** the value of nRec computed from the file size would be too large.  For
1891 ** that reason, we always use the nRec value in the header.
1892 **
1893 ** If the nRec value is 0xffffffff it means that nRec should be computed
1894 ** from the file size.  This value is used when the user selects the
1895 ** no-sync option for the journal.  A power failure could lead to corruption
1896 ** in this case.  But for things like temporary table (which will be
1897 ** deleted when the power is restored) we don't care.
1898 **
1899 ** If the file opened as the journal file is not a well-formed
1900 ** journal file then all pages up to the first corrupted page are rolled
1901 ** back (or no pages if the journal header is corrupted). The journal file
1902 ** is then deleted and SQLITE_OK returned, just as if no corruption had
1903 ** been encountered.
1904 **
1905 ** If an I/O or malloc() error occurs, the journal-file is not deleted
1906 ** and an error code is returned.
1907 **
1908 ** The isHot parameter indicates that we are trying to rollback a journal
1909 ** that might be a hot journal.  Or, it could be that the journal is
1910 ** preserved because of JOURNALMODE_PERSIST or JOURNALMODE_TRUNCATE.
1911 ** If the journal really is hot, reset the pager cache prior to rolling
1912 ** back any content.  If the journal is merely persistent, no reset is
1913 ** needed.
1914 */
1915 static int pager_playback(Pager *pPager, int isHot){
1916   sqlite3_vfs *pVfs = pPager->pVfs;
1917   i64 szJ;                 /* Size of the journal file in bytes */
1918   u32 nRec;                /* Number of Records in the journal */
1919   u32 u;                   /* Unsigned loop counter */
1920   Pgno mxPg = 0;           /* Size of the original file in pages */
1921   int rc;                  /* Result code of a subroutine */
1922   int res = 1;             /* Value returned by sqlite3OsAccess() */
1923   char *zMaster = 0;       /* Name of master journal file if any */
1924   int needPagerReset;      /* True to reset page prior to first page rollback */
1925 
1926   /* Figure out how many records are in the journal.  Abort early if
1927   ** the journal is empty.
1928   */
1929   assert( isOpen(pPager->jfd) );
1930   rc = sqlite3OsFileSize(pPager->jfd, &szJ);
1931   if( rc!=SQLITE_OK || szJ==0 ){
1932     goto end_playback;
1933   }
1934 
1935   /* Read the master journal name from the journal, if it is present.
1936   ** If a master journal file name is specified, but the file is not
1937   ** present on disk, then the journal is not hot and does not need to be
1938   ** played back.
1939   **
1940   ** TODO: Technically the following is an error because it assumes that
1941   ** buffer Pager.pTmpSpace is (mxPathname+1) bytes or larger. i.e. that
1942   ** (pPager->pageSize >= pPager->pVfs->mxPathname+1). Using os_unix.c,
1943   **  mxPathname is 512, which is the same as the minimum allowable value
1944   ** for pageSize.
1945   */
1946   zMaster = pPager->pTmpSpace;
1947   rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);
1948   if( rc==SQLITE_OK && zMaster[0] ){
1949     rc = sqlite3OsAccess(pVfs, zMaster, SQLITE_ACCESS_EXISTS, &res);
1950   }
1951   zMaster = 0;
1952   if( rc!=SQLITE_OK || !res ){
1953     goto end_playback;
1954   }
1955   pPager->journalOff = 0;
1956   needPagerReset = isHot;
1957 
1958   /* This loop terminates either when a readJournalHdr() or
1959   ** pager_playback_one_page() call returns SQLITE_DONE or an IO error
1960   ** occurs.
1961   */
1962   while( 1 ){
1963     int isUnsync = 0;
1964 
1965     /* Read the next journal header from the journal file.  If there are
1966     ** not enough bytes left in the journal file for a complete header, or
1967     ** it is corrupted, then a process must of failed while writing it.
1968     ** This indicates nothing more needs to be rolled back.
1969     */
1970     rc = readJournalHdr(pPager, szJ, &nRec, &mxPg);
1971     if( rc!=SQLITE_OK ){
1972       if( rc==SQLITE_DONE ){
1973         rc = SQLITE_OK;
1974       }
1975       goto end_playback;
1976     }
1977 
1978     /* If nRec is 0xffffffff, then this journal was created by a process
1979     ** working in no-sync mode. This means that the rest of the journal
1980     ** file consists of pages, there are no more journal headers. Compute
1981     ** the value of nRec based on this assumption.
1982     */
1983     if( nRec==0xffffffff ){
1984       assert( pPager->journalOff==JOURNAL_HDR_SZ(pPager) );
1985       nRec = (int)((szJ - JOURNAL_HDR_SZ(pPager))/JOURNAL_PG_SZ(pPager));
1986     }
1987 
1988     /* If nRec is 0 and this rollback is of a transaction created by this
1989     ** process and if this is the final header in the journal, then it means
1990     ** that this part of the journal was being filled but has not yet been
1991     ** synced to disk.  Compute the number of pages based on the remaining
1992     ** size of the file.
1993     **
1994     ** The third term of the test was added to fix ticket #2565.
1995     ** When rolling back a hot journal, nRec==0 always means that the next
1996     ** chunk of the journal contains zero pages to be rolled back.  But
1997     ** when doing a ROLLBACK and the nRec==0 chunk is the last chunk in
1998     ** the journal, it means that the journal might contain additional
1999     ** pages that need to be rolled back and that the number of pages
2000     ** should be computed based on the journal file size.
2001     */
2002     testcase( nRec==0 && !isHot
2003          && pPager->journalHdr+JOURNAL_HDR_SZ(pPager)!=pPager->journalOff
2004          && ((szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager))>0
2005          && pagerNextJournalPageIsValid(pPager)
2006     );
2007     if( nRec==0 && !isHot &&
2008         pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff ){
2009       nRec = (int)((szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager));
2010       isUnsync = 1;
2011     }
2012 
2013     /* If this is the first header read from the journal, truncate the
2014     ** database file back to its original size.
2015     */
2016     if( pPager->journalOff==JOURNAL_HDR_SZ(pPager) ){
2017       rc = pager_truncate(pPager, mxPg);
2018       if( rc!=SQLITE_OK ){
2019         goto end_playback;
2020       }
2021       pPager->dbSize = mxPg;
2022     }
2023 
2024     /* Copy original pages out of the journal and back into the
2025     ** database file and/or page cache.
2026     */
2027     for(u=0; u<nRec; u++){
2028       if( needPagerReset ){
2029         pager_reset(pPager);
2030         needPagerReset = 0;
2031       }
2032       rc = pager_playback_one_page(pPager,1,isUnsync,&pPager->journalOff,0,0);
2033       if( rc!=SQLITE_OK ){
2034         if( rc==SQLITE_DONE ){
2035           rc = SQLITE_OK;
2036           pPager->journalOff = szJ;
2037           break;
2038         }else{
2039           /* If we are unable to rollback, quit and return the error
2040           ** code.  This will cause the pager to enter the error state
2041           ** so that no further harm will be done.  Perhaps the next
2042           ** process to come along will be able to rollback the database.
2043           */
2044           goto end_playback;
2045         }
2046       }
2047     }
2048   }
2049   /*NOTREACHED*/
2050   assert( 0 );
2051 
2052 end_playback:
2053   /* Following a rollback, the database file should be back in its original
2054   ** state prior to the start of the transaction, so invoke the
2055   ** SQLITE_FCNTL_DB_UNCHANGED file-control method to disable the
2056   ** assertion that the transaction counter was modified.
2057   */
2058   assert(
2059     pPager->fd->pMethods==0 ||
2060     sqlite3OsFileControl(pPager->fd,SQLITE_FCNTL_DB_UNCHANGED,0)>=SQLITE_OK
2061   );
2062 
2063   /* If this playback is happening automatically as a result of an IO or
2064   ** malloc error that occurred after the change-counter was updated but
2065   ** before the transaction was committed, then the change-counter
2066   ** modification may just have been reverted. If this happens in exclusive
2067   ** mode, then subsequent transactions performed by the connection will not
2068   ** update the change-counter at all. This may lead to cache inconsistency
2069   ** problems for other processes at some point in the future. So, just
2070   ** in case this has happened, clear the changeCountDone flag now.
2071   */
2072   pPager->changeCountDone = pPager->tempFile;
2073 
2074   if( rc==SQLITE_OK ){
2075     zMaster = pPager->pTmpSpace;
2076     rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);
2077     testcase( rc!=SQLITE_OK );
2078   }
2079   if( rc==SQLITE_OK ){
2080     rc = pager_end_transaction(pPager, zMaster[0]!='\0');
2081     testcase( rc!=SQLITE_OK );
2082   }
2083   if( rc==SQLITE_OK && zMaster[0] && res ){
2084     /* If there was a master journal and this routine will return success,
2085     ** see if it is possible to delete the master journal.
2086     */
2087     rc = pager_delmaster(pPager, zMaster);
2088     testcase( rc!=SQLITE_OK );
2089   }
2090 
2091   /* The Pager.sectorSize variable may have been updated while rolling
2092   ** back a journal created by a process with a different sector size
2093   ** value. Reset it to the correct value for this process.
2094   */
2095   setSectorSize(pPager);
2096   return rc;
2097 }
2098 
2099 /*
2100 ** Playback savepoint pSavepoint. Or, if pSavepoint==NULL, then playback
2101 ** the entire main journal file. The case pSavepoint==NULL occurs when
2102 ** a ROLLBACK TO command is invoked on a SAVEPOINT that is a transaction
2103 ** savepoint.
2104 **
2105 ** When pSavepoint is not NULL (meaning a non-transaction savepoint is
2106 ** being rolled back), then the rollback consists of up to three stages,
2107 ** performed in the order specified:
2108 **
2109 **   * Pages are played back from the main journal starting at byte
2110 **     offset PagerSavepoint.iOffset and continuing to
2111 **     PagerSavepoint.iHdrOffset, or to the end of the main journal
2112 **     file if PagerSavepoint.iHdrOffset is zero.
2113 **
2114 **   * If PagerSavepoint.iHdrOffset is not zero, then pages are played
2115 **     back starting from the journal header immediately following
2116 **     PagerSavepoint.iHdrOffset to the end of the main journal file.
2117 **
2118 **   * Pages are then played back from the sub-journal file, starting
2119 **     with the PagerSavepoint.iSubRec and continuing to the end of
2120 **     the journal file.
2121 **
2122 ** Throughout the rollback process, each time a page is rolled back, the
2123 ** corresponding bit is set in a bitvec structure (variable pDone in the
2124 ** implementation below). This is used to ensure that a page is only
2125 ** rolled back the first time it is encountered in either journal.
2126 **
2127 ** If pSavepoint is NULL, then pages are only played back from the main
2128 ** journal file. There is no need for a bitvec in this case.
2129 **
2130 ** In either case, before playback commences the Pager.dbSize variable
2131 ** is reset to the value that it held at the start of the savepoint
2132 ** (or transaction). No page with a page-number greater than this value
2133 ** is played back. If one is encountered it is simply skipped.
2134 */
2135 static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){
2136   i64 szJ;                 /* Effective size of the main journal */
2137   i64 iHdrOff;             /* End of first segment of main-journal records */
2138   int rc = SQLITE_OK;      /* Return code */
2139   Bitvec *pDone = 0;       /* Bitvec to ensure pages played back only once */
2140
2141   assert( pPager->state>=PAGER_SHARED );
2142
2143   /* Allocate a bitvec to use to store the set of pages rolled back */
2144   if( pSavepoint ){
2145     pDone = sqlite3BitvecCreate(pSavepoint->nOrig);
2146     if( !pDone ){
2147       return SQLITE_NOMEM;
2148     }
2149   }
2150
2151   /* Set the database size back to the value it was before the savepoint
2152   ** being reverted was opened.
2153   */
2154   pPager->dbSize = pSavepoint ? pSavepoint->nOrig : pPager->dbOrigSize;
2155
2156   /* Use pPager->journalOff as the effective size of the main rollback
2157   ** journal.  The actual file might be larger than this in
2158   ** PAGER_JOURNALMODE_TRUNCATE or PAGER_JOURNALMODE_PERSIST.  But anything
2159   ** past pPager->journalOff is off-limits to us.
2160   */
2161   szJ = pPager->journalOff;
2162
2163   /* Begin by rolling back records from the main journal starting at
2164   ** PagerSavepoint.iOffset and continuing to the next journal header.
2165   ** There might be records in the main journal that have a page number
2166   ** greater than the current database size (pPager->dbSize) but those
2167   ** will be skipped automatically.  Pages are added to pDone as they
2168   ** are played back.
2169   */
2170   if( pSavepoint ){
2171     iHdrOff = pSavepoint->iHdrOffset ? pSavepoint->iHdrOffset : szJ;
2172     pPager->journalOff = pSavepoint->iOffset;
2173     while( rc==SQLITE_OK && pPager->journalOff<iHdrOff ){
2174       rc = pager_playback_one_page(pPager, 1, 0, &pPager->journalOff, 1, pDone);
2175     }
2176     assert( rc!=SQLITE_DONE );
2177   }else{
2178     pPager->journalOff = 0;
2179   }
2180
2181   /* Continue rolling back records out of the main journal starting at
2182   ** the first journal header seen and continuing until the effective end
2183   ** of the main journal file.  Continue to skip out-of-range pages and
2184   ** continue adding pages rolled back to pDone.
2185   */
2186   while( rc==SQLITE_OK && pPager->journalOff<szJ ){
2187     u32 ii;            /* Loop counter */
2188     u32 nJRec = 0;     /* Number of Journal Records */
2189     u32 dummy;         /* Output of readJournalHdr() that is not used here */
2190     rc = readJournalHdr(pPager, szJ, &nJRec, &dummy);
2191     assert( rc!=SQLITE_DONE );
2192
2193     /*
2194     ** The "pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff"
2195     ** test is related to ticket #2565.  See the discussion in the
2196     ** pager_playback() function for additional information.
2197     */
2198     assert( !(nJRec==0
2199          && pPager->journalHdr+JOURNAL_HDR_SZ(pPager)!=pPager->journalOff
2200          && ((szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager))>0
2201          && pagerNextJournalPageIsValid(pPager))
2202     );
2203     if( nJRec==0
2204      && pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff
2205     ){
2206       nJRec = (u32)((szJ - pPager->journalOff)/JOURNAL_PG_SZ(pPager));
2207     }
2208     for(ii=0; rc==SQLITE_OK && ii<nJRec && pPager->journalOff<szJ; ii++){
2209       rc = pager_playback_one_page(pPager, 1, 0, &pPager->journalOff, 1, pDone);
2210     }
2211     assert( rc!=SQLITE_DONE );
2212   }
2213   assert( rc!=SQLITE_OK || pPager->journalOff==szJ );
2214
2215   /* Finally, roll back pages from the sub-journal.  Pages that were
2216   ** previously rolled back out of the main journal (and are hence in pDone)
2217   ** will be skipped.  Out-of-range pages are also skipped.  Byte offsets are
2218   ** widened to i64 before multiplying so large sub-journals cannot wrap. */
2219   if( pSavepoint ){
2220     u32 ii;            /* Loop counter */
2221     i64 offset = (i64)pSavepoint->iSubRec*(4+pPager->pageSize);
2222     for(ii=pSavepoint->iSubRec; rc==SQLITE_OK && ii<pPager->nSubRec; ii++){
2223       assert( offset==(i64)ii*(4+pPager->pageSize) );
2224       rc = pager_playback_one_page(pPager, 0, 0, &offset, 1, pDone);
2225     }
2226     assert( rc!=SQLITE_DONE );
2227   }
2228
2229   sqlite3BitvecDestroy(pDone);
2230   if( rc==SQLITE_OK ){
2231     pPager->journalOff = szJ;   /* Restore the effective end of the journal */
2232   }
2233   return rc;
2234 }
2235 
2236 /*
2237 ** Change the maximum number of in-memory pages that are allowed.  The
2238 ** mxPage value is passed unchanged to sqlite3PcacheSetCachesize(). */
2239 void sqlite3PagerSetCachesize(Pager *pPager, int mxPage){
2240   sqlite3PcacheSetCachesize(pPager->pPCache, mxPage);
2241 }
2242 
2243 /*
2244 ** Adjust the robustness of the database to damage due to OS crashes
2245 ** or power failures by changing the number of sync operations done when
2246 ** writing the rollback journal.  The "level" argument selects one of:
2247 **
2248 **    OFF (1)     sqlite3OsSync() is never called.  This is the default
2249 **                for temporary and transient files.
2250 **
2251 **    NORMAL (2)  The journal is synced once before writes begin on the
2252 **                database.  It is theoretically possible, though very
2253 **                unlikely, that an inopportune power failure leaves the
2254 **                journal in a state that damages the database on rollback.
2255 **
2256 **    FULL (3)    The journal is synced twice before writes begin on the
2257 **                database, with the nRec field of the journal header
2258 **                written in between.  Assuming a single disk-sector write
2259 **                is atomic, the journal cannot be corrupted to the point
2260 **                of damaging the database during rollback.
2261 **
2262 ** Temporary files always run with syncing off.  A non-zero bFullFsync
2263 ** selects SQLITE_SYNC_FULL instead of SQLITE_SYNC_NORMAL for the syncs. */
2264 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2265 void sqlite3PagerSetSafetyLevel(Pager *pPager, int level, int bFullFsync){
2266   int isTemp = pPager->tempFile!=0;   /* True for temporary files */
2267   pPager->noSync = (level==1 || isTemp);
2268   pPager->fullSync = (level==3 && !isTemp);
2269   if( bFullFsync ){
2270     pPager->sync_flags = SQLITE_SYNC_FULL;
2271   }else{
2272     pPager->sync_flags = SQLITE_SYNC_NORMAL;
2273   }
2274   if( pPager->noSync ) pPager->needSync = 0;
2275 }
2276 #endif
2277 
2278 /*
2279 ** Counter of attempts by the library to open a temporary file.  It only
2280 ** exists in builds with SQLITE_TEST defined and is used for testing and
2281 ** analysis only.
2282 */
2283 #ifdef SQLITE_TEST
2284 int sqlite3_opentemp_count = 0;
2285 #endif
2286 
2287 /*
2288 ** Open a temporary file and store its descriptor in *pFile.
2289 **
2290 ** The flags handed to the VFS xOpen() method are the caller's vfsFlags
2291 ** combined with all of:
2292 **
2293 **     SQLITE_OPEN_READWRITE
2294 **     SQLITE_OPEN_CREATE
2295 **     SQLITE_OPEN_EXCLUSIVE
2296 **     SQLITE_OPEN_DELETEONCLOSE
2297 **
2298 ** so the OS deletes the file automatically once it is closed.  Returns
2299 ** SQLITE_OK on success or another error code if the open fails.
2300 */
2301 static int pagerOpentemp(
2302   Pager *pPager,        /* Pager supplying the VFS to open the file with */
2303   sqlite3_file *pFile,  /* OUT: the opened file descriptor */
2304   int vfsFlags          /* Extra flags to pass through to the VFS */
2305 ){
2306   int rc;               /* Result of sqlite3OsOpen() */
2307   /* Flags that every temporary file is opened with */
2308   const int mTempFlags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE
2309                        | SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE;
2310
2311 #ifdef SQLITE_TEST
2312   sqlite3_opentemp_count++;  /* Used for testing and analysis only */
2313 #endif
2314
2315   rc = sqlite3OsOpen(pPager->pVfs, 0, pFile, vfsFlags | mTempFlags, 0);
2316   assert( rc!=SQLITE_OK || isOpen(pFile) );
2317   return rc;
2318 }
2319 
2320 /*
2321 ** Set the busy handler function.  The function pointer and its argument
2322 ** are stored in the pager; the handler is later invoked with
2323 ** pBusyHandlerArg as its only argument.
2324 **
2325 ** The pager invokes the busy-handler if sqlite3OsLock() returns
2326 ** SQLITE_BUSY when upgrading from no-lock to a SHARED lock, or from a
2327 ** RESERVED lock to an EXCLUSIVE lock. It does *not* invoke the busy
2328 ** handler when upgrading from SHARED to RESERVED, or from SHARED to
2329 ** EXCLUSIVE (which occurs during hot-journal rollback). Summary:
2330 **
2331 **   Transition                        | Invokes xBusyHandler
2332 **   --------------------------------------------------------
2333 **   NO_LOCK       -> SHARED_LOCK      | Yes
2334 **   SHARED_LOCK   -> RESERVED_LOCK    | No
2335 **   SHARED_LOCK   -> EXCLUSIVE_LOCK   | No
2336 **   RESERVED_LOCK -> EXCLUSIVE_LOCK   | Yes
2337 **
2338 ** If the busy-handler callback returns non-zero, the lock is retried.
2339 ** If it returns zero, SQLITE_BUSY is returned to the pager API caller.
2340 */
2341 void sqlite3PagerSetBusyhandler(
2342   Pager *pPager,                       /* Pager object */
2343   int (*xBusyHandler)(void *),         /* Pointer to busy-handler function */
2344   void *pBusyHandlerArg                /* Argument to pass to xBusyHandler */
2345 ){
2346   pPager->xBusyHandler = xBusyHandler;
2347   pPager->pBusyHandlerArg = pBusyHandlerArg;
2348 }
2349 
2350 /*
2351 ** Set the reinitializer for this pager. If not NULL, the reinitializer
2352 ** is called when the content of a page in cache is modified (restored)
2353 ** as part of a transaction or savepoint rollback. The callback takes a
2354 ** DbPage pointer and gives higher-level code an opportunity to restore
2355 ** the EXTRA section to agree with the restored page data.
2356 */
2357 void sqlite3PagerSetReiniter(Pager *pPager, void (*xReinit)(DbPage*)){
2358   pPager->xReiniter = xReinit;   /* Replaces any previously stored callback */
2359 }
2360 
2361 /*
2362 ** Change the page size used by the Pager object. The new page size
2363 ** is passed in *pPageSize.
2364 **
2365 ** If the pager is in the error state when this function is called, it
2366 ** is a no-op. The value returned is the error state error code (i.e.
2367 ** one of SQLITE_IOERR, SQLITE_CORRUPT or SQLITE_FULL).
2368 **
2369 ** Otherwise, if all of the following are true:
2370 **
2371 **   * the new page size (value of *pPageSize) is valid (a power
2372 **     of two between 512 and SQLITE_MAX_PAGE_SIZE, inclusive), and
2373 **
2374 **   * there are no outstanding page references, and
2375 **
2376 **   * the database is either not an in-memory database or it is
2377 **     an in-memory database that currently consists of zero pages.
2378 **
2379 ** then the pager object page size is set to *pPageSize.
2380 **
2381 ** If the page size is changed, then this function uses sqlite3PageMalloc()
2382 ** to obtain a new Pager.pTmpSpace buffer. If this allocation attempt
2383 ** fails, SQLITE_NOMEM is returned and the page size remains unchanged.
2384 ** In all other cases, SQLITE_OK is returned.
2385 **
2386 ** If the page size is not changed, either because one of the enumerated
2387 ** conditions above is not true, the pager was in error state when this
2388 ** function was called, or because the memory allocation attempt failed,
2389 ** then *pPageSize is set to the old, retained page size before returning.
2390 */
2391 int sqlite3PagerSetPagesize(Pager *pPager, u16 *pPageSize){
2392   int rc = pPager->errCode;            /* Error-state code, or SQLITE_OK */
2393   if( rc==SQLITE_OK ){
2394     u16 pageSize = *pPageSize;         /* Requested page size; 0 = no change */
2395     assert( pageSize==0 || (pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE) );
2396     if( pageSize && pageSize!=pPager->pageSize
2397      && (pPager->memDb==0 || pPager->dbSize==0)
2398      && sqlite3PcacheRefCount(pPager->pPCache)==0
2399     ){
2400       char *pNew = (char *)sqlite3PageMalloc(pageSize);
2401       if( !pNew ){
2402         rc = SQLITE_NOMEM;
2403       }else{
2404         pager_reset(pPager);
2405         pPager->pageSize = pageSize;
2406         sqlite3PageFree(pPager->pTmpSpace);
2407         pPager->pTmpSpace = pNew;
2408         sqlite3PcacheSetPageSize(pPager->pPCache, pageSize);
2409       }
2410     }
2411   }
2412   *pPageSize = (u16)pPager->pageSize;  /* Reported on every path, errors too */
2413   return rc;
2414 }
2415 
2416 /*
2417 ** Return a pointer to the "temporary page" buffer held internally
2418 ** by the pager (Pager.pTmpSpace).  This is a buffer that is big enough
2419 ** to hold the entire content of a database page.  This buffer is used
2420 ** internally during rollback and will be overwritten whenever a rollback
2421 ** occurs.  But other modules are free to use it too, as long as
2422 ** no rollbacks are happening.
2423 */
2424 void *sqlite3PagerTempSpace(Pager *pPager){
2425   return pPager->pTmpSpace;   /* Still owned by the pager; not a copy */
2426 }
2427 
2428 /*
2429 ** Query, and optionally change, the cap on the number of pages in the
2430 ** database.  A positive mxPage becomes the new cap; zero or a negative
2431 ** value leaves the cap alone.  sqlite3PagerPagecount() is then invoked,
2432 ** which may lift the cap to the current database size if the cap is
2433 ** smaller.  The cap in effect afterwards is returned.
2434 */
2435 int sqlite3PagerMaxPageCount(Pager *pPager, int mxPage){
2436   if( mxPage>0 ) pPager->mxPgno = mxPage;
2437   /* Only the side effect on Pager.mxPgno is wanted here, so no output
2438   ** pointer is supplied and the return code is ignored. */
2439   (void)sqlite3PagerPagecount(pPager, 0);
2440   return pPager->mxPgno;
2441 }
2442 
2443 /*
2444 ** The following set of routines are used to disable the simulated
2445 ** I/O error mechanism.  These routines are used to avoid simulated
2446 ** errors in places where we do not care about errors.
2447 **
2448 ** Unless -DSQLITE_TEST=1 is used, these routines are all no-ops
2449 ** (empty macros) and generate no code.
2450 */
2451 #ifdef SQLITE_TEST
2452 extern int sqlite3_io_error_pending;
2453 extern int sqlite3_io_error_hit;
2454 static int saved_cnt;   /* sqlite3_io_error_pending as saved by disable */
2455 void disable_simulated_io_errors(void){
2456   saved_cnt = sqlite3_io_error_pending;   /* Remember the current setting */
2457   sqlite3_io_error_pending = -1;
2458 }
2459 void enable_simulated_io_errors(void){
2460   sqlite3_io_error_pending = saved_cnt;   /* Put back the saved setting */
2461 }
2462 #else
2463 # define disable_simulated_io_errors()
2464 # define enable_simulated_io_errors()
2465 #endif
2466 
2467 /*
2468 ** Copy the first N bytes of the database file into the buffer pDest.
2469 **
2470 ** The buffer is zeroed before anything is read.  If no file is open
2471 ** (the pager was opened on a transient file, zFilename==""), no read is
2472 ** attempted and the all-zero buffer is returned with SQLITE_OK.  A read
2473 ** that comes up short (SQLITE_IOERR_SHORT_READ, i.e. the file holds
2474 ** fewer than N bytes) is likewise reported as SQLITE_OK.  The rationale
2475 ** is that this function is used to read database headers, and a new
2476 ** transient or zero-sized database has a header that consists entirely
2477 ** of zeroes.
2478 **
2479 ** Any other IO error code is returned to the caller, in which case the
2480 ** contents of the output buffer are undefined.
2481 */
2482 int sqlite3PagerReadFileheader(Pager *pPager, int N, unsigned char *pDest){
2483   int rc;                       /* Result of the read */
2484
2485   memset(pDest, 0, N);
2486   assert( isOpen(pPager->fd) || pPager->tempFile );
2487   if( !isOpen(pPager->fd) ){
2488     return SQLITE_OK;           /* Nothing to read: keep the zeroed buffer */
2489   }
2490   IOTRACE(("DBHDR %p 0 %d\n", pPager, N))
2491   rc = sqlite3OsRead(pPager->fd, pDest, N, 0);
2492   return rc==SQLITE_IOERR_SHORT_READ ? SQLITE_OK : rc;
2493 }
2494 
2495 /*
2496 ** Compute the number of pages in the database file associated with
2497 ** pPager, normally (<db file size>/<page-size>).  A file that is between
2498 ** 1 and <page-size> bytes in size is treated as a 1 page file.  When
2499 ** Pager.dbSizeValid is set the cached Pager.dbSize is used instead of
2500 ** querying the file system.
2501 **
2502 ** If the pager is in the error state when this function is called, the
2503 ** error state error code is returned and *pnPage is left unchanged.  If
2504 ** the file system has to be queried for the file size and that query
2505 ** fails, the IO error code is returned and *pnPage is left unchanged.
2506 **
2507 ** Otherwise SQLITE_OK is returned and, if pnPage is not NULL, *pnPage is
2508 ** set to the number of pages in the database.
2509 */
2510 int sqlite3PagerPagecount(Pager *pPager, int *pnPage){
2511   Pgno nPage;               /* Page count to report via *pnPage */
2512
2513   /* A pager in the error state reports its error code and nothing else. */
2514   if( pPager->errCode ){
2515     return pPager->errCode;
2516   }
2517
2518   if( pPager->dbSizeValid ){
2519     /* The cached size is valid, so the file system need not be queried. */
2520     nPage = pPager->dbSize;
2521   }else{
2522     i64 nByte = 0;          /* Database file size in bytes */
2523
2524     assert( isOpen(pPager->fd) || pPager->tempFile );
2525     if( isOpen(pPager->fd) ){
2526       int rc = sqlite3OsFileSize(pPager->fd, &nByte);
2527       if( rc!=0 ){
2528         pager_error(pPager, rc);
2529         return rc;
2530       }
2531     }
2532
2533     /* A file holding less than one full page still counts as one page. */
2534     if( nByte>0 && nByte<pPager->pageSize ){
2535       nPage = 1;
2536     }else{
2537       nPage = (Pgno)(nByte / pPager->pageSize);
2538     }
2539
2540     /* Cache the result unless the pager is in the PAGER_UNLOCK state. */
2541     if( pPager->state!=PAGER_UNLOCK ){
2542       pPager->dbSize = nPage;
2543       pPager->dbFileSize = nPage;
2544       pPager->dbSizeValid = 1;
2545     }
2546   }
2547
2548   /* Raise the configured page limit if the file already exceeds it, so
2549   ** that the whole file can be read. */
2550   if( nPage>pPager->mxPgno ){
2551     pPager->mxPgno = (Pgno)nPage;
2552   }
2553   if( pnPage ) *pnPage = nPage;
2554   return SQLITE_OK;
2555 }
2556 
2557 
2558 /*
2559 ** Make sure the pager holds a lock of at least type locktype on the
2560 ** database file.  If an equal or stronger lock is already held, return
2561 ** SQLITE_OK immediately without touching the file.
2562 **
2563 ** Otherwise call sqlite3OsLock().  While it reports SQLITE_BUSY the
2564 ** busy callback is invoked; the attempt is repeated for as long as the
2565 ** callback returns true.  The loop ends when the lock is obtained, when
2566 ** the callback returns false, or when sqlite3OsLock() fails with any
2567 ** other error.
2568 **
2569 ** Returns SQLITE_OK on success, in which case Pager.state is set to
2570 ** locktype, or an error code if the lock could not be obtained.
2571 */
2572 static int pager_wait_on_lock(Pager *pPager, int locktype){
2573   int rc = SQLITE_OK;                  /* Value handed back to the caller */
2574
2575   /* The pager states and the OS lock levels share numeric values */
2576   assert( PAGER_SHARED==SHARED_LOCK );
2577   assert( PAGER_RESERVED==RESERVED_LOCK );
2578   assert( PAGER_EXCLUSIVE==EXCLUSIVE_LOCK );
2579
2580   /* A pager that holds no lock cannot have a valid cached size */
2581   assert( pPager->state>=PAGER_SHARED || pPager->dbSizeValid==0 );
2582
2583   /* Either nothing needs doing because the lock is already held, or this
2584   ** is one of the two transitions for which the busy-handler is invoked
2585   ** (see the comment above sqlite3PagerSetBusyhandler()).
2586   */
2587   assert( (pPager->state>=locktype)
2588        || (pPager->state==PAGER_UNLOCK && locktype==PAGER_SHARED)
2589        || (pPager->state==PAGER_RESERVED && locktype==PAGER_EXCLUSIVE)
2590   );
2591
2592   if( pPager->state<locktype ){
2593     /* Retry while the lock is busy and the busy-handler asks for it */
2594     for(;;){
2595       rc = sqlite3OsLock(pPager->fd, locktype);
2596       if( rc!=SQLITE_BUSY ) break;
2597       if( !pPager->xBusyHandler(pPager->pBusyHandlerArg) ) break;
2598     }
2599     if( rc==SQLITE_OK ){
2600       pPager->state = (u8)locktype;
2601       IOTRACE(("LOCK %p %d\n", pPager, locktype))
2602     }
2603   }
2604   return rc;
2605 }
2606 
2607 /*
2608 ** Truncate the in-memory database file image to nPage pages. This
2609 ** function does not actually modify the database file on disk. It
2610 ** just sets Pager.dbSize so that the truncation will be done when the
2611 ** current transaction is committed.
2612 */
2613 void sqlite3PagerTruncateImage(Pager *pPager, Pgno nPage){
2614   assert( pPager->dbSizeValid );              /* Size must already be known */
2615   assert( pPager->dbSize>=nPage );            /* The image may only shrink */
2616   assert( pPager->state>=PAGER_RESERVED );    /* At least a RESERVED lock */
2617   pPager->dbSize = nPage;
2618 }
2619 
2620 /*
2621 ** Shutdown the page cache.  Free all memory and close all files.
2622 **
2623 ** If a transaction was in progress when this routine is called, that
2624 ** transaction is rolled back.  All outstanding pages are invalidated
2625 ** and their memory is freed.  Any attempt to use a page associated
2626 ** with this page cache after this function returns will likely
2627 ** result in a coredump.
2628 **
2629 ** This function always succeeds (it always returns SQLITE_OK). If a
2630 ** transaction is active an attempt is made to roll it back. If an error
2631 ** occurs during the rollback a hot journal may be left in the filesystem
2632 ** but no error is returned to the caller.
2633 */
2634 int sqlite3PagerClose(Pager *pPager){
2635   disable_simulated_io_errors();
2636   sqlite3BeginBenignMalloc();
2637   pPager->errCode = 0;          /* Discard any error state before cleanup */
2638   pPager->exclusiveMode = 0;
2639   pager_reset(pPager);
2640   if( MEMDB ){
2641     pager_unlock(pPager);
2642   }else{
2643     /* Set Pager.journalHdr to -1 for the benefit of the pager_playback()
2644     ** call which may be made from within pagerUnlockAndRollback(). If it
2645     ** is not -1, then the unsynced portion of an open journal file may
2646     ** be played back into the database. If a power failure occurs while
2647     ** this is happening, the database may become corrupt.
2648     */
2649     pPager->journalHdr = -1;
2650     pagerUnlockAndRollback(pPager);
2651   }
2652   sqlite3EndBenignMalloc();
2653   enable_simulated_io_errors();
2654   PAGERTRACE(("CLOSE %d\n", PAGERID(pPager)));
2655   IOTRACE(("CLOSE %p\n", pPager))
2656   sqlite3OsClose(pPager->fd);
2657   sqlite3PageFree(pPager->pTmpSpace);
2658   sqlite3PcacheClose(pPager->pPCache);
2659
2660   assert( !pPager->aSavepoint && !pPager->pInJournal );
2661   assert( !isOpen(pPager->jfd) && !isOpen(pPager->sjfd) );
2662
2663   sqlite3_free(pPager);         /* pPager must not be used after this */
2664   return SQLITE_OK;
2665 }
2666 
2667 #if !defined(NDEBUG) || defined(SQLITE_TEST)
2668 /*
2669 ** Return the page number for page pPg.  Only compiled when NDEBUG is
2670 ** not defined or SQLITE_TEST is defined. */
2671 Pgno sqlite3PagerPagenumber(DbPage *pPg){
2672   return pPg->pgno;
2673 }
2674 #endif
2675 
2676 /*
2677 ** Increment the reference count for page pPg by handing it to
2678 ** sqlite3PcacheRef(). */
2679 void sqlite3PagerRef(DbPage *pPg){
2680   sqlite3PcacheRef(pPg);
2681 }
2682 
2683 /*
2684 ** Sync the journal. In other words, make sure all the pages that have
2685 ** been written to the journal have actually reached the surface of the
2686 ** disk and can be restored in the event of a hot-journal rollback.
2687 **
2688 ** If the Pager.needSync flag is not set, then this function is a
2689 ** no-op. Otherwise, the actions required depend on the journal-mode
2690 ** and the device characteristics of the file-system, as follows:
2691 **
2692 **   * If the journal file is an in-memory journal file, no action need
2693 **     be taken.
2694 **
2695 **   * Otherwise, if the device does not support the SAFE_APPEND property,
2696 **     then the nRec field of the most recently written journal header
2697 **     is updated to contain the number of journal records that have
2698 **     been written following it. In full-sync mode on a device without
2699 **     the SEQUENTIAL property, the journal is synced before that update.
2700 **
2701 **   * If the device does not support the SEQUENTIAL property, then
2702 **     the journal file is synced.
2703 **
2704 ** Or, in pseudo-code:
2705 **
2706 **   if( NOT <in-memory journal> ){
2707 **     if( NOT SAFE_APPEND ){
2708 **       if( <full-sync mode> AND NOT SEQUENTIAL ) xSync(<journal file>);
2709 **       <update nRec field>
2710 **     }
2711 **     if( NOT SEQUENTIAL ) xSync(<journal file>);
2712 **   }
2713 **
2714 ** The Pager.needSync flag is never set for temporary files, or any
2715 ** file operating in no-sync mode (Pager.noSync set to non-zero).
2716 **
2717 ** If successful, this routine clears the PGHDR_NEED_SYNC flag of every
2718 ** page currently held in memory before returning SQLITE_OK. If an IO
2719 ** error is encountered, then the IO error code is returned to the caller.
2720 */
2721 static int syncJournal(Pager *pPager){
2722   if( pPager->needSync ){
2723     assert( !pPager->tempFile );
2724     if( pPager->journalMode!=PAGER_JOURNALMODE_MEMORY ){
2725       int rc;                              /* Return code */
2726       const int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
2727       assert( isOpen(pPager->jfd) );
2728
2729       if( 0==(iDc&SQLITE_IOCAP_SAFE_APPEND) ){
2730         /* Variable iNRecOffset is set to the offset in the journal file
2731         ** of the nRec field of the most recently written journal header.
2732         ** This field will be updated following the xSync() operation
2733         ** on the journal file. */
2734         i64 iNRecOffset = pPager->journalHdr + sizeof(aJournalMagic);
2735
2736         /* This block deals with an obscure problem. If the last connection
2737         ** that wrote to this database was operating in persistent-journal
2738         ** mode, then the journal file may at this point actually be larger
2739         ** than Pager.journalOff bytes. If the next thing in the journal
2740         ** file happens to be a journal-header (written as part of the
2741         ** previous connection's transaction), and a crash or power-failure
2742         ** occurs after nRec is updated but before this connection writes
2743         ** anything else to the journal file (or commits/rolls back its
2744         ** transaction), then SQLite may become confused when doing the
2745         ** hot-journal rollback following recovery. It may roll back all
2746         ** of this connection's data, then proceed to rolling back the old,
2747         ** out-of-date data that follows it. Database corruption.
2748         **
2749         ** To work around this, if the journal file does appear to contain
2750         ** a valid header following Pager.journalOff, then write a 0x00
2751         ** byte to the start of it to prevent it from being recognized.
2752         **
2753         ** Variable iNextHdrOffset is set to the offset at which this
2754         ** problematic header will occur, if it exists. aMagic is used
2755         ** as a temporary buffer to inspect the first 8 bytes of
2756         ** the potential journal header.
2757         */
2758         i64 iNextHdrOffset = journalHdrOffset(pPager);
2759         u8 aMagic[8];
2760         rc = sqlite3OsRead(pPager->jfd, aMagic, 8, iNextHdrOffset);
2761         if( rc==SQLITE_OK && 0==memcmp(aMagic, aJournalMagic, 8) ){
2762           static const u8 zerobyte = 0;
2763           rc = sqlite3OsWrite(pPager->jfd, &zerobyte, 1, iNextHdrOffset);
2764         }
2765         if( rc!=SQLITE_OK && rc!=SQLITE_IOERR_SHORT_READ ){  /* short read ok */
2766           return rc;
2767         }
2768
2769         /* Write the nRec value into the journal file header. If in
2770         ** full-synchronous mode, sync the journal first. This ensures that
2771         ** all data has really hit the disk before nRec is updated to mark
2772         ** it as a candidate for rollback.
2773         **
2774         ** This is not required if the persistent media supports the
2775         ** SAFE_APPEND property. Because in this case it is not possible
2776         ** for garbage data to be appended to the file, the nRec field
2777         ** is populated with 0xFFFFFFFF when the journal header is written
2778         ** and never needs to be updated.
2779         */
2780         if( pPager->fullSync && 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){
2781           PAGERTRACE(("SYNC journal of %d\n", PAGERID(pPager)));
2782           IOTRACE(("JSYNC %p\n", pPager))
2783           rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags);
2784           if( rc!=SQLITE_OK ) return rc;
2785         }
2786         IOTRACE(("JHDR %p %lld %d\n", pPager, iNRecOffset, 4));
2787         rc = write32bits(pPager->jfd, iNRecOffset, pPager->nRec);
2788         if( rc!=SQLITE_OK ) return rc;
2789       }
2790       if( 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){
2791         PAGERTRACE(("SYNC journal of %d\n", PAGERID(pPager)));
2792         IOTRACE(("JSYNC %p\n", pPager))
2793         rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags|
2794           (pPager->sync_flags==SQLITE_SYNC_FULL?SQLITE_SYNC_DATAONLY:0)
2795         );
2796         if( rc!=SQLITE_OK ) return rc;
2797       }
2798     }
2799
2800     /* The journal file is now synced, or needed no sync. Set Pager.needSync
2801     ** to zero and clear the PGHDR_NEED_SYNC flag on all pages.
2802     */
2803     pPager->needSync = 0;
2804     pPager->journalStarted = 1;
2805     sqlite3PcacheClearSyncFlags(pPager->pPCache);
2806   }
2807
2808   return SQLITE_OK;
2809 }
2810 
2811 /*
2812 ** The argument is the first in a linked list of dirty pages connected
2813 ** by the PgHdr.pDirty pointer. This function writes each one of the
2814 ** in-memory pages in the list to the database file. The argument may
2815 ** be NULL, representing an empty list. In this case this function is
2816 ** a no-op.
2817 **
2818 ** The pager must hold at least a RESERVED lock when this function
2819 ** is called. Before writing anything to the database file, this lock
2820 ** is upgraded to an EXCLUSIVE lock. If the lock cannot be obtained,
2821 ** SQLITE_BUSY is returned and no data is written to the database file.
2822 **
2823 ** If the pager is a temp-file pager and the actual file-system file
2824 ** is not yet open, it is created and opened before any data is
2825 ** written out.
2826 **
2827 ** Once the lock has been upgraded and, if necessary, the file opened,
2828 ** the pages are written out to the database file in list order. Writing
2829 ** a page is skipped if it meets either of the following criteria:
2830 **
2831 **   * The page number is greater than Pager.dbSize, or
2832 **   * The PGHDR_DONT_WRITE flag is set on the page.
2833 **
2834 ** If writing out a page causes the database file to grow, Pager.dbFileSize
2835 ** is updated accordingly. If page 1 is written out, then the value cached
2836 ** in Pager.dbFileVers[] is updated to match the new value stored in
2837 ** the database file.
2838 **
2839 ** If everything is successful, SQLITE_OK is returned. If an IO error
2840 ** occurs, an IO error code is returned. Or, if the EXCLUSIVE lock cannot
2841 ** be obtained, SQLITE_BUSY is returned.
2842 */
2843 static int pager_write_pagelist(PgHdr *pList){
2844   Pager *pPager;                       /* Pager object */
2845   int rc;                              /* Return code */
2846
2847   if( pList==0 ) return SQLITE_OK;
2848   pPager = pList->pPager;
2849
2850   /* At this point there may be either a RESERVED or EXCLUSIVE lock on the
2851   ** database file. If there is already an EXCLUSIVE lock, the following
2852   ** call is a no-op.
2853   **
2854   ** Moving the lock from RESERVED to EXCLUSIVE actually involves going
2855   ** through an intermediate state PENDING.   A PENDING lock prevents new
2856   ** readers from attaching to the database but is insufficient for us to
2857   ** write.  The idea of a PENDING lock is to prevent new readers from
2858   ** coming in while we wait for existing readers to clear.
2859   **
2860   ** While the pager is in the RESERVED state, the original database file
2861   ** is unchanged and we can rollback without having to playback the
2862   ** journal into the original database file.  Once we transition to
2863   ** EXCLUSIVE, it means the database file has been changed and any rollback
2864   ** will require a journal playback.
2865   */
2866   assert( pPager->state>=PAGER_RESERVED );
2867   rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
2868
2869   /* If the file is a temp-file that has not yet been opened, open it now.
2870   ** It is not possible for rc to be other than SQLITE_OK if this branch
2871   ** is taken, as pager_wait_on_lock() is a no-op for temp-files.
2872   */
2873   if( !isOpen(pPager->fd) ){
2874     assert( pPager->tempFile && rc==SQLITE_OK );
2875     rc = pagerOpentemp(pPager, pPager->fd, pPager->vfsFlags);
2876   }
2877
2878   while( rc==SQLITE_OK && pList ){
2879     Pgno pgno = pList->pgno;
2880
2881     /* If there are dirty pages in the page cache with page numbers greater
2882     ** than Pager.dbSize, this means sqlite3PagerTruncateImage() was called to
2883     ** make the file smaller (presumably by auto-vacuum code). Do not write
2884     ** any such pages to the file.
2885     **
2886     ** Also, do not write out any page that has the PGHDR_DONT_WRITE flag
2887     ** set (set by sqlite3PagerDontWrite()).
2888     */
2889     if( pgno<=pPager->dbSize && 0==(pList->flags&PGHDR_DONT_WRITE) ){
2890       i64 offset = (pgno-1)*(i64)pPager->pageSize;         /* Offset to write */
2891       char *pData = CODEC2(pPager, pList->pData, pgno, 6); /* Data to write */
2892
2893       /* Write out the page data.  A failure is caught by the loop test. */
2894       rc = sqlite3OsWrite(pPager->fd, pData, pPager->pageSize, offset);
2895
2896       /* If page 1 was just written, update Pager.dbFileVers to match
2897       ** the value now stored in the database file. If writing this
2898       ** page caused the database file to grow, update dbFileSize.
2899       */
2900       if( pgno==1 ){
2901         memcpy(&pPager->dbFileVers, &pData[24], sizeof(pPager->dbFileVers));
2902       }
2903       if( pgno>pPager->dbFileSize ){
2904         pPager->dbFileSize = pgno;
2905       }
2906
2907       /* Update any backup objects copying the contents of this pager. */
2908       sqlite3BackupUpdate(pPager->pBackup, pgno, (u8 *)pData);
2909
2910       PAGERTRACE(("STORE %d page %d hash(%08x)\n",
2911                    PAGERID(pPager), pgno, pager_pagehash(pList)));
2912       IOTRACE(("PGOUT %p %d\n", pPager, pgno));
2913       PAGER_INCR(sqlite3_pager_writedb_count);
2914       PAGER_INCR(pPager->nWrite);
2915     }else{
2916       PAGERTRACE(("NOSTORE %d page %d\n", PAGERID(pPager), pgno));
2917     }
2918 #ifdef SQLITE_CHECK_PAGES
2919     pList->pageHash = pager_pagehash(pList);
2920 #endif
2921     pList = pList->pDirty;   /* Advance to the next dirty page in the list */
2922   }
2923
2924   return rc;
2925 }
2926 
2927 /*
2928 ** Append a record of the current state of page pPg to the sub-journal.
2929 ** It is the callers responsibility to use subjRequiresPage() to check
2930 ** that it is really required before calling this function.
2931 **
2932 ** If successful, set the bit corresponding to pPg->pgno in the bitvecs
2933 ** for all open savepoints before returning.
2934 **
2935 ** This function returns SQLITE_OK if everything is successful, an IO
2936 ** error code if the attempt to write to the sub-journal fails, or
2937 ** SQLITE_NOMEM if a malloc fails while setting a bit in a savepoint
2938 ** bitvec.
2939 */
2940 static int subjournalPage(PgHdr *pPg){
2941   int rc = SQLITE_OK;
2942   Pager *pPager = pPg->pPager;
2943   if( isOpen(pPager->sjfd) ){
2944     void *pData = pPg->pData;
2945     i64 offset = pPager->nSubRec*(4+pPager->pageSize);
2946     char *pData2 = CODEC2(pPager, pData, pPg->pgno, 7);
2947 
2948     PAGERTRACE(("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno));
2949 
2950     assert( pageInJournal(pPg) || pPg->pgno>pPager->dbOrigSize );
2951     rc = write32bits(pPager->sjfd, offset, pPg->pgno);
2952     if( rc==SQLITE_OK ){
2953       rc = sqlite3OsWrite(pPager->sjfd, pData2, pPager->pageSize, offset+4);
2954     }
2955   }
2956   if( rc==SQLITE_OK ){
2957     pPager->nSubRec++;
2958     assert( pPager->nSavepoint>0 );
2959     rc = addToSavepointBitvecs(pPager, pPg->pgno);
2960     testcase( rc!=SQLITE_OK );
2961   }
2962   return rc;
2963 }
2964 
2965 
2966 /*
2967 ** This function is called by the pcache layer when it has reached some
2968 ** soft memory limit. The first argument is a pointer to a Pager object
2969 ** (cast as a void*). The pager is always 'purgeable' (not an in-memory
2970 ** database). The second argument is a reference to a page that is
2971 ** currently dirty but has no outstanding references. The page
2972 ** is always associated with the Pager object passed as the first
2973 ** argument.
2974 **
2975 ** The job of this function is to make pPg clean by writing its contents
2976 ** out to the database file, if possible. This may involve syncing the
2977 ** journal file.
2978 **
2979 ** If successful, sqlite3PcacheMakeClean() is called on the page and
2980 ** SQLITE_OK returned. If an IO error occurs while trying to make the
2981 ** page clean, the IO error code is returned. If the page cannot be
2982 ** made clean for some other reason, but no error occurs, then SQLITE_OK
2983 ** is returned by sqlite3PcacheMakeClean() is not called.
2984 */
2985 static int pagerStress(void *p, PgHdr *pPg){
2986   Pager *pPager = (Pager *)p;
2987   int rc = SQLITE_OK;
2988 
2989   assert( pPg->pPager==pPager );
2990   assert( pPg->flags&PGHDR_DIRTY );
2991 
2992   /* The doNotSync flag is set by the sqlite3PagerWrite() function while it
2993   ** is journalling a set of two or more database pages that are stored
2994   ** on the same disk sector. Syncing the journal is not allowed while
2995   ** this is happening as it is important that all members of such a
2996   ** set of pages are synced to disk together. So, if the page this function
2997   ** is trying to make clean will require a journal sync and the doNotSync
2998   ** flag is set, return without doing anything. The pcache layer will
2999   ** just have to go ahead and allocate a new page buffer instead of
3000   ** reusing pPg.
3001   **
3002   ** Similarly, if the pager has already entered the error state, do not
3003   ** try to write the contents of pPg to disk.
3004   */
3005   if( pPager->errCode || (pPager->doNotSync && pPg->flags&PGHDR_NEED_SYNC) ){
3006     return SQLITE_OK;
3007   }
3008 
3009   /* Sync the journal file if required. */
3010   if( pPg->flags&PGHDR_NEED_SYNC ){
3011     rc = syncJournal(pPager);
3012     if( rc==SQLITE_OK && pPager->fullSync &&
3013       !(pPager->journalMode==PAGER_JOURNALMODE_MEMORY) &&
3014       !(sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND)
3015     ){
3016       pPager->nRec = 0;
3017       rc = writeJournalHdr(pPager);
3018     }
3019   }
3020 
3021   /* If the page number of this page is larger than the current size of
3022   ** the database image, it may need to be written to the sub-journal.
3023   ** This is because the call to pager_write_pagelist() below will not
3024   ** actually write data to the file in this case.
3025   **
3026   ** Consider the following sequence of events:
3027   **
3028   **   BEGIN;
3029   **     <journal page X>
3030   **     <modify page X>
3031   **     SAVEPOINT sp;
3032   **       <shrink database file to Y pages>
3033   **       pagerStress(page X)
3034   **     ROLLBACK TO sp;
3035   **
3036   ** If (X>Y), then when pagerStress is called page X will not be written
3037   ** out to the database file, but will be dropped from the cache. Then,
3038   ** following the "ROLLBACK TO sp" statement, reading page X will read
3039   ** data from the database file. This will be the copy of page X as it
3040   ** was when the transaction started, not as it was when "SAVEPOINT sp"
3041   ** was executed.
3042   **
3043   ** The solution is to write the current data for page X into the
3044   ** sub-journal file now (if it is not already there), so that it will
3045   ** be restored to its current value when the "ROLLBACK TO sp" is
3046   ** executed.
3047   */
3048   if( rc==SQLITE_OK && pPg->pgno>pPager->dbSize && subjRequiresPage(pPg) ){
3049     rc = subjournalPage(pPg);
3050   }
3051 
3052   /* Write the contents of the page out to the database file. */
3053   if( rc==SQLITE_OK ){
3054     pPg->pDirty = 0;
3055     rc = pager_write_pagelist(pPg);
3056   }
3057 
3058   /* Mark the page as clean. */
3059   if( rc==SQLITE_OK ){
3060     PAGERTRACE(("STRESS %d page %d\n", PAGERID(pPager), pPg->pgno));
3061     sqlite3PcacheMakeClean(pPg);
3062   }
3063 
3064   return pager_error(pPager, rc);
3065 }
3066 
3067 
3068 /*
3069 ** Allocate and initialize a new Pager object and put a pointer to it
3070 ** in *ppPager. The pager should eventually be freed by passing it
3071 ** to sqlite3PagerClose().
3072 **
3073 ** The zFilename argument is the path to the database file to open.
3074 ** If zFilename is NULL then a randomly-named temporary file is created
3075 ** and used as the file to be cached. Temporary files are be deleted
3076 ** automatically when they are closed. If zFilename is ":memory:" then
3077 ** all information is held in cache. It is never written to disk.
3078 ** This can be used to implement an in-memory database.
3079 **
3080 ** The nExtra parameter specifies the number of bytes of space allocated
3081 ** along with each page reference. This space is available to the user
3082 ** via the sqlite3PagerGetExtra() API.
3083 **
3084 ** The flags argument is used to specify properties that affect the
3085 ** operation of the pager. It should be passed some bitwise combination
3086 ** of the PAGER_OMIT_JOURNAL and PAGER_NO_READLOCK flags.
3087 **
3088 ** The vfsFlags parameter is a bitmask to pass to the flags parameter
3089 ** of the xOpen() method of the supplied VFS when opening files.
3090 **
3091 ** If the pager object is allocated and the specified file opened
3092 ** successfully, SQLITE_OK is returned and *ppPager set to point to
3093 ** the new pager object. If an error occurs, *ppPager is set to NULL
3094 ** and error code returned. This function may return SQLITE_NOMEM
3095 ** (sqlite3Malloc() is used to allocate memory), SQLITE_CANTOPEN or
3096 ** various SQLITE_IO_XXX errors.
3097 */
3098 int sqlite3PagerOpen(
3099   sqlite3_vfs *pVfs,       /* The virtual file system to use */
3100   Pager **ppPager,         /* OUT: Return the Pager structure here */
3101   const char *zFilename,   /* Name of the database file to open */
3102   int nExtra,              /* Extra bytes append to each in-memory page */
3103   int flags,               /* flags controlling this file */
3104   int vfsFlags             /* flags passed through to sqlite3_vfs.xOpen() */
3105 ){
3106   u8 *pPtr;
3107   Pager *pPager = 0;       /* Pager object to allocate and return */
3108   int rc = SQLITE_OK;      /* Return code */
3109   int tempFile = 0;        /* True for temp files (incl. in-memory files) */
3110   int memDb = 0;           /* True if this is an in-memory file */
3111   int readOnly = 0;        /* True if this is a read-only file */
3112   int journalFileSize;     /* Bytes to allocate for each journal fd */
3113   char *zPathname = 0;     /* Full path to database file */
3114   int nPathname = 0;       /* Number of bytes in zPathname */
3115   int useJournal = (flags & PAGER_OMIT_JOURNAL)==0; /* False to omit journal */
3116   int noReadlock = (flags & PAGER_NO_READLOCK)!=0;  /* True to omit read-lock */
3117   int pcacheSize = sqlite3PcacheSize();       /* Bytes to allocate for PCache */
3118   u16 szPageDflt = SQLITE_DEFAULT_PAGE_SIZE;  /* Default page size */
3119 
3120   /* Figure out how much space is required for each journal file-handle
3121   ** (there are two of them, the main journal and the sub-journal). This
3122   ** is the maximum space required for an in-memory journal file handle
3123   ** and a regular journal file-handle. Note that a "regular journal-handle"
3124   ** may be a wrapper capable of caching the first portion of the journal
3125   ** file in memory to implement the atomic-write optimization (see
3126   ** source file journal.c).
3127   */
3128   if( sqlite3JournalSize(pVfs)>sqlite3MemJournalSize() ){
3129     journalFileSize = ROUND8(sqlite3JournalSize(pVfs));
3130   }else{
3131     journalFileSize = ROUND8(sqlite3MemJournalSize());
3132   }
3133 
3134   /* Set the output variable to NULL in case an error occurs. */
3135   *ppPager = 0;
3136 
3137   /* Compute and store the full pathname in an allocated buffer pointed
3138   ** to by zPathname, length nPathname. Or, if this is a temporary file,
3139   ** leave both nPathname and zPathname set to 0.
3140   */
3141   if( zFilename && zFilename[0] ){
3142     nPathname = pVfs->mxPathname+1;
3143     zPathname = sqlite3Malloc(nPathname*2);
3144     if( zPathname==0 ){
3145       return SQLITE_NOMEM;
3146     }
3147 #ifndef SQLITE_OMIT_MEMORYDB
3148     if( strcmp(zFilename,":memory:")==0 ){
3149       memDb = 1;
3150       zPathname[0] = 0;
3151     }else
3152 #endif
3153     {
3154       zPathname[0] = 0; /* Make sure initialized even if FullPathname() fails */
3155       rc = sqlite3OsFullPathname(pVfs, zFilename, nPathname, zPathname);
3156     }
3157 
3158     nPathname = sqlite3Strlen30(zPathname);
3159     if( rc==SQLITE_OK && nPathname+8>pVfs->mxPathname ){
3160       /* This branch is taken when the journal path required by
3161       ** the database being opened will be more than pVfs->mxPathname
3162       ** bytes in length. This means the database cannot be opened,
3163       ** as it will not be possible to open the journal file or even
3164       ** check for a hot-journal before reading.
3165       */
3166       rc = SQLITE_CANTOPEN;
3167     }
3168     if( rc!=SQLITE_OK ){
3169       sqlite3_free(zPathname);
3170       return rc;
3171     }
3172   }
3173 
3174   /* Allocate memory for the Pager structure, PCache object, the
3175   ** three file descriptors, the database file name and the journal
3176   ** file name. The layout in memory is as follows:
3177   **
3178   **     Pager object                    (sizeof(Pager) bytes)
3179   **     PCache object                   (sqlite3PcacheSize() bytes)
3180   **     Database file handle            (pVfs->szOsFile bytes)
3181   **     Sub-journal file handle         (journalFileSize bytes)
3182   **     Main journal file handle        (journalFileSize bytes)
3183   **     Database file name              (nPathname+1 bytes)
3184   **     Journal file name               (nPathname+8+1 bytes)
3185   */
3186   pPtr = (u8 *)sqlite3MallocZero(
3187     ROUND8(sizeof(*pPager)) +      /* Pager structure */
3188     ROUND8(pcacheSize) +           /* PCache object */
3189     ROUND8(pVfs->szOsFile) +       /* The main db file */
3190     journalFileSize * 2 +          /* The two journal files */
3191     nPathname + 1 +                /* zFilename */
3192     nPathname + 8 + 1              /* zJournal */
3193   );
3194   assert( EIGHT_BYTE_ALIGNMENT(journalFileSize) );
3195   if( !pPtr ){
3196     sqlite3_free(zPathname);
3197     return SQLITE_NOMEM;
3198   }
3199   pPager =              (Pager*)(pPtr);
3200   pPager->pPCache =    (PCache*)(pPtr += ROUND8(sizeof(*pPager)));
3201   pPager->fd =   (sqlite3_file*)(pPtr += ROUND8(pcacheSize));
3202   pPager->sjfd = (sqlite3_file*)(pPtr += ROUND8(pVfs->szOsFile));
3203   pPager->jfd =  (sqlite3_file*)(pPtr += journalFileSize);
3204   pPager->zFilename =    (char*)(pPtr += journalFileSize);
3205   assert( EIGHT_BYTE_ALIGNMENT(pPager->jfd) );
3206 
3207   /* Fill in the Pager.zFilename and Pager.zJournal buffers, if required. */
3208   if( zPathname ){
3209     pPager->zJournal =   (char*)(pPtr += nPathname + 1);
3210     memcpy(pPager->zFilename, zPathname, nPathname);
3211     memcpy(pPager->zJournal, zPathname, nPathname);
3212     memcpy(&pPager->zJournal[nPathname], "-journal", 8);
3213     sqlite3_free(zPathname);
3214   }
3215   pPager->pVfs = pVfs;
3216   pPager->vfsFlags = vfsFlags;
3217 
3218   /* Open the pager file.
3219   */
3220   if( zFilename && zFilename[0] && !memDb ){
3221     int fout = 0;                    /* VFS flags returned by xOpen() */
3222     rc = sqlite3OsOpen(pVfs, pPager->zFilename, pPager->fd, vfsFlags, &fout);
3223     readOnly = (fout&SQLITE_OPEN_READONLY);
3224 
3225     /* If the file was successfully opened for read/write access,
3226     ** choose a default page size in case we have to create the
3227     ** database file. The default page size is the maximum of:
3228     **
3229     **    + SQLITE_DEFAULT_PAGE_SIZE,
3230     **    + The value returned by sqlite3OsSectorSize()
3231     **    + The largest page size that can be written atomically.
3232     */
3233     if( rc==SQLITE_OK && !readOnly ){
3234       setSectorSize(pPager);
3235       assert(SQLITE_DEFAULT_PAGE_SIZE<=SQLITE_MAX_DEFAULT_PAGE_SIZE);
3236       if( szPageDflt<pPager->sectorSize ){
3237         if( pPager->sectorSize>SQLITE_MAX_DEFAULT_PAGE_SIZE ){
3238           szPageDflt = SQLITE_MAX_DEFAULT_PAGE_SIZE;
3239         }else{
3240           szPageDflt = (u16)pPager->sectorSize;
3241         }
3242       }
3243 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
3244       {
3245         int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
3246         int ii;
3247         assert(SQLITE_IOCAP_ATOMIC512==(512>>8));
3248         assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8));
3249         assert(SQLITE_MAX_DEFAULT_PAGE_SIZE<=65536);
3250         for(ii=szPageDflt; ii<=SQLITE_MAX_DEFAULT_PAGE_SIZE; ii=ii*2){
3251           if( iDc&(SQLITE_IOCAP_ATOMIC|(ii>>8)) ){
3252             szPageDflt = ii;
3253           }
3254         }
3255       }
3256 #endif
3257     }
3258   }else{
3259     /* If a temporary file is requested, it is not opened immediately.
3260     ** In this case we accept the default page size and delay actually
3261     ** opening the file until the first call to OsWrite().
3262     **
3263     ** This branch is also run for an in-memory database. An in-memory
3264     ** database is the same as a temp-file that is never written out to
3265     ** disk and uses an in-memory rollback journal.
3266     */
3267     tempFile = 1;
3268     pPager->state = PAGER_EXCLUSIVE;
3269   }
3270 
3271   /* The following call to PagerSetPagesize() serves to set the value of
3272   ** Pager.pageSize and to allocate the Pager.pTmpSpace buffer.
3273   */
3274   if( rc==SQLITE_OK ){
3275     assert( pPager->memDb==0 );
3276     rc = sqlite3PagerSetPagesize(pPager, &szPageDflt);
3277     testcase( rc!=SQLITE_OK );
3278   }
3279 
3280   /* If an error occurred in either of the blocks above, free the
3281   ** Pager structure and close the file.
3282   */
3283   if( rc!=SQLITE_OK ){
3284     assert( !pPager->pTmpSpace );
3285     sqlite3OsClose(pPager->fd);
3286     sqlite3_free(pPager);
3287     return rc;
3288   }
3289 
3290   /* Initialize the PCache object. */
3291   nExtra = ROUND8(nExtra);
3292   sqlite3PcacheOpen(szPageDflt, nExtra, !memDb,
3293                     !memDb?pagerStress:0, (void *)pPager, pPager->pPCache);
3294 
3295   PAGERTRACE(("OPEN %d %s\n", FILEHANDLEID(pPager->fd), pPager->zFilename));
3296   IOTRACE(("OPEN %p %s\n", pPager, pPager->zFilename))
3297 
3298   pPager->useJournal = (u8)useJournal;
3299   pPager->noReadlock = (noReadlock && readOnly) ?1:0;
3300   /* pPager->stmtOpen = 0; */
3301   /* pPager->stmtInUse = 0; */
3302   /* pPager->nRef = 0; */
3303   pPager->dbSizeValid = (u8)memDb;
3304   /* pPager->stmtSize = 0; */
3305   /* pPager->stmtJSize = 0; */
3306   /* pPager->nPage = 0; */
3307   pPager->mxPgno = SQLITE_MAX_PAGE_COUNT;
3308   /* pPager->state = PAGER_UNLOCK; */
3309   assert( pPager->state == (tempFile ? PAGER_EXCLUSIVE : PAGER_UNLOCK) );
3310   /* pPager->errMask = 0; */
3311   pPager->tempFile = (u8)tempFile;
3312   assert( tempFile==PAGER_LOCKINGMODE_NORMAL
3313           || tempFile==PAGER_LOCKINGMODE_EXCLUSIVE );
3314   assert( PAGER_LOCKINGMODE_EXCLUSIVE==1 );
3315   pPager->exclusiveMode = (u8)tempFile;
3316   pPager->changeCountDone = pPager->tempFile;
3317   pPager->memDb = (u8)memDb;
3318   pPager->readOnly = (u8)readOnly;
3319   /* pPager->needSync = 0; */
3320   pPager->noSync = (pPager->tempFile || !useJournal) ?1:0;
3321   pPager->fullSync = pPager->noSync ?0:1;
3322   pPager->sync_flags = SQLITE_SYNC_NORMAL;
3323   /* pPager->pFirst = 0; */
3324   /* pPager->pFirstSynced = 0; */
3325   /* pPager->pLast = 0; */
3326   pPager->nExtra = nExtra;
3327   pPager->journalSizeLimit = SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT;
3328   assert( isOpen(pPager->fd) || tempFile );
3329   setSectorSize(pPager);
3330   if( memDb ){
3331     pPager->journalMode = PAGER_JOURNALMODE_MEMORY;
3332   }
3333   /* pPager->xBusyHandler = 0; */
3334   /* pPager->pBusyHandlerArg = 0; */
3335   /* memset(pPager->aHash, 0, sizeof(pPager->aHash)); */
3336   *ppPager = pPager;
3337   return SQLITE_OK;
3338 }
3339 
3340 
3341 
3342 /*
3343 ** This function is called after transitioning from PAGER_UNLOCK to
3344 ** PAGER_SHARED state. It tests if there is a hot journal present in
3345 ** the file-system for the given pager. A hot journal is one that
3346 ** needs to be played back. According to this function, a hot-journal
3347 ** file exists if the following criteria are met:
3348 **
3349 **   * The journal file exists in the file system, and
3350 **   * No process holds a RESERVED or greater lock on the database file, and
3351 **   * The database file itself is greater than 0 bytes in size, and
3352 **   * The first byte of the journal file exists and is not 0x00.
3353 **
3354 ** If the current size of the database file is 0 but a journal file
3355 ** exists, that is probably an old journal left over from a prior
3356 ** database with the same name. In this case the journal file is
3357 ** just deleted using OsDelete, *pExists is set to 0 and SQLITE_OK
3358 ** is returned.
3359 **
3360 ** This routine does not check if there is a master journal filename
3361 ** at the end of the file. If there is, and that master journal file
3362 ** does not exist, then the journal file is not really hot. In this
3363 ** case this routine will return a false-positive. The pager_playback()
3364 ** routine will discover that the journal file is not really hot and
3365 ** will not roll it back.
3366 **
3367 ** If a hot-journal file is found to exist, *pExists is set to 1 and
3368 ** SQLITE_OK returned. If no hot-journal file is present, *pExists is
3369 ** set to 0 and SQLITE_OK returned. If an IO error occurs while trying
3370 ** to determine whether or not a hot-journal file exists, the IO error
3371 ** code is returned and the value of *pExists is undefined.
3372 */
3373 static int hasHotJournal(Pager *pPager, int *pExists){
3374   sqlite3_vfs * const pVfs = pPager->pVfs;
3375   int rc;                       /* Return code */
3376   int exists;                   /* True if a journal file is present */
3377 
3378   assert( pPager!=0 );
3379   assert( pPager->useJournal );
3380   assert( isOpen(pPager->fd) );
3381   assert( !isOpen(pPager->jfd) );
3382 
3383   *pExists = 0;
3384   rc = sqlite3OsAccess(pVfs, pPager->zJournal, SQLITE_ACCESS_EXISTS, &exists);
3385   if( rc==SQLITE_OK && exists ){
3386     int locked;                 /* True if some process holds a RESERVED lock */
3387     rc = sqlite3OsCheckReservedLock(pPager->fd, &locked);
3388     if( rc==SQLITE_OK && !locked ){
3389       int nPage;
3390 
3391       /* Check the size of the database file. If it consists of 0 pages,
3392       ** then delete the journal file. See the header comment above for
3393       ** the reasoning here.
3394       */
3395       rc = sqlite3PagerPagecount(pPager, &nPage);
3396       if( rc==SQLITE_OK ){
3397         if( nPage==0 ){
3398           rc = sqlite3OsDelete(pVfs, pPager->zJournal, 0);
3399         }else{
3400           /* The journal file exists and no other connection has a reserved
3401           ** or greater lock on the database file. Now check that there is
3402           ** at least one non-zero bytes at the start of the journal file.
3403           ** If there is, then we consider this journal to be hot. If not,
3404           ** it can be ignored.
3405           */
3406           int f = SQLITE_OPEN_READONLY|SQLITE_OPEN_MAIN_JOURNAL;
3407           rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, f, &f);
3408           if( rc==SQLITE_OK ){
3409             u8 first = 0;
3410             rc = sqlite3OsRead(pPager->jfd, (void *)&first, 1, 0);
3411             if( rc==SQLITE_IOERR_SHORT_READ ){
3412               rc = SQLITE_OK;
3413             }
3414             sqlite3OsClose(pPager->jfd);
3415             *pExists = (first!=0);
3416           }
3417         }
3418       }
3419     }
3420   }
3421 
3422   return rc;
3423 }
3424 
3425 /*
3426 ** Read the content for page pPg out of the database file and into
3427 ** pPg->pData. A shared lock or greater must be held on the database
3428 ** file before this function is called.
3429 **
3430 ** If page 1 is read, then the value of Pager.dbFileVers[] is set to
3431 ** the value read from the database file.
3432 **
3433 ** If an IO error occurs, then the IO error is returned to the caller.
3434 ** Otherwise, SQLITE_OK is returned.
3435 */
3436 static int readDbPage(PgHdr *pPg){
3437   Pager *pPager = pPg->pPager; /* Pager object associated with page pPg */
3438   Pgno pgno = pPg->pgno;       /* Page number to read */
3439   int rc;                      /* Return code */
3440   i64 iOffset;                 /* Byte offset of file to read from */
3441 
3442   assert( pPager->state>=PAGER_SHARED && !MEMDB );
3443 
3444   if( !isOpen(pPager->fd) ){
3445     assert( pPager->tempFile );
3446     memset(pPg->pData, 0, pPager->pageSize);
3447     return SQLITE_OK;
3448   }
3449   iOffset = (pgno-1)*(i64)pPager->pageSize;
3450   rc = sqlite3OsRead(pPager->fd, pPg->pData, pPager->pageSize, iOffset);
3451   if( rc==SQLITE_IOERR_SHORT_READ ){
3452     rc = SQLITE_OK;
3453   }
3454   if( pgno==1 ){
3455     u8 *dbFileVers = &((u8*)pPg->pData)[24];
3456     memcpy(&pPager->dbFileVers, dbFileVers, sizeof(pPager->dbFileVers));
3457   }
3458   CODEC1(pPager, pPg->pData, pgno, 3);
3459 
3460   PAGER_INCR(sqlite3_pager_readdb_count);
3461   PAGER_INCR(pPager->nRead);
3462   IOTRACE(("PGIN %p %d\n", pPager, pgno));
3463   PAGERTRACE(("FETCH %d page %d hash(%08x)\n",
3464                PAGERID(pPager), pgno, pager_pagehash(pPg)));
3465 
3466   return rc;
3467 }
3468 
3469 /*
3470 ** This function is called whenever the upper layer requests a database
3471 ** page, before the cache is checked for a suitable page
3472 ** or any data is read from the database. It performs the following
3473 ** two functions:
3474 **
3475 **   1) If the pager is currently in PAGER_UNLOCK state (no lock held
3476 **      on the database file), then an attempt is made to obtain a
3477 **      SHARED lock on the database file. Immediately after obtaining
3478 **      the SHARED lock, the file-system is checked for a hot-journal,
3479 **      which is played back if present. Following any hot-journal
3480 **      rollback, the contents of the cache are validated by checking
3481 **      the 'change-counter' field of the database file header and
3482 **      discarded if they are found to be invalid.
3483 **
3484 **   2) If the pager is running in exclusive-mode, and there are currently
3485 **      no outstanding references to any pages, and is in the error state,
3486 **      then an attempt is made to clear the error state by discarding
3487 **      the contents of the page cache and rolling back any open journal
3488 **      file.
3489 **
3490 ** If the operation described by (2) above is not attempted, and if the
3491 ** pager is in an error state other than SQLITE_FULL when this is called,
3492 ** the error state error code is returned. It is permitted to read the
3493 ** database when in SQLITE_FULL error state.
3494 **
3495 ** Otherwise, if everything is successful, SQLITE_OK is returned. If an
3496 ** IO error occurs while locking the database, checking for a hot-journal
3497 ** file or rolling back a journal file, the IO error code is returned.
3498 */
3499 static int pagerSharedLock(Pager *pPager){
3500   int rc = SQLITE_OK;                /* Return code */
3501   int isErrorReset = 0;              /* True if recovering from error state */
3502 
3503   /* If this database is opened for exclusive access, has no outstanding
3504   ** page references and is in an error-state, this is a chance to clear
3505   ** the error. Discard the contents of the pager-cache and treat any
3506   ** open journal file as a hot-journal.
3507   */
3508   if( !MEMDB && pPager->exclusiveMode
3509    && sqlite3PcacheRefCount(pPager->pPCache)==0 && pPager->errCode
3510   ){
3511     if( isOpen(pPager->jfd) ){
3512       isErrorReset = 1;
3513     }
3514     pPager->errCode = SQLITE_OK;
3515     pager_reset(pPager);
3516   }
3517 
3518   /* If the pager is still in an error state, do not proceed. The error
3519   ** state will be cleared at some point in the future when all page
3520   ** references are dropped and the cache can be discarded.
3521   */
3522   if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
3523     return pPager->errCode;
3524   }
3525 
3526   if( pPager->state==PAGER_UNLOCK || isErrorReset ){
3527     sqlite3_vfs * const pVfs = pPager->pVfs;
3528     int isHotJournal = 0;
3529     assert( !MEMDB );
3530     assert( sqlite3PcacheRefCount(pPager->pPCache)==0 );
3531     if( !pPager->noReadlock ){
3532       rc = pager_wait_on_lock(pPager, SHARED_LOCK);
3533       if( rc!=SQLITE_OK ){
3534         assert( pPager->state==PAGER_UNLOCK );
3535         return pager_error(pPager, rc);
3536       }
3537     }else if( pPager->state==PAGER_UNLOCK ){
3538       pPager->state = PAGER_SHARED;
3539     }
3540     assert( pPager->state>=SHARED_LOCK );
3541 
3542     /* If a journal file exists, and there is no RESERVED lock on the
3543     ** database file, then it either needs to be played back or deleted.
3544     */
3545     if( !isErrorReset ){
3546       rc = hasHotJournal(pPager, &isHotJournal);
3547       if( rc!=SQLITE_OK ){
3548         goto failed;
3549       }
3550     }
3551     if( isErrorReset || isHotJournal ){
3552       /* Get an EXCLUSIVE lock on the database file. At this point it is
3553       ** important that a RESERVED lock is not obtained on the way to the
3554       ** EXCLUSIVE lock. If it were, another process might open the
3555       ** database file, detect the RESERVED lock, and conclude that the
3556       ** database is safe to read while this process is still rolling the
3557       ** hot-journal back.
3558       **
3559       ** Because the intermediate RESERVED lock is not requested, any
3560       ** other process attempting to access the database file will get to
3561       ** this point in the code and fail to obtain its own EXCLUSIVE lock
3562       ** on the database file.
3563       */
3564       if( pPager->state<EXCLUSIVE_LOCK ){
3565         rc = sqlite3OsLock(pPager->fd, EXCLUSIVE_LOCK);
3566         if( rc!=SQLITE_OK ){
3567           rc = pager_error(pPager, rc);
3568           goto failed;
3569         }
3570         pPager->state = PAGER_EXCLUSIVE;
3571       }
3572 
3573       /* Open the journal for read/write access. This is because in
3574       ** exclusive-access mode the file descriptor will be kept open and
3575       ** possibly used for a transaction later on. On some systems, the
3576       ** OsTruncate() call used in exclusive-access mode also requires
3577       ** a read/write file handle.
3578       */
3579       if( !isOpen(pPager->jfd) ){
3580         int res;
3581         rc = sqlite3OsAccess(pVfs,pPager->zJournal,SQLITE_ACCESS_EXISTS,&res);
3582         if( rc==SQLITE_OK ){
3583           if( res ){
3584             int fout = 0;
3585             int f = SQLITE_OPEN_READWRITE|SQLITE_OPEN_MAIN_JOURNAL;
3586             assert( !pPager->tempFile );
3587             rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, f, &fout);
3588             assert( rc!=SQLITE_OK || isOpen(pPager->jfd) );
3589             if( rc==SQLITE_OK && fout&SQLITE_OPEN_READONLY ){
3590               rc = SQLITE_CANTOPEN;
3591               sqlite3OsClose(pPager->jfd);
3592             }
3593           }else{
3594             /* If the journal does not exist, that means some other process
3595             ** has already rolled it back */
3596             rc = SQLITE_BUSY;
3597           }
3598         }
3599       }
3600       if( rc!=SQLITE_OK ){
3601         goto failed;
3602       }
3603 
3604       /* TODO: Why are these cleared here? Is it necessary? */
3605       pPager->journalStarted = 0;
3606       pPager->journalOff = 0;
3607       pPager->setMaster = 0;
3608       pPager->journalHdr = 0;
3609 
3610       /* Playback and delete the journal.  Drop the database write
3611       ** lock and reacquire the read lock. Purge the cache before
3612       ** playing back the hot-journal so that we don't end up with
3613       ** an inconsistent cache.
3614       */
3615       rc = pager_playback(pPager, 1);
3616       if( rc!=SQLITE_OK ){
3617         rc = pager_error(pPager, rc);
3618         goto failed;
3619       }
3620       assert( (pPager->state==PAGER_SHARED)
3621            || (pPager->exclusiveMode && pPager->state>PAGER_SHARED)
3622       );
3623     }
3624 
3625     if( sqlite3PcachePagecount(pPager->pPCache)>0 ){
3626       /* The shared-lock has just been acquired on the database file
3627       ** and there are already pages in the cache (from a previous
3628       ** read or write transaction).  Check to see if the database
3629       ** has been modified.  If the database has changed, flush the
3630       ** cache.
3631       **
3632       ** Database changes are detected by looking at 15 bytes beginning
3633       ** at offset 24 into the file.  The first 4 of these 16 bytes are
3634       ** a 32-bit counter that is incremented with each change.  The
3635       ** other bytes change randomly with each file change when
3636       ** a codec is in use.
3637       **
3638       ** There is a vanishingly small chance that a change will not be
3639       ** detected.  The chance of an undetected change is so small that
3640       ** it can be neglected.
3641       */
3642       char dbFileVers[sizeof(pPager->dbFileVers)];
3643       sqlite3PagerPagecount(pPager, 0);
3644 
3645       if( pPager->errCode ){
3646         rc = pPager->errCode;
3647         goto failed;
3648       }
3649 
3650       assert( pPager->dbSizeValid );
3651       if( pPager->dbSize>0 ){
3652         IOTRACE(("CKVERS %p %d\n", pPager, sizeof(dbFileVers)));
3653         rc = sqlite3OsRead(pPager->fd, &dbFileVers, sizeof(dbFileVers), 24);
3654         if( rc!=SQLITE_OK ){
3655           goto failed;
3656         }
3657       }else{
3658         memset(dbFileVers, 0, sizeof(dbFileVers));
3659       }
3660 
3661       if( memcmp(pPager->dbFileVers, dbFileVers, sizeof(dbFileVers))!=0 ){
3662         pager_reset(pPager);
3663       }
3664     }
3665     assert( pPager->exclusiveMode || pPager->state==PAGER_SHARED );
3666   }
3667 
3668  failed:
3669   if( rc!=SQLITE_OK ){
3670     /* pager_unlock() is a no-op for exclusive mode and in-memory databases. */
3671     pager_unlock(pPager);
3672   }
3673   return rc;
3674 }
3675 
3676 /*
3677 ** If the reference count has reached zero, rollback any active
3678 ** transaction and unlock the pager.
3679 **
3680 ** Except, in locking_mode=EXCLUSIVE when there is nothing to in
3681 ** the rollback journal, the unlock is not performed and there is
3682 ** nothing to rollback, so this routine is a no-op.
3683 */
3684 static void pagerUnlockIfUnused(Pager *pPager){
3685   if( (sqlite3PcacheRefCount(pPager->pPCache)==0)
3686    && (!pPager->exclusiveMode || pPager->journalOff>0)
3687   ){
3688     pagerUnlockAndRollback(pPager);
3689   }
3690 }
3691 
3692 /*
3693 ** Drop a page from the cache using sqlite3PcacheDrop().
3694 **
3695 ** If this means there are now no pages with references to them, a rollback
3696 ** occurs and the lock on the database is removed.
3697 */
3698 static void pagerDropPage(DbPage *pPg){
3699   Pager *pPager = pPg->pPager;
3700   sqlite3PcacheDrop(pPg);
3701   pagerUnlockIfUnused(pPager);
3702 }
3703 
3704 /*
3705 ** Acquire a reference to page number pgno in pager pPager (a page
3706 ** reference has type DbPage*). If the requested reference is
3707 ** successfully obtained, it is copied to *ppPage and SQLITE_OK returned.
3708 **
3709 ** This function calls pagerSharedLock() to obtain a SHARED lock on
3710 ** the database file if such a lock or greater is not already held.
3711 ** This may cause hot-journal rollback or a cache purge. See comments
3712 ** above function pagerSharedLock() for details.
3713 **
3714 ** If the requested page is already in the cache, it is returned.
3715 ** Otherwise, a new page object is allocated and populated with data
3716 ** read from the database file. In some cases, the pcache module may
3717 ** choose not to allocate a new page object and may reuse an existing
3718 ** object with no outstanding references.
3719 **
3720 ** The extra data appended to a page is always initialized to zeros the
3721 ** first time a page is loaded into memory. If the page requested is
3722 ** already in the cache when this function is called, then the extra
3723 ** data is left as it was when the page object was last used.
3724 **
3725 ** If the database image is smaller than the requested page or if a
3726 ** non-zero value is passed as the noContent parameter and the
3727 ** requested page is not already stored in the cache, then no
3728 ** actual disk read occurs. In this case the memory image of the
3729 ** page is initialized to all zeros.
3730 **
3731 ** If noContent is true, it means that we do not care about the contents
3732 ** of the page. This occurs in two seperate scenarios:
3733 **
3734 **   a) When reading a free-list leaf page from the database, and
3735 **
3736 **   b) When a savepoint is being rolled back and we need to load
3737 **      a new page into the cache to populate with the data read
3738 **      from the savepoint journal.
3739 **
3740 ** If noContent is true, then the data returned is zeroed instead of
3741 ** being read from the database. Additionally, the bits corresponding
3742 ** to pgno in Pager.pInJournal (bitvec of pages already written to the
3743 ** journal file) and the PagerSavepoint.pInSavepoint bitvecs of any open
3744 ** savepoints are set. This means if the page is made writable at any
3745 ** point in the future, using a call to sqlite3PagerWrite(), its contents
3746 ** will not be journaled. This saves IO.
3747 **
3748 ** The acquisition might fail for several reasons.  In all cases,
3749 ** an appropriate error code is returned and *ppPage is set to NULL.
3750 **
3751 ** See also sqlite3PagerLookup().  Both this routine and Lookup() attempt
3752 ** to find a page in the in-memory cache first.  If the page is not already
3753 ** in memory, this routine goes to disk to read it in whereas Lookup()
3754 ** just returns 0.  This routine acquires a read-lock the first time it
3755 ** has to go to disk, and could also playback an old journal if necessary.
3756 ** Since Lookup() never goes to disk, it never has to deal with locks
3757 ** or journal files.
3758 */
3759 int sqlite3PagerAcquire(
3760   Pager *pPager,      /* The pager open on the database file */
3761   Pgno pgno,          /* Page number to fetch */
3762   DbPage **ppPage,    /* Write a pointer to the page here */
3763   int noContent       /* Do not bother reading content from disk if true */
3764 ){
3765   PgHdr *pPg = 0;
3766   int rc;
3767 
3768   assert( assert_pager_state(pPager) );
3769   assert( pPager->state==PAGER_UNLOCK
3770        || sqlite3PcacheRefCount(pPager->pPCache)>0
3771        || pgno==1
3772   );
3773 
3774   /* The maximum page number is 2^31. Return SQLITE_CORRUPT if a page
3775   ** number greater than this, or zero, is requested.
3776   */
3777   if( pgno>PAGER_MAX_PGNO || pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){
3778     return SQLITE_CORRUPT_BKPT;
3779   }
3780 
3781   /* Make sure we have not hit any critical errors.
3782   */
3783   assert( pPager!=0 );
3784   *ppPage = 0;
3785 
3786   /* If this is the first page accessed, then get a SHARED lock
3787   ** on the database file. pagerSharedLock() is a no-op if
3788   ** a database lock is already held.
3789   */
3790   rc = pagerSharedLock(pPager);
3791   if( rc!=SQLITE_OK ){
3792     return rc;
3793   }
3794   assert( pPager->state!=PAGER_UNLOCK );
3795 
3796   rc = sqlite3PcacheFetch(pPager->pPCache, pgno, 1, &pPg);
3797   if( rc!=SQLITE_OK ){
3798     return rc;
3799   }
3800   assert( pPg->pgno==pgno );
3801   assert( pPg->pPager==pPager || pPg->pPager==0 );
3802   if( pPg->pPager==0 ){
3803     /* The pager cache has created a new page. Its content needs to
3804     ** be initialized.
3805     */
3806     int nMax;
3807     PAGER_INCR(pPager->nMiss);
3808     pPg->pPager = pPager;
3809 
3810     rc = sqlite3PagerPagecount(pPager, &nMax);
3811     if( rc!=SQLITE_OK ){
3812       sqlite3PagerUnref(pPg);
3813       return rc;
3814     }
3815 
3816     if( nMax<(int)pgno || MEMDB || noContent ){
3817       if( pgno>pPager->mxPgno ){
3818         sqlite3PagerUnref(pPg);
3819         return SQLITE_FULL;
3820       }
3821       if( noContent ){
3822         /* Failure to set the bits in the InJournal bit-vectors is benign.
3823         ** It merely means that we might do some extra work to journal a
3824         ** page that does not need to be journaled.  Nevertheless, be sure
3825         ** to test the case where a malloc error occurs while trying to set
3826         ** a bit in a bit vector.
3827         */
3828         sqlite3BeginBenignMalloc();
3829         if( pgno<=pPager->dbOrigSize ){
3830           TESTONLY( rc = ) sqlite3BitvecSet(pPager->pInJournal, pgno);
3831           testcase( rc==SQLITE_NOMEM );
3832         }
3833         TESTONLY( rc = ) addToSavepointBitvecs(pPager, pgno);
3834         testcase( rc==SQLITE_NOMEM );
3835         sqlite3EndBenignMalloc();
3836       }else{
3837         memset(pPg->pData, 0, pPager->pageSize);
3838       }
3839       IOTRACE(("ZERO %p %d\n", pPager, pgno));
3840     }else{
3841       assert( pPg->pPager==pPager );
3842       rc = readDbPage(pPg);
3843       if( rc!=SQLITE_OK ){
3844         pagerDropPage(pPg);
3845         return rc;
3846       }
3847     }
3848 #ifdef SQLITE_CHECK_PAGES
3849     pPg->pageHash = pager_pagehash(pPg);
3850 #endif
3851   }else{
3852     /* The requested page is in the page cache. */
3853     PAGER_INCR(pPager->nHit);
3854   }
3855 
3856   *ppPage = pPg;
3857   return SQLITE_OK;
3858 }
3859 
3860 /*
3861 ** Acquire a page if it is already in the in-memory cache.  Do
3862 ** not read the page from disk.  Return a pointer to the page,
3863 ** or 0 if the page is not in cache. Also, return 0 if the
3864 ** pager is in PAGER_UNLOCK state when this function is called,
3865 ** or if the pager is in an error state other than SQLITE_FULL.
3866 **
3867 ** See also sqlite3PagerGet().  The difference between this routine
3868 ** and sqlite3PagerGet() is that _get() will go to the disk and read
3869 ** in the page if the page is not already in cache.  This routine
3870 ** returns NULL if the page is not in cache or if a disk I/O error
3871 ** has ever happened.
3872 */
3873 DbPage *sqlite3PagerLookup(Pager *pPager, Pgno pgno){
3874   PgHdr *pPg = 0;
3875   assert( pPager!=0 );
3876   assert( pgno!=0 );
3877 
3878   if( (pPager->state!=PAGER_UNLOCK)
3879    && (pPager->errCode==SQLITE_OK || pPager->errCode==SQLITE_FULL)
3880   ){
3881     sqlite3PcacheFetch(pPager->pPCache, pgno, 0, &pPg);
3882   }
3883 
3884   return pPg;
3885 }
3886 
3887 /*
3888 ** Release a page reference.
3889 **
3890 ** If the number of references to the page drop to zero, then the
3891 ** page is added to the LRU list.  When all references to all pages
3892 ** are released, a rollback occurs and the lock on the database is
3893 ** removed.
3894 */
3895 void sqlite3PagerUnref(DbPage *pPg){
3896   if( pPg ){
3897     Pager *pPager = pPg->pPager;
3898     sqlite3PcacheRelease(pPg);
3899     pagerUnlockIfUnused(pPager);
3900   }
3901 }
3902 
3903 /*
3904 ** If the main journal file has already been opened, ensure that the
3905 ** sub-journal file is open too. If the main journal is not open,
3906 ** this function is a no-op.
3907 **
3908 ** SQLITE_OK is returned if everything goes according to plan.
3909 ** An SQLITE_IOERR_XXX error code is returned if a call to
3910 ** sqlite3OsOpen() fails.
3911 */
3912 static int openSubJournal(Pager *pPager){
3913   int rc = SQLITE_OK;
3914   if( isOpen(pPager->jfd) && !isOpen(pPager->sjfd) ){
3915     if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY || pPager->subjInMemory ){
3916       sqlite3MemJournalOpen(pPager->sjfd);
3917     }else{
3918       rc = pagerOpentemp(pPager, pPager->sjfd, SQLITE_OPEN_SUBJOURNAL);
3919     }
3920   }
3921   return rc;
3922 }
3923 
3924 /*
3925 ** This function is called at the start of every write transaction.
3926 ** There must already be a RESERVED or EXCLUSIVE lock on the database
3927 ** file when this routine is called.
3928 **
3929 ** Open the journal file for pager pPager and write a journal header
3930 ** to the start of it. If there are active savepoints, open the sub-journal
3931 ** as well. This function is only used when the journal file is being
3932 ** opened to write a rollback log for a transaction. It is not used
3933 ** when opening a hot journal file to roll it back.
3934 **
3935 ** If the journal file is already open (as it may be in exclusive mode),
3936 ** then this function just writes a journal header to the start of the
3937 ** already open file.
3938 **
3939 ** Whether or not the journal file is opened by this function, the
3940 ** Pager.pInJournal bitvec structure is allocated.
3941 **
3942 ** Return SQLITE_OK if everything is successful. Otherwise, return
3943 ** SQLITE_NOMEM if the attempt to allocate Pager.pInJournal fails, or
3944 ** an IO error code if opening or writing the journal file fails.
3945 */
3946 static int pager_open_journal(Pager *pPager){
3947   int rc = SQLITE_OK;                        /* Return code */
3948   sqlite3_vfs * const pVfs = pPager->pVfs;   /* Local cache of vfs pointer */
3949 
3950   assert( pPager->state>=PAGER_RESERVED );
3951   assert( pPager->useJournal );
3952   assert( pPager->pInJournal==0 );
3953 
3954   /* If already in the error state, this function is a no-op. */
3955   if( pPager->errCode ){
3956     return pPager->errCode;
3957   }
3958 
3959   /* TODO: Is it really possible to get here with dbSizeValid==0? If not,
3960   ** the call to PagerPagecount() can be removed.
3961   */
3962   testcase( pPager->dbSizeValid==0 );
3963   sqlite3PagerPagecount(pPager, 0);
3964 
3965   pPager->pInJournal = sqlite3BitvecCreate(pPager->dbSize);
3966   if( pPager->pInJournal==0 ){
3967     return SQLITE_NOMEM;
3968   }
3969 
3970   /* Open the journal file if it is not already open. */
3971   if( !isOpen(pPager->jfd) ){
3972     if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY ){
3973       sqlite3MemJournalOpen(pPager->jfd);
3974     }else{
3975       const int flags =                   /* VFS flags to open journal file */
3976         SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|
3977         (pPager->tempFile ?
3978           (SQLITE_OPEN_DELETEONCLOSE|SQLITE_OPEN_TEMP_JOURNAL):
3979           (SQLITE_OPEN_MAIN_JOURNAL)
3980         );
3981 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
3982       rc = sqlite3JournalOpen(
3983           pVfs, pPager->zJournal, pPager->jfd, flags, jrnlBufferSize(pPager)
3984       );
3985 #else
3986       rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, flags, 0);
3987 #endif
3988     }
3989     assert( rc!=SQLITE_OK || isOpen(pPager->jfd) );
3990   }
3991 
3992 
3993   /* Write the first journal header to the journal file and open
3994   ** the sub-journal if necessary.
3995   */
3996   if( rc==SQLITE_OK ){
3997     /* TODO: Check if all of these are really required. */
3998     pPager->dbOrigSize = pPager->dbSize;
3999     pPager->journalStarted = 0;
4000     pPager->needSync = 0;
4001     pPager->nRec = 0;
4002     pPager->journalOff = 0;
4003     pPager->setMaster = 0;
4004     pPager->journalHdr = 0;
4005     rc = writeJournalHdr(pPager);
4006   }
4007   if( rc==SQLITE_OK && pPager->nSavepoint ){
4008     rc = openSubJournal(pPager);
4009   }
4010 
4011   if( rc!=SQLITE_OK ){
4012     sqlite3BitvecDestroy(pPager->pInJournal);
4013     pPager->pInJournal = 0;
4014   }
4015   return rc;
4016 }
4017 
4018 /*
4019 ** Begin a write-transaction on the specified pager object. If a
4020 ** write-transaction has already been opened, this function is a no-op.
4021 **
4022 ** If the exFlag argument is false, then acquire at least a RESERVED
4023 ** lock on the database file. If exFlag is true, then acquire at least
4024 ** an EXCLUSIVE lock. If such a lock is already held, no locking
4025 ** functions need be called.
4026 **
4027 ** If this is not a temporary or in-memory file and, the journal file is
4028 ** opened if it has not been already. For a temporary file, the opening
4029 ** of the journal file is deferred until there is an actual need to
4030 ** write to the journal. TODO: Why handle temporary files differently?
4031 **
4032 ** If the journal file is opened (or if it is already open), then a
4033 ** journal-header is written to the start of it.
4034 **
4035 ** If the subjInMemory argument is non-zero, then any sub-journal opened
4036 ** within this transaction will be opened as an in-memory file. This
4037 ** has no effect if the sub-journal is already opened (as it may be when
4038 ** running in exclusive mode) or if the transaction does not require a
4039 ** sub-journal. If the subjInMemory argument is zero, then any required
4040 ** sub-journal is implemented in-memory if pPager is an in-memory database,
4041 ** or using a temporary file otherwise.
4042 */
4043 int sqlite3PagerBegin(Pager *pPager, int exFlag, int subjInMemory){
4044   int rc = SQLITE_OK;
4045   assert( pPager->state!=PAGER_UNLOCK );
4046   pPager->subjInMemory = subjInMemory;
4047   if( pPager->state==PAGER_SHARED ){
4048     assert( pPager->pInJournal==0 );
4049     assert( !MEMDB && !pPager->tempFile );
4050 
4051     /* Obtain a RESERVED lock on the database file. If the exFlag parameter
4052     ** is true, then immediately upgrade this to an EXCLUSIVE lock. The
4053     ** busy-handler callback can be used when upgrading to the EXCLUSIVE
4054     ** lock, but not when obtaining the RESERVED lock.
4055     */
4056     rc = sqlite3OsLock(pPager->fd, RESERVED_LOCK);
4057     if( rc==SQLITE_OK ){
4058       pPager->state = PAGER_RESERVED;
4059       if( exFlag ){
4060         rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
4061       }
4062     }
4063 
4064     /* If the required locks were successfully obtained, open the journal
4065     ** file and write the first journal-header to it.
4066     */
4067     if( rc==SQLITE_OK && pPager->useJournal
4068      && pPager->journalMode!=PAGER_JOURNALMODE_OFF
4069     ){
4070       rc = pager_open_journal(pPager);
4071     }
4072   }else if( isOpen(pPager->jfd) && pPager->journalOff==0 ){
4073     /* This happens when the pager was in exclusive-access mode the last
4074     ** time a (read or write) transaction was successfully concluded
4075     ** by this connection. Instead of deleting the journal file it was
4076     ** kept open and either was truncated to 0 bytes or its header was
4077     ** overwritten with zeros.
4078     */
4079     assert( pPager->nRec==0 );
4080     assert( pPager->dbOrigSize==0 );
4081     assert( pPager->pInJournal==0 );
4082     rc = pager_open_journal(pPager);
4083   }
4084 
4085   PAGERTRACE(("TRANSACTION %d\n", PAGERID(pPager)));
4086   assert( !isOpen(pPager->jfd) || pPager->journalOff>0 || rc!=SQLITE_OK );
4087   return rc;
4088 }
4089 
4090 /*
4091 ** Mark a single data page as writeable. The page is written into the
4092 ** main journal or sub-journal as required. If the page is written into
4093 ** one of the journals, the corresponding bit is set in the
4094 ** Pager.pInJournal bitvec and the PagerSavepoint.pInSavepoint bitvecs
4095 ** of any open savepoints as appropriate.
4096 */
4097 static int pager_write(PgHdr *pPg){
4098   void *pData = pPg->pData;
4099   Pager *pPager = pPg->pPager;
4100   int rc = SQLITE_OK;
4101 
4102   /* Check for errors
4103   */
4104   if( pPager->errCode ){
4105     return pPager->errCode;
4106   }
4107   if( pPager->readOnly ){
4108     return SQLITE_PERM;
4109   }
4110 
4111   assert( !pPager->setMaster );
4112 
4113   CHECK_PAGE(pPg);
4114 
4115   /* Mark the page as dirty.  If the page has already been written
4116   ** to the journal then we can return right away.
4117   */
4118   sqlite3PcacheMakeDirty(pPg);
4119   if( pageInJournal(pPg) && !subjRequiresPage(pPg) ){
4120     pPager->dbModified = 1;
4121   }else{
4122 
4123     /* If we get this far, it means that the page needs to be
4124     ** written to the transaction journal or the ckeckpoint journal
4125     ** or both.
4126     **
4127     ** First check to see that the transaction journal exists and
4128     ** create it if it does not.
4129     */
4130     assert( pPager->state!=PAGER_UNLOCK );
4131     rc = sqlite3PagerBegin(pPager, 0, pPager->subjInMemory);
4132     if( rc!=SQLITE_OK ){
4133       return rc;
4134     }
4135     assert( pPager->state>=PAGER_RESERVED );
4136     if( !isOpen(pPager->jfd) && pPager->useJournal
4137           && pPager->journalMode!=PAGER_JOURNALMODE_OFF ){
4138       rc = pager_open_journal(pPager);
4139       if( rc!=SQLITE_OK ) return rc;
4140     }
4141     pPager->dbModified = 1;
4142 
4143     /* The transaction journal now exists and we have a RESERVED or an
4144     ** EXCLUSIVE lock on the main database file.  Write the current page to
4145     ** the transaction journal if it is not there already.
4146     */
4147     if( !pageInJournal(pPg) && isOpen(pPager->jfd) ){
4148       if( pPg->pgno<=pPager->dbOrigSize ){
4149         u32 cksum;
4150         char *pData2;
4151 
4152         /* We should never write to the journal file the page that
4153         ** contains the database locks.  The following assert verifies
4154         ** that we do not. */
4155         assert( pPg->pgno!=PAGER_MJ_PGNO(pPager) );
4156         pData2 = CODEC2(pPager, pData, pPg->pgno, 7);
4157         cksum = pager_cksum(pPager, (u8*)pData2);
4158         rc = write32bits(pPager->jfd, pPager->journalOff, pPg->pgno);
4159         if( rc==SQLITE_OK ){
4160           rc = sqlite3OsWrite(pPager->jfd, pData2, pPager->pageSize,
4161                               pPager->journalOff + 4);
4162           pPager->journalOff += pPager->pageSize+4;
4163         }
4164         if( rc==SQLITE_OK ){
4165           rc = write32bits(pPager->jfd, pPager->journalOff, cksum);
4166           pPager->journalOff += 4;
4167         }
4168         IOTRACE(("JOUT %p %d %lld %d\n", pPager, pPg->pgno,
4169                  pPager->journalOff, pPager->pageSize));
4170         PAGER_INCR(sqlite3_pager_writej_count);
4171         PAGERTRACE(("JOURNAL %d page %d needSync=%d hash(%08x)\n",
4172              PAGERID(pPager), pPg->pgno,
4173              ((pPg->flags&PGHDR_NEED_SYNC)?1:0), pager_pagehash(pPg)));
4174 
4175         /* Even if an IO or diskfull error occurred while journalling the
4176         ** page in the block above, set the need-sync flag for the page.
4177         ** Otherwise, when the transaction is rolled back, the logic in
4178         ** playback_one_page() will think that the page needs to be restored
4179         ** in the database file. And if an IO error occurs while doing so,
4180         ** then corruption may follow.
4181         */
4182         if( !pPager->noSync ){
4183           pPg->flags |= PGHDR_NEED_SYNC;
4184           pPager->needSync = 1;
4185         }
4186 
4187         /* An error has occurred writing to the journal file. The
4188         ** transaction will be rolled back by the layer above.
4189         */
4190         if( rc!=SQLITE_OK ){
4191           return rc;
4192         }
4193 
4194         pPager->nRec++;
4195         assert( pPager->pInJournal!=0 );
4196         rc = sqlite3BitvecSet(pPager->pInJournal, pPg->pgno);
4197         testcase( rc==SQLITE_NOMEM );
4198         assert( rc==SQLITE_OK || rc==SQLITE_NOMEM );
4199         rc |= addToSavepointBitvecs(pPager, pPg->pgno);
4200         if( rc!=SQLITE_OK ){
4201           assert( rc==SQLITE_NOMEM );
4202           return rc;
4203         }
4204       }else{
4205         if( !pPager->journalStarted && !pPager->noSync ){
4206           pPg->flags |= PGHDR_NEED_SYNC;
4207           pPager->needSync = 1;
4208         }
4209         PAGERTRACE(("APPEND %d page %d needSync=%d\n",
4210                 PAGERID(pPager), pPg->pgno,
4211                ((pPg->flags&PGHDR_NEED_SYNC)?1:0)));
4212       }
4213     }
4214 
4215     /* If the statement journal is open and the page is not in it,
4216     ** then write the current page to the statement journal.  Note that
4217     ** the statement journal format differs from the standard journal format
4218     ** in that it omits the checksums and the header.
4219     */
4220     if( subjRequiresPage(pPg) ){
4221       rc = subjournalPage(pPg);
4222     }
4223   }
4224 
4225   /* Update the database size and return.
4226   */
4227   assert( pPager->state>=PAGER_SHARED );
4228   if( pPager->dbSize<pPg->pgno ){
4229     pPager->dbSize = pPg->pgno;
4230   }
4231   return rc;
4232 }
4233 
4234 /*
4235 ** Mark a data page as writeable. This routine must be called before
4236 ** making changes to a page. The caller must check the return value
4237 ** of this function and be careful not to change any page data unless
4238 ** this routine returns SQLITE_OK.
4239 **
4240 ** The difference between this function and pager_write() is that this
4241 ** function also deals with the special case where 2 or more pages
4242 ** fit on a single disk sector. In this case all co-resident pages
4243 ** must have been written to the journal file before returning.
4244 **
4245 ** If an error occurs, SQLITE_NOMEM or an IO error code is returned
4246 ** as appropriate. Otherwise, SQLITE_OK.
4247 */
4248 int sqlite3PagerWrite(DbPage *pDbPage){
4249   int rc = SQLITE_OK;
4250 
4251   PgHdr *pPg = pDbPage;
4252   Pager *pPager = pPg->pPager;
4253   Pgno nPagePerSector = (pPager->sectorSize/pPager->pageSize);
4254 
4255   if( nPagePerSector>1 ){
4256     Pgno nPageCount;          /* Total number of pages in database file */
4257     Pgno pg1;                 /* First page of the sector pPg is located on. */
4258     int nPage;                /* Number of pages starting at pg1 to journal */
4259     int ii;                   /* Loop counter */
4260     int needSync = 0;         /* True if any page has PGHDR_NEED_SYNC */
4261 
4262     /* Set the doNotSync flag to 1. This is because we cannot allow a journal
4263     ** header to be written between the pages journaled by this function.
4264     */
4265     assert( !MEMDB );
4266     assert( pPager->doNotSync==0 );
4267     pPager->doNotSync = 1;
4268 
4269     /* This trick assumes that both the page-size and sector-size are
4270     ** an integer power of 2. It sets variable pg1 to the identifier
4271     ** of the first page of the sector pPg is located on.
4272     */
4273     pg1 = ((pPg->pgno-1) & ~(nPagePerSector-1)) + 1;
4274 
4275     sqlite3PagerPagecount(pPager, (int *)&nPageCount);
4276     if( pPg->pgno>nPageCount ){
4277       nPage = (pPg->pgno - pg1)+1;
4278     }else if( (pg1+nPagePerSector-1)>nPageCount ){
4279       nPage = nPageCount+1-pg1;
4280     }else{
4281       nPage = nPagePerSector;
4282     }
4283     assert(nPage>0);
4284     assert(pg1<=pPg->pgno);
4285     assert((pg1+nPage)>pPg->pgno);
4286 
4287     for(ii=0; ii<nPage && rc==SQLITE_OK; ii++){
4288       Pgno pg = pg1+ii;
4289       PgHdr *pPage;
4290       if( pg==pPg->pgno || !sqlite3BitvecTest(pPager->pInJournal, pg) ){
4291         if( pg!=PAGER_MJ_PGNO(pPager) ){
4292           rc = sqlite3PagerGet(pPager, pg, &pPage);
4293           if( rc==SQLITE_OK ){
4294             rc = pager_write(pPage);
4295             if( pPage->flags&PGHDR_NEED_SYNC ){
4296               needSync = 1;
4297               assert(pPager->needSync);
4298             }
4299             sqlite3PagerUnref(pPage);
4300           }
4301         }
4302       }else if( (pPage = pager_lookup(pPager, pg))!=0 ){
4303         if( pPage->flags&PGHDR_NEED_SYNC ){
4304           needSync = 1;
4305         }
4306         sqlite3PagerUnref(pPage);
4307       }
4308     }
4309 
4310     /* If the PGHDR_NEED_SYNC flag is set for any of the nPage pages
4311     ** starting at pg1, then it needs to be set for all of them. Because
4312     ** writing to any of these nPage pages may damage the others, the
4313     ** journal file must contain sync()ed copies of all of them
4314     ** before any of them can be written out to the database file.
4315     */
4316     if( needSync ){
4317       assert( !MEMDB && pPager->noSync==0 );
4318       for(ii=0; ii<nPage && needSync; ii++){
4319         PgHdr *pPage = pager_lookup(pPager, pg1+ii);
4320         if( pPage ){
4321           pPage->flags |= PGHDR_NEED_SYNC;
4322           sqlite3PagerUnref(pPage);
4323         }
4324       }
4325       assert(pPager->needSync);
4326     }
4327 
4328     assert( pPager->doNotSync==1 );
4329     pPager->doNotSync = 0;
4330   }else{
4331     rc = pager_write(pDbPage);
4332   }
4333   return rc;
4334 }
4335 
4336 /*
4337 ** Return TRUE if the page given in the argument was previously passed
4338 ** to sqlite3PagerWrite().  In other words, return TRUE if it is ok
4339 ** to change the content of the page.
4340 */
4341 #ifndef NDEBUG
4342 int sqlite3PagerIswriteable(DbPage *pPg){
4343   return pPg->flags&PGHDR_DIRTY;
4344 }
4345 #endif
4346 
4347 /*
4348 ** A call to this routine tells the pager that it is not necessary to
4349 ** write the information on page pPg back to the disk, even though
4350 ** that page might be marked as dirty.  This happens, for example, when
4351 ** the page has been added as a leaf of the freelist and so its
4352 ** content no longer matters.
4353 **
4354 ** The overlying software layer calls this routine when all of the data
4355 ** on the given page is unused. The pager marks the page as clean so
4356 ** that it does not get written to disk.
4357 **
4358 ** Tests show that this optimization can quadruple the speed of large
4359 ** DELETE operations.
4360 */
4361 void sqlite3PagerDontWrite(PgHdr *pPg){
4362   Pager *pPager = pPg->pPager;
4363   if( (pPg->flags&PGHDR_DIRTY) && pPager->nSavepoint==0 ){
4364     PAGERTRACE(("DONT_WRITE page %d of %d\n", pPg->pgno, PAGERID(pPager)));
4365     IOTRACE(("CLEAN %p %d\n", pPager, pPg->pgno))
4366     pPg->flags |= PGHDR_DONT_WRITE;
4367 #ifdef SQLITE_CHECK_PAGES
4368     pPg->pageHash = pager_pagehash(pPg);
4369 #endif
4370   }
4371 }
4372 
4373 /*
4374 ** This routine is called to increment the value of the database file
4375 ** change-counter, stored as a 4-byte big-endian integer starting at
4376 ** byte offset 24 of the pager file.
4377 **
4378 ** If the isDirect flag is zero, then this is done by calling
4379 ** sqlite3PagerWrite() on page 1, then modifying the contents of the
4380 ** page data. In this case the file will be updated when the current
4381 ** transaction is committed.
4382 **
4383 ** The isDirect flag may only be non-zero if the library was compiled
4384 ** with the SQLITE_ENABLE_ATOMIC_WRITE macro defined. In this case,
4385 ** if isDirect is non-zero, then the database file is updated directly
4386 ** by writing an updated version of page 1 using a call to the
4387 ** sqlite3OsWrite() function.
4388 */
4389 static int pager_incr_changecounter(Pager *pPager, int isDirectMode){
4390   int rc = SQLITE_OK;
4391 
4392   /* Declare and initialize constant integer 'isDirect'. If the
4393   ** atomic-write optimization is enabled in this build, then isDirect
4394   ** is initialized to the value passed as the isDirectMode parameter
4395   ** to this function. Otherwise, it is always set to zero.
4396   **
4397   ** The idea is that if the atomic-write optimization is not
4398   ** enabled at compile time, the compiler can omit the tests of
4399   ** 'isDirect' below, as well as the block enclosed in the
4400   ** "if( isDirect )" condition.
4401   */
4402 #ifndef SQLITE_ENABLE_ATOMIC_WRITE
4403   const int isDirect = 0;
4404   assert( isDirectMode==0 );
4405   UNUSED_PARAMETER(isDirectMode);
4406 #else
4407   const int isDirect = isDirectMode;
4408 #endif
4409 
4410   assert( pPager->state>=PAGER_RESERVED );
4411   if( !pPager->changeCountDone && pPager->dbSize>0 ){
4412     PgHdr *pPgHdr;                /* Reference to page 1 */
4413     u32 change_counter;           /* Initial value of change-counter field */
4414 
4415     assert( !pPager->tempFile && isOpen(pPager->fd) );
4416 
4417     /* Open page 1 of the file for writing. */
4418     rc = sqlite3PagerGet(pPager, 1, &pPgHdr);
4419     assert( pPgHdr==0 || rc==SQLITE_OK );
4420 
4421     /* If page one was fetched successfully, and this function is not
4422     ** operating in direct-mode, make page 1 writable.
4423     */
4424     if( rc==SQLITE_OK && !isDirect ){
4425       rc = sqlite3PagerWrite(pPgHdr);
4426     }
4427 
4428     if( rc==SQLITE_OK ){
4429       /* Increment the value just read and write it back to byte 24. */
4430       change_counter = sqlite3Get4byte((u8*)pPager->dbFileVers);
4431       change_counter++;
4432       put32bits(((char*)pPgHdr->pData)+24, change_counter);
4433 
4434       /* If running in direct mode, write the contents of page 1 to the file. */
4435       if( isDirect ){
4436         const void *zBuf = pPgHdr->pData;
4437         assert( pPager->dbFileSize>0 );
4438         rc = sqlite3OsWrite(pPager->fd, zBuf, pPager->pageSize, 0);
4439       }
4440 
4441       /* If everything worked, set the changeCountDone flag. */
4442       if( rc==SQLITE_OK ){
4443         pPager->changeCountDone = 1;
4444       }
4445     }
4446 
4447     /* Release the page reference. */
4448     sqlite3PagerUnref(pPgHdr);
4449   }
4450   return rc;
4451 }
4452 
4453 /*
4454 ** Sync the pager file to disk. This is a no-op for in-memory files
4455 ** or pages with the Pager.noSync flag set.
4456 **
4457 ** If successful, or called on a pager for which it is a no-op, this
4458 ** function returns SQLITE_OK. Otherwise, an IO error code is returned.
4459 */
4460 int sqlite3PagerSync(Pager *pPager){
4461   int rc;                              /* Return code */
4462   if( MEMDB || pPager->noSync ){
4463     rc = SQLITE_OK;
4464   }else{
4465     rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
4466   }
4467   return rc;
4468 }
4469 
4470 /*
4471 ** Sync the database file for the pager pPager. zMaster points to the name
4472 ** of a master journal file that should be written into the individual
4473 ** journal file. zMaster may be NULL, which is interpreted as no master
4474 ** journal (a single database transaction).
4475 **
4476 ** This routine ensures that:
4477 **
4478 **   * The database file change-counter is updated,
4479 **   * the journal is synced (unless the atomic-write optimization is used),
4480 **   * all dirty pages are written to the database file,
4481 **   * the database file is truncated (if required), and
4482 **   * the database file synced.
4483 **
4484 ** The only thing that remains to commit the transaction is to finalize
4485 ** (delete, truncate or zero the first part of) the journal file (or
4486 ** delete the master journal file if specified).
4487 **
4488 ** Note that if zMaster==NULL, this does not overwrite a previous value
4489 ** passed to an sqlite3PagerCommitPhaseOne() call.
4490 **
4491 ** If the final parameter - noSync - is true, then the database file itself
4492 ** is not synced. The caller must call sqlite3PagerSync() directly to
4493 ** sync the database file before calling CommitPhaseTwo() to delete the
4494 ** journal file in this case.
4495 */
4496 int sqlite3PagerCommitPhaseOne(
4497   Pager *pPager,                  /* Pager object */
4498   const char *zMaster,            /* If not NULL, the master journal name */
4499   int noSync                      /* True to omit the xSync on the db file */
4500 ){
4501   int rc = SQLITE_OK;             /* Return code */
4502 
4503   if( pPager->errCode ){
4504     return pPager->errCode;
4505   }
4506 
4507   PAGERTRACE(("DATABASE SYNC: File=%s zMaster=%s nSize=%d\n",
4508       pPager->zFilename, zMaster, pPager->dbSize));
4509 
4510   /* If this is an in-memory db, or no pages have been written to, or this
4511   ** function has already been called, it is a no-op.
4512   */
4513   if( MEMDB && pPager->dbModified ){
4514     sqlite3BackupRestart(pPager->pBackup);
4515   }else if( pPager->state!=PAGER_SYNCED && pPager->dbModified ){
4516 
4517     /* The following block updates the change-counter. Exactly how it
4518     ** does this depends on whether or not the atomic-update optimization
4519     ** was enabled at compile time, and if this transaction meets the
4520     ** runtime criteria to use the operation:
4521     **
4522     **    * The file-system supports the atomic-write property for
4523     **      blocks of size page-size, and
4524     **    * This commit is not part of a multi-file transaction, and
4525     **    * Exactly one page has been modified and store in the journal file.
4526     **
4527     ** If the optimization was not enabled at compile time, then the
4528     ** pager_incr_changecounter() function is called to update the change
4529     ** counter in 'indirect-mode'. If the optimization is compiled in but
4530     ** is not applicable to this transaction, call sqlite3JournalCreate()
4531     ** to make sure the journal file has actually been created, then call
4532     ** pager_incr_changecounter() to update the change-counter in indirect
4533     ** mode.
4534     **
4535     ** Otherwise, if the optimization is both enabled and applicable,
4536     ** then call pager_incr_changecounter() to update the change-counter
4537     ** in 'direct' mode. In this case the journal file will never be
4538     ** created for this transaction.
4539     */
4540 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
4541     PgHdr *pPg;
4542     assert( isOpen(pPager->jfd) || pPager->journalMode==PAGER_JOURNALMODE_OFF );
4543     if( !zMaster && isOpen(pPager->jfd)
4544      && pPager->journalOff==jrnlBufferSize(pPager)
4545      && pPager->dbSize>=pPager->dbFileSize
4546      && (0==(pPg = sqlite3PcacheDirtyList(pPager->pPCache)) || 0==pPg->pDirty)
4547     ){
4548       /* Update the db file change counter via the direct-write method. The
4549       ** following call will modify the in-memory representation of page 1
4550       ** to include the updated change counter and then write page 1
4551       ** directly to the database file. Because of the atomic-write
4552       ** property of the host file-system, this is safe.
4553       */
4554       rc = pager_incr_changecounter(pPager, 1);
4555     }else{
4556       rc = sqlite3JournalCreate(pPager->jfd);
4557       if( rc==SQLITE_OK ){
4558         rc = pager_incr_changecounter(pPager, 0);
4559       }
4560     }
4561 #else
4562     rc = pager_incr_changecounter(pPager, 0);
4563 #endif
4564     if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
4565 
4566     /* If this transaction has made the database smaller, then all pages
4567     ** being discarded by the truncation must be written to the journal
4568     ** file. This can only happen in auto-vacuum mode.
4569     **
4570     ** Before reading the pages with page numbers larger than the
4571     ** current value of Pager.dbSize, set dbSize back to the value
4572     ** that it took at the start of the transaction. Otherwise, the
4573     ** calls to sqlite3PagerGet() return zeroed pages instead of
4574     ** reading data from the database file.
4575     */
4576 #ifndef SQLITE_OMIT_AUTOVACUUM
4577     if( pPager->dbSize<pPager->dbOrigSize
4578      && pPager->journalMode!=PAGER_JOURNALMODE_OFF
4579     ){
4580       Pgno i;                                   /* Iterator variable */
4581       const Pgno iSkip = PAGER_MJ_PGNO(pPager); /* Pending lock page */
4582       const Pgno dbSize = pPager->dbSize;       /* Database image size */
4583       pPager->dbSize = pPager->dbOrigSize;
4584       for( i=dbSize+1; i<=pPager->dbOrigSize; i++ ){
4585         if( !sqlite3BitvecTest(pPager->pInJournal, i) && i!=iSkip ){
4586           PgHdr *pPage;             /* Page to journal */
4587           rc = sqlite3PagerGet(pPager, i, &pPage);
4588           if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
4589           rc = sqlite3PagerWrite(pPage);
4590           sqlite3PagerUnref(pPage);
4591           if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
4592         }
4593       }
4594       pPager->dbSize = dbSize;
4595     }
4596 #endif
4597 
4598     /* Write the master journal name into the journal file. If a master
4599     ** journal file name has already been written to the journal file,
4600     ** or if zMaster is NULL (no master journal), then this call is a no-op.
4601     */
4602     rc = writeMasterJournal(pPager, zMaster);
4603     if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
4604 
4605     /* Sync the journal file. If the atomic-update optimization is being
4606     ** used, this call will not create the journal file or perform any
4607     ** real IO.
4608     */
4609     rc = syncJournal(pPager);
4610     if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
4611 
4612     /* Write all dirty pages to the database file. */
4613     rc = pager_write_pagelist(sqlite3PcacheDirtyList(pPager->pPCache));
4614     if( rc!=SQLITE_OK ){
4615       assert( rc!=SQLITE_IOERR_BLOCKED );
4616       goto commit_phase_one_exit;
4617     }
4618     sqlite3PcacheCleanAll(pPager->pPCache);
4619 
4620     /* If the file on disk is not the same size as the database image,
4621     ** then use pager_truncate to grow or shrink the file here.
4622     */
4623     if( pPager->dbSize!=pPager->dbFileSize ){
4624       Pgno nNew = pPager->dbSize - (pPager->dbSize==PAGER_MJ_PGNO(pPager));
4625       assert( pPager->state>=PAGER_EXCLUSIVE );
4626       rc = pager_truncate(pPager, nNew);
4627       if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
4628     }
4629 
4630     /* Finally, sync the database file. */
4631     if( !pPager->noSync && !noSync ){
4632       rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
4633     }
4634     IOTRACE(("DBSYNC %p\n", pPager))
4635 
4636     pPager->state = PAGER_SYNCED;
4637   }
4638 
4639 commit_phase_one_exit:
4640   if( rc==SQLITE_IOERR_BLOCKED ){
4641     /* pager_incr_changecounter() may attempt to obtain an exclusive
4642     ** lock to spill the cache and return IOERR_BLOCKED. But since
4643     ** there is no chance the cache is inconsistent, it is
4644     ** better to return SQLITE_BUSY.
4645     **/
4646     rc = SQLITE_BUSY;
4647   }
4648   return rc;
4649 }
4650 
4651 
4652 /*
4653 ** When this function is called, the database file has been completely
4654 ** updated to reflect the changes made by the current transaction and
4655 ** synced to disk. The journal file still exists in the file-system
4656 ** though, and if a failure occurs at this point it will eventually
4657 ** be used as a hot-journal and the current transaction rolled back.
4658 **
4659 ** This function finalizes the journal file, either by deleting,
4660 ** truncating or partially zeroing it, so that it cannot be used
4661 ** for hot-journal rollback. Once this is done the transaction is
4662 ** irrevocably committed.
4663 **
4664 ** If an error occurs, an IO error code is returned and the pager
4665 ** moves into the error state. Otherwise, SQLITE_OK is returned.
4666 */
4667 int sqlite3PagerCommitPhaseTwo(Pager *pPager){
4668   int rc = SQLITE_OK;                  /* Return code */
4669 
4670   /* Do not proceed if the pager is already in the error state. */
4671   if( pPager->errCode ){
4672     return pPager->errCode;
4673   }
4674 
4675   /* This function should not be called if the pager is not in at least
4676   ** PAGER_RESERVED state. And indeed SQLite never does this. But it is
4677   ** nice to have this defensive block here anyway.
4678   */
4679   if( NEVER(pPager->state<PAGER_RESERVED) ){
4680     return SQLITE_ERROR;
4681   }
4682 
4683   /* An optimization. If the database was not actually modified during
4684   ** this transaction, the pager is running in exclusive-mode and is
4685   ** using persistent journals, then this function is a no-op.
4686   **
4687   ** The start of the journal file currently contains a single journal
4688   ** header with the nRec field set to 0. If such a journal is used as
4689   ** a hot-journal during hot-journal rollback, 0 changes will be made
4690   ** to the database file. So there is no need to zero the journal
4691   ** header. Since the pager is in exclusive mode, there is no need
4692   ** to drop any locks either.
4693   */
4694   if( pPager->dbModified==0 && pPager->exclusiveMode
4695    && pPager->journalMode==PAGER_JOURNALMODE_PERSIST
4696   ){
4697     assert( pPager->journalOff==JOURNAL_HDR_SZ(pPager) );
4698     return SQLITE_OK;
4699   }
4700 
4701   PAGERTRACE(("COMMIT %d\n", PAGERID(pPager)));
4702   assert( pPager->state==PAGER_SYNCED || MEMDB || !pPager->dbModified );
4703   rc = pager_end_transaction(pPager, pPager->setMaster);
4704   return pager_error(pPager, rc);
4705 }
4706 
4707 /*
4708 ** Rollback all changes. The database falls back to PAGER_SHARED mode.
4709 **
4710 ** This function performs two tasks:
4711 **
4712 **   1) It rolls back the journal file, restoring all database file and
4713 **      in-memory cache pages to the state they were in when the transaction
4714 **      was opened, and
4715 **   2) It finalizes the journal file, so that it is not used for hot
4716 **      rollback at any point in the future.
4717 **
4718 ** subject to the following qualifications:
4719 **
4720 ** * If the journal file is not yet open when this function is called,
4721 **   then only (2) is performed. In this case there is no journal file
4722 **   to roll back.
4723 **
4724 ** * If in an error state other than SQLITE_FULL, then task (1) is
4725 **   performed. If successful, task (2). Regardless of the outcome
4726 **   of either, the error state error code is returned to the caller
4727 **   (i.e. either SQLITE_IOERR or SQLITE_CORRUPT).
4728 **
4729 ** * If the pager is in PAGER_RESERVED state, then attempt (1). Whether
4730 **   or not (1) is succussful, also attempt (2). If successful, return
4731 **   SQLITE_OK. Otherwise, enter the error state and return the first
4732 **   error code encountered.
4733 **
4734 **   In this case there is no chance that the database was written to.
4735 **   So is safe to finalize the journal file even if the playback
4736 **   (operation 1) failed. However the pager must enter the error state
4737 **   as the contents of the in-memory cache are now suspect.
4738 **
4739 ** * Finally, if in PAGER_EXCLUSIVE state, then attempt (1). Only
4740 **   attempt (2) if (1) is successful. Return SQLITE_OK if successful,
4741 **   otherwise enter the error state and return the error code from the
4742 **   failing operation.
4743 **
4744 **   In this case the database file may have been written to. So if the
4745 **   playback operation did not succeed it would not be safe to finalize
4746 **   the journal file. It needs to be left in the file-system so that
4747 **   some other process can use it to restore the database state (by
4748 **   hot-journal rollback).
4749 */
4750 int sqlite3PagerRollback(Pager *pPager){
4751   int rc = SQLITE_OK;                  /* Return code */
4752   PAGERTRACE(("ROLLBACK %d\n", PAGERID(pPager)));
4753   if( !pPager->dbModified || !isOpen(pPager->jfd) ){
4754     rc = pager_end_transaction(pPager, pPager->setMaster);
4755   }else if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
4756     if( pPager->state>=PAGER_EXCLUSIVE ){
4757       pager_playback(pPager, 0);
4758     }
4759     rc = pPager->errCode;
4760   }else{
4761     if( pPager->state==PAGER_RESERVED ){
4762       int rc2;
4763       rc = pager_playback(pPager, 0);
4764       rc2 = pager_end_transaction(pPager, pPager->setMaster);
4765       if( rc==SQLITE_OK ){
4766         rc = rc2;
4767       }
4768     }else{
4769       rc = pager_playback(pPager, 0);
4770     }
4771 
4772     if( !MEMDB ){
4773       pPager->dbSizeValid = 0;
4774     }
4775 
4776     /* If an error occurs during a ROLLBACK, we can no longer trust the pager
4777     ** cache. So call pager_error() on the way out to make any error
4778     ** persistent.
4779     */
4780     rc = pager_error(pPager, rc);
4781   }
4782   return rc;
4783 }
4784 
4785 /*
4786 ** Return TRUE if the database file is opened read-only.  Return FALSE
4787 ** if the database is (in theory) writable.
4788 */
4789 u8 sqlite3PagerIsreadonly(Pager *pPager){
4790   return pPager->readOnly;
4791 }
4792 
4793 /*
4794 ** Return the number of references to the pager.
4795 */
4796 int sqlite3PagerRefcount(Pager *pPager){
4797   return sqlite3PcacheRefCount(pPager->pPCache);
4798 }
4799 
4800 /*
4801 ** Return the number of references to the specified page.
4802 */
4803 int sqlite3PagerPageRefcount(DbPage *pPage){
4804   return sqlite3PcachePageRefcount(pPage);
4805 }
4806 
#ifdef SQLITE_TEST
/*
** This routine is used for testing and analysis only.
**
** It fills a static array of 11 integers with counters and settings of
** the pager and returns a pointer to that array. Because the array is
** static, each call overwrites the values of the previous one.
**
** Slot 3 holds the database size, or -1 if Pager.dbSizeValid is not
** set. Slot 8 is always 0 (it used to hold pPager->nOvfl).
*/
int *sqlite3PagerStats(Pager *pPager){
  static int aStat[11];

  aStat[0] = sqlite3PcacheRefCount(pPager->pPCache);
  aStat[1] = sqlite3PcachePagecount(pPager->pPCache);
  aStat[2] = sqlite3PcacheGetCachesize(pPager->pPCache);
  if( pPager->dbSizeValid ){
    aStat[3] = (int) pPager->dbSize;
  }else{
    aStat[3] = -1;
  }
  aStat[4] = pPager->state;
  aStat[5] = pPager->errCode;
  aStat[6] = pPager->nHit;
  aStat[7] = pPager->nMiss;
  aStat[8] = 0;  /* Used to be pPager->nOvfl */
  aStat[9] = pPager->nRead;
  aStat[10] = pPager->nWrite;

  return aStat;
}
#endif
4827 
4828 /*
4829 ** Return true if this is an in-memory pager.
4830 */
4831 int sqlite3PagerIsMemdb(Pager *pPager){
4832   return MEMDB;
4833 }
4834 
4835 /*
4836 ** Check that there are at least nSavepoint savepoints open. If there are
4837 ** currently less than nSavepoints open, then open one or more savepoints
4838 ** to make up the difference. If the number of savepoints is already
4839 ** equal to nSavepoint, then this function is a no-op.
4840 **
4841 ** If a memory allocation fails, SQLITE_NOMEM is returned. If an error
4842 ** occurs while opening the sub-journal file, then an IO error code is
4843 ** returned. Otherwise, SQLITE_OK.
4844 */
4845 int sqlite3PagerOpenSavepoint(Pager *pPager, int nSavepoint){
4846   int rc = SQLITE_OK;                       /* Return code */
4847   int nCurrent = pPager->nSavepoint;        /* Current number of savepoints */
4848 
4849   if( nSavepoint>nCurrent && pPager->useJournal ){
4850     int ii;                                 /* Iterator variable */
4851     PagerSavepoint *aNew;                   /* New Pager.aSavepoint array */
4852 
4853     /* Either there is no active journal or the sub-journal is open or
4854     ** the journal is always stored in memory */
4855     assert( pPager->nSavepoint==0 || isOpen(pPager->sjfd) ||
4856             pPager->journalMode==PAGER_JOURNALMODE_MEMORY );
4857 
4858     /* Grow the Pager.aSavepoint array using realloc(). Return SQLITE_NOMEM
4859     ** if the allocation fails. Otherwise, zero the new portion in case a
4860     ** malloc failure occurs while populating it in the for(...) loop below.
4861     */
4862     aNew = (PagerSavepoint *)sqlite3Realloc(
4863         pPager->aSavepoint, sizeof(PagerSavepoint)*nSavepoint
4864     );
4865     if( !aNew ){
4866       return SQLITE_NOMEM;
4867     }
4868     memset(&aNew[nCurrent], 0, (nSavepoint-nCurrent) * sizeof(PagerSavepoint));
4869     pPager->aSavepoint = aNew;
4870     pPager->nSavepoint = nSavepoint;
4871 
4872     /* Populate the PagerSavepoint structures just allocated. */
4873     for(ii=nCurrent; ii<nSavepoint; ii++){
4874       assert( pPager->dbSizeValid );
4875       aNew[ii].nOrig = pPager->dbSize;
4876       if( isOpen(pPager->jfd) && pPager->journalOff>0 ){
4877         aNew[ii].iOffset = pPager->journalOff;
4878       }else{
4879         aNew[ii].iOffset = JOURNAL_HDR_SZ(pPager);
4880       }
4881       aNew[ii].iSubRec = pPager->nSubRec;
4882       aNew[ii].pInSavepoint = sqlite3BitvecCreate(pPager->dbSize);
4883       if( !aNew[ii].pInSavepoint ){
4884         return SQLITE_NOMEM;
4885       }
4886     }
4887 
4888     /* Open the sub-journal, if it is not already opened. */
4889     rc = openSubJournal(pPager);
4890   }
4891 
4892   return rc;
4893 }
4894 
4895 /*
4896 ** This function is called to rollback or release (commit) a savepoint.
4897 ** The savepoint to release or rollback need not be the most recently
4898 ** created savepoint.
4899 **
4900 ** Parameter op is always either SAVEPOINT_ROLLBACK or SAVEPOINT_RELEASE.
4901 ** If it is SAVEPOINT_RELEASE, then release and destroy the savepoint with
4902 ** index iSavepoint. If it is SAVEPOINT_ROLLBACK, then rollback all changes
4903 ** that have occurred since the specified savepoint was created.
4904 **
4905 ** The savepoint to rollback or release is identified by parameter
4906 ** iSavepoint. A value of 0 means to operate on the outermost savepoint
4907 ** (the first created). A value of (Pager.nSavepoint-1) means operate
4908 ** on the most recently created savepoint. If iSavepoint is greater than
4909 ** (Pager.nSavepoint-1), then this function is a no-op.
4910 **
4911 ** If a negative value is passed to this function, then the current
4912 ** transaction is rolled back. This is different to calling
4913 ** sqlite3PagerRollback() because this function does not terminate
4914 ** the transaction or unlock the database, it just restores the
4915 ** contents of the database to its original state.
4916 **
4917 ** In any case, all savepoints with an index greater than iSavepoint
4918 ** are destroyed. If this is a release operation (op==SAVEPOINT_RELEASE),
4919 ** then savepoint iSavepoint is also destroyed.
4920 **
4921 ** This function may return SQLITE_NOMEM if a memory allocation fails,
4922 ** or an IO error code if an IO error occurs while rolling back a
4923 ** savepoint. If no errors occur, SQLITE_OK is returned.
4924 */
4925 int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint){
4926   int rc = SQLITE_OK;
4927 
4928   assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
4929   assert( iSavepoint>=0 || op==SAVEPOINT_ROLLBACK );
4930 
4931   if( iSavepoint<pPager->nSavepoint ){
4932     int ii;            /* Iterator variable */
4933     int nNew;          /* Number of remaining savepoints after this op. */
4934 
4935     /* Figure out how many savepoints will still be active after this
4936     ** operation. Store this value in nNew. Then free resources associated
4937     ** with any savepoints that are destroyed by this operation.
4938     */
4939     nNew = iSavepoint + (op==SAVEPOINT_ROLLBACK);
4940     for(ii=nNew; ii<pPager->nSavepoint; ii++){
4941       sqlite3BitvecDestroy(pPager->aSavepoint[ii].pInSavepoint);
4942     }
4943     pPager->nSavepoint = nNew;
4944 
4945     /* If this is a rollback operation, playback the specified savepoint.
4946     ** If this is a temp-file, it is possible that the journal file has
4947     ** not yet been opened. In this case there have been no changes to
4948     ** the database file, so the playback operation can be skipped.
4949     */
4950     if( op==SAVEPOINT_ROLLBACK && isOpen(pPager->jfd) ){
4951       PagerSavepoint *pSavepoint = (nNew==0)?0:&pPager->aSavepoint[nNew-1];
4952       rc = pagerPlaybackSavepoint(pPager, pSavepoint);
4953       assert(rc!=SQLITE_DONE);
4954     }
4955 
4956     /* If this is a release of the outermost savepoint, truncate
4957     ** the sub-journal to zero bytes in size. */
4958     if( nNew==0 && op==SAVEPOINT_RELEASE && isOpen(pPager->sjfd) ){
4959       assert( rc==SQLITE_OK );
4960       rc = sqlite3OsTruncate(pPager->sjfd, 0);
4961       pPager->nSubRec = 0;
4962     }
4963   }
4964   return rc;
4965 }
4966 
4967 /*
4968 ** Return the full pathname of the database file.
4969 */
4970 const char *sqlite3PagerFilename(Pager *pPager){
4971   return pPager->zFilename;
4972 }
4973 
4974 /*
4975 ** Return the VFS structure for the pager.
4976 */
4977 const sqlite3_vfs *sqlite3PagerVfs(Pager *pPager){
4978   return pPager->pVfs;
4979 }
4980 
4981 /*
4982 ** Return the file handle for the database file associated
4983 ** with the pager.  This might return NULL if the file has
4984 ** not yet been opened.
4985 */
4986 sqlite3_file *sqlite3PagerFile(Pager *pPager){
4987   return pPager->fd;
4988 }
4989 
4990 /*
4991 ** Return the full pathname of the journal file.
4992 */
4993 const char *sqlite3PagerJournalname(Pager *pPager){
4994   return pPager->zJournal;
4995 }
4996 
4997 /*
4998 ** Return true if fsync() calls are disabled for this pager.  Return FALSE
4999 ** if fsync()s are executed normally.
5000 */
5001 int sqlite3PagerNosync(Pager *pPager){
5002   return pPager->noSync;
5003 }
5004 
#ifdef SQLITE_HAS_CODEC
/*
** Set the codec for this pager: remember the codec callback xCodec and
** the opaque argument pCodecArg in the Pager object.
*/
void sqlite3PagerSetCodec(
  Pager *pPager,                            /* Pager to configure */
  void *(*xCodec)(void*,void*,Pgno,int),    /* Codec callback */
  void *pCodecArg                           /* Stored as Pager.pCodecArg */
){
  pPager->pCodecArg = pCodecArg;
  pPager->xCodec = xCodec;
}
#endif
5018 
#ifndef SQLITE_OMIT_AUTOVACUUM
/*
** Move the page pPg to location pgno in the file.
**
** There must be no references to the page previously located at
** pgno (which we call pPgOld) though that page is allowed to be
** in cache.  If the page previously located at pgno is not already
** in the rollback journal, it is not put there by this routine.
**
** References to the page pPg remain valid. Updating any
** meta-data associated with pPg (i.e. data stored in the nExtra bytes
** allocated along with the page) is the responsibility of the caller.
**
** A transaction must be active when this routine is called. It used to be
** required that a statement transaction was not active, but this restriction
** has been removed (CREATE INDEX needs to move a page when a statement
** transaction is active).
**
** If the fourth argument, isCommit, is non-zero, then this page is being
** moved as part of a database reorganization just before the transaction
** is being committed. In this case, it is guaranteed that the database page
** pPg refers to will not be written to again within this transaction.
**
** This function may return SQLITE_NOMEM or an IO error code if an error
** occurs. Otherwise, it returns SQLITE_OK.
*/
int sqlite3PagerMovepage(Pager *pPager, DbPage *pPg, Pgno pgno, int isCommit){
  PgHdr *pPgOld;               /* The page being overwritten. */
  Pgno needSyncPgno = 0;       /* Old value of pPg->pgno, if sync is required */
  int rc;                      /* Return code */
  Pgno origPgno;               /* The original page number */

  assert( pPg->nRef>0 );

  /* If the page being moved is dirty and has not been saved by the latest
  ** savepoint, then save the current contents of the page into the
  ** sub-journal now. This is required to handle the following scenario:
  **
  **   BEGIN;
  **     <journal page X, then modify it in memory>
  **     SAVEPOINT one;
  **       <Move page X to location Y>
  **     ROLLBACK TO one;
  **
  ** If page X were not written to the sub-journal here, it would not
  ** be possible to restore its contents when the "ROLLBACK TO one"
  ** statement is processed.
  **
  ** subjournalPage() may need to allocate space to store pPg->pgno into
  ** one or more savepoint bitvecs. This is the reason this function
  ** may return SQLITE_NOMEM.
  */
  if( pPg->flags&PGHDR_DIRTY
   && subjRequiresPage(pPg)
   && SQLITE_OK!=(rc = subjournalPage(pPg))
  ){
    return rc;
  }

  PAGERTRACE(("MOVE %d page %d (needSync=%d) moves to %d\n",
      PAGERID(pPager), pPg->pgno, (pPg->flags&PGHDR_NEED_SYNC)?1:0, pgno));
  IOTRACE(("MOVE %p %d %d\n", pPager, pPg->pgno, pgno))

  /* If the journal needs to be sync()ed before page pPg->pgno can
  ** be written to, store pPg->pgno in local variable needSyncPgno.
  **
  ** If the isCommit flag is set, there is no need to remember that
  ** the journal needs to be sync()ed before database page pPg->pgno
  ** can be written to. The caller has already promised not to write to it.
  */
  if( (pPg->flags&PGHDR_NEED_SYNC) && !isCommit ){
    needSyncPgno = pPg->pgno;
    assert( pageInJournal(pPg) || pPg->pgno>pPager->dbOrigSize );
    assert( pPg->flags&PGHDR_DIRTY );
    assert( pPager->needSync );
  }

  /* If the cache contains a page with page-number pgno, remove it
  ** from its hash chain. Also, if the PgHdr.needSync was set for
  ** page pgno before the 'move' operation, it needs to be retained
  ** for the page moved there.
  */
  pPg->flags &= ~PGHDR_NEED_SYNC;
  pPgOld = pager_lookup(pPager, pgno);
  assert( !pPgOld || pPgOld->nRef==1 );
  if( pPgOld ){
    pPg->flags |= (pPgOld->flags&PGHDR_NEED_SYNC);
    sqlite3PcacheDrop(pPgOld);
  }

  /* Remember the old page number (needed by the MEMDB block below), then
  ** re-key pPg to its new page number, mark it dirty and record that
  ** the database has been modified.
  */
  origPgno = pPg->pgno;
  sqlite3PcacheMove(pPg, pgno);
  sqlite3PcacheMakeDirty(pPg);
  pPager->dbModified = 1;

  if( needSyncPgno ){
    /* If needSyncPgno is non-zero, then the journal file needs to be
    ** sync()ed before any data is written to database file page needSyncPgno.
    ** Currently, no such page exists in the page-cache and the
    ** "is journaled" bitvec flag has been set. This needs to be remedied by
    ** loading the page into the pager-cache and setting the PgHdr.needSync
    ** flag.
    **
    ** If the attempt to load the page into the page-cache fails, (due
    ** to a malloc() or IO failure), clear the bit in the pInJournal[]
    ** array. Otherwise, if the page is loaded and written again in
    ** this transaction, it may be written to the database file before
    ** it is synced into the journal file. This way, it may end up in
    ** the journal file twice, but that is not a problem.
    **
    ** The sqlite3PagerGet() call may cause the journal to sync. So make
    ** sure the Pager.needSync flag is set too.
    */
    PgHdr *pPgHdr;
    assert( pPager->needSync );
    rc = sqlite3PagerGet(pPager, needSyncPgno, &pPgHdr);
    if( rc!=SQLITE_OK ){
      if( pPager->pInJournal && needSyncPgno<=pPager->dbOrigSize ){
        sqlite3BitvecClear(pPager->pInJournal, needSyncPgno);
      }
      return rc;
    }
    pPager->needSync = 1;
    assert( pPager->noSync==0 && !MEMDB );
    pPgHdr->flags |= PGHDR_NEED_SYNC;
    sqlite3PcacheMakeDirty(pPgHdr);
    sqlite3PagerUnref(pPgHdr);
  }

  /*
  ** For an in-memory database, make sure the original page continues
  ** to exist, in case the transaction needs to roll back.  We allocate
  ** the page now, instead of at rollback, because we can better deal
  ** with an out-of-memory error now.  Ticket #3761.
  */
  if( MEMDB ){
    DbPage *pNew;
    rc = sqlite3PagerAcquire(pPager, origPgno, &pNew, 1);
    if( rc!=SQLITE_OK ) return rc;
    sqlite3PagerUnref(pNew);
  }

  return SQLITE_OK;
}
#endif
5164 
5165 /*
5166 ** Return a pointer to the data for the specified page.
5167 */
5168 void *sqlite3PagerGetData(DbPage *pPg){
5169   assert( pPg->nRef>0 || pPg->pPager->memDb );
5170   return pPg->pData;
5171 }
5172 
5173 /*
5174 ** Return a pointer to the Pager.nExtra bytes of "extra" space
5175 ** allocated along with the specified page.
5176 */
5177 void *sqlite3PagerGetExtra(DbPage *pPg){
5178   Pager *pPager = pPg->pPager;
5179   return (pPager?pPg->pExtra:0);
5180 }
5181 
5182 /*
5183 ** Get/set the locking-mode for this pager. Parameter eMode must be one
5184 ** of PAGER_LOCKINGMODE_QUERY, PAGER_LOCKINGMODE_NORMAL or
5185 ** PAGER_LOCKINGMODE_EXCLUSIVE. If the parameter is not _QUERY, then
5186 ** the locking-mode is set to the value specified.
5187 **
5188 ** The returned value is either PAGER_LOCKINGMODE_NORMAL or
5189 ** PAGER_LOCKINGMODE_EXCLUSIVE, indicating the current (possibly updated)
5190 ** locking-mode.
5191 */
5192 int sqlite3PagerLockingMode(Pager *pPager, int eMode){
5193   assert( eMode==PAGER_LOCKINGMODE_QUERY
5194             || eMode==PAGER_LOCKINGMODE_NORMAL
5195             || eMode==PAGER_LOCKINGMODE_EXCLUSIVE );
5196   assert( PAGER_LOCKINGMODE_QUERY<0 );
5197   assert( PAGER_LOCKINGMODE_NORMAL>=0 && PAGER_LOCKINGMODE_EXCLUSIVE>=0 );
5198   if( eMode>=0 && !pPager->tempFile ){
5199     pPager->exclusiveMode = (u8)eMode;
5200   }
5201   return (int)pPager->exclusiveMode;
5202 }
5203 
5204 /*
5205 ** Get/set the journal-mode for this pager. Parameter eMode must be one of:
5206 **
5207 **    PAGER_JOURNALMODE_QUERY
5208 **    PAGER_JOURNALMODE_DELETE
5209 **    PAGER_JOURNALMODE_TRUNCATE
5210 **    PAGER_JOURNALMODE_PERSIST
5211 **    PAGER_JOURNALMODE_OFF
5212 **    PAGER_JOURNALMODE_MEMORY
5213 **
5214 ** If the parameter is not _QUERY, then the journal_mode is set to the
5215 ** value specified if the change is allowed.  The change is disallowed
5216 ** for the following reasons:
5217 **
5218 **   *  An in-memory database can only have its journal_mode set to _OFF
5219 **      or _MEMORY.
5220 **
5221 **   *  The journal mode may not be changed while a transaction is active.
5222 **
5223 ** The returned indicate the current (possibly updated) journal-mode.
5224 */
5225 int sqlite3PagerJournalMode(Pager *pPager, int eMode){
5226   assert( eMode==PAGER_JOURNALMODE_QUERY
5227             || eMode==PAGER_JOURNALMODE_DELETE
5228             || eMode==PAGER_JOURNALMODE_TRUNCATE
5229             || eMode==PAGER_JOURNALMODE_PERSIST
5230             || eMode==PAGER_JOURNALMODE_OFF
5231             || eMode==PAGER_JOURNALMODE_MEMORY );
5232   assert( PAGER_JOURNALMODE_QUERY<0 );
5233   if( eMode>=0
5234    && (!MEMDB || eMode==PAGER_JOURNALMODE_MEMORY
5235               || eMode==PAGER_JOURNALMODE_OFF)
5236    && !pPager->dbModified
5237    && (!isOpen(pPager->jfd) || 0==pPager->journalOff)
5238   ){
5239     if( isOpen(pPager->jfd) ){
5240       sqlite3OsClose(pPager->jfd);
5241     }
5242     pPager->journalMode = (u8)eMode;
5243   }
5244   return (int)pPager->journalMode;
5245 }
5246 
5247 /*
5248 ** Get/set the size-limit used for persistent journal files.
5249 **
5250 ** Setting the size limit to -1 means no limit is enforced.
5251 ** An attempt to set a limit smaller than -1 is a no-op.
5252 */
5253 i64 sqlite3PagerJournalSizeLimit(Pager *pPager, i64 iLimit){
5254   if( iLimit>=-1 ){
5255     pPager->journalSizeLimit = iLimit;
5256   }
5257   return pPager->journalSizeLimit;
5258 }
5259 
5260 /*
5261 ** Return a pointer to the pPager->pBackup variable. The backup module
5262 ** in backup.c maintains the content of this variable. This module
5263 ** uses it opaquely as an argument to sqlite3BackupRestart() and
5264 ** sqlite3BackupUpdate() only.
5265 */
5266 sqlite3_backup **sqlite3PagerBackupPtr(Pager *pPager){
5267   return &pPager->pBackup;
5268 }
5269 
5270 #endif /* SQLITE_OMIT_DISKIO */
5271