xref: /sqlite-3.40.0/src/pager.c (revision 104f1fef)
1ed7c855cSdrh /*
2b19a2bc6Sdrh ** 2001 September 15
3ed7c855cSdrh **
4b19a2bc6Sdrh ** The author disclaims copyright to this source code.  In place of
5b19a2bc6Sdrh ** a legal notice, here is a blessing:
6ed7c855cSdrh **
7b19a2bc6Sdrh **    May you do good and not evil.
8b19a2bc6Sdrh **    May you find forgiveness for yourself and forgive others.
9b19a2bc6Sdrh **    May you share freely, never taking more than you give.
10ed7c855cSdrh **
11ed7c855cSdrh *************************************************************************
12b19a2bc6Sdrh ** This is the implementation of the page cache subsystem or "pager".
13ed7c855cSdrh **
14b19a2bc6Sdrh ** The pager is used to access a database disk file.  It implements
15b19a2bc6Sdrh ** atomic commit and rollback through the use of a journal file that
16b19a2bc6Sdrh ** is separate from the database file.  The pager also implements file
17b19a2bc6Sdrh ** locking to prevent two processes from writing the same database
18b19a2bc6Sdrh ** file simultaneously, or one process from reading the database while
19b19a2bc6Sdrh ** another is writing.
20ed7c855cSdrh **
21*104f1fefSdanielk1977 ** @(#) $Id: pager.c,v 1.550 2009/01/14 17:45:58 danielk1977 Exp $
22ed7c855cSdrh */
232e66f0b9Sdrh #ifndef SQLITE_OMIT_DISKIO
24d9b0257aSdrh #include "sqliteInt.h"
25ed7c855cSdrh 
/*
** Debug tracing for the pager.  Tracing is compiled out by default.
** Change the "#if 0" below to "#if 1" to enable it; PAGERTRACE(X) then
** passes X to sqlite3DebugPrintf (mapped to printf) whenever the global
** sqlite3PagerTrace is non-zero.  X must be a parenthesized printf-style
** argument list, e.g. PAGERTRACE(("page %d\n", pgno)).
**
** When tracing is disabled PAGERTRACE(X) expands to nothing, so X is
** never evaluated.
*/
#if 0
int sqlite3PagerTrace=1;  /* True to enable tracing */
#define sqlite3DebugPrintf printf
#define PAGERTRACE(X)     if( sqlite3PagerTrace ){ sqlite3DebugPrintf X; }
#else
#define PAGERTRACE(X)
#endif
36db48ee02Sdrh 
/*
** The following two macros are used within PAGERTRACE() invocations
** to print out file-descriptors.
**
** PAGERID() takes a pointer to a Pager struct as its argument and
** yields the associated file-descriptor as an int.  FILEHANDLEID()
** takes an sqlite3_file pointer as its argument.
**
** The macro arguments are fully parenthesized so that expression
** arguments (for example "pPager+1" or "&aPager[0]") bind correctly.
*/
#define PAGERID(p) ((int)((p)->fd))
#define FILEHANDLEID(fd) ((int)(fd))
47db48ee02Sdrh 
/*
** The page cache as a whole is always in exactly one of these states:
**
**   PAGER_UNLOCK        The page cache is neither reading nor writing
**                       the database file and holds no data in memory.
**                       This is the initial state.
**
**   PAGER_SHARED        The page cache is reading the database.
**                       Writing is not permitted.  Multiple readers may
**                       access the same database file at the same time.
**
**   PAGER_RESERVED      This process has reserved the database for
**                       writing but has not yet made any changes.  Only
**                       one process at a time can reserve the database.
**                       The original database file is unmodified, so
**                       other processes may still read the on-disk
**                       database file.
**
**   PAGER_EXCLUSIVE     The page cache is writing the database.  Access
**                       is exclusive: no other process or thread can be
**                       reading or writing while one process is writing.
**
**   PAGER_SYNCED        Entered from PAGER_EXCLUSIVE after all dirty
**                       pages have been written to the database file
**                       and the file has been synced to disk.  All that
**                       remains is to remove or truncate the journal
**                       file, at which point the transaction commits.
**
** The page cache comes up in PAGER_UNLOCK.  The first sqlite3PagerGet()
** moves it to PAGER_SHARED.  Once every page has been released using
** sqlite3PagerUnref(), the state returns to PAGER_UNLOCK.  The first
** call to sqlite3PagerWrite() moves the state to PAGER_RESERVED.
** (sqlite3PagerWrite() can only be called on an outstanding page, so
** the pager is necessarily in PAGER_SHARED before it transitions to
** PAGER_RESERVED.)  PAGER_RESERVED means that there is an open rollback
** journal.  The transition to PAGER_EXCLUSIVE happens before any change
** is made to the database file, though writes to the rollback journal
** occur with just PAGER_RESERVED.  After sqlite3PagerRollback() or
** sqlite3PagerCommitPhaseTwo() the state can drop back to PAGER_SHARED,
** or remain at PAGER_EXCLUSIVE when in exclusive access mode.
*/
#define PAGER_UNLOCK      0   /* no lock held; initial state */
#define PAGER_SHARED      1   /* same as SHARED_LOCK */
#define PAGER_RESERVED    2   /* same as RESERVED_LOCK */
#define PAGER_EXCLUSIVE   4   /* same as EXCLUSIVE_LOCK */
#define PAGER_SYNCED      5   /* database synced; journal not yet finalized */
101ed7c855cSdrh 
/*
** Round X up to the next multiple of 8.  When X is an address, the
** result is therefore an address aligned to an 8-byte boundary.
** Values that are already a multiple of 8 are returned unchanged.
*/
#define FORCE_ALIGNMENT(X)   ( ((X) + 7) & ~7 )
107887dc4c2Sdrh 
/*
** Macros used for invoking the codec if there is one.
**
**   P   Pager whose xCodec/pCodecArg fields describe the codec
**   D   Pointer to the page data
**   N   Page number
**   X   Operation code handed through to xCodec()
**
** CODEC1() invokes the codec for its side effects only.  CODEC2()
** evaluates to a char* pointing at the codec's result, or at D itself
** when no codec is installed.  When SQLITE_HAS_CODEC is not defined,
** CODEC1() is a no-op and CODEC2() simply yields (char*)D.
**
** Every macro argument is parenthesized so that expression arguments
** (such as "pData+offset") are cast and dereferenced as a whole.
*/
#ifdef SQLITE_HAS_CODEC
# define CODEC1(P,D,N,X) if( (P)->xCodec!=0 ){ (P)->xCodec((P)->pCodecArg,(D),(N),(X)); }
# define CODEC2(P,D,N,X) ((char*)((P)->xCodec!=0?(P)->xCodec((P)->pCodecArg,(D),(N),(X)):(D)))
#else
# define CODEC1(P,D,N,X) /* NO-OP */
# define CODEC2(P,D,N,X) ((char*)(D))
#endif
1189eb9e26bSdrh 
/*
** The maximum allowed sector size: 1MB (0x0100000 == 1048576 bytes).
** If the xSectorSize() method returns a value larger than this, then
** MAX_SECTOR_SIZE is used instead.  This could conceivably cause
** corruption following a power failure on such a system.  This is
** currently an undocumented limit.
*/
#define MAX_SECTOR_SIZE 0x0100000
1267cbd589dSdanielk1977 
1277cbd589dSdanielk1977 /*
128fd7f0452Sdanielk1977 ** An instance of the following structure is allocated for each active
129fd7f0452Sdanielk1977 ** savepoint and statement transaction in the system. All such structures
130fd7f0452Sdanielk1977 ** are stored in the Pager.aSavepoint[] array, which is allocated and
131fd7f0452Sdanielk1977 ** resized using sqlite3Realloc().
132fd7f0452Sdanielk1977 **
133fd7f0452Sdanielk1977 ** When a savepoint is created, the PagerSavepoint.iHdrOffset field is
134fd7f0452Sdanielk1977 ** set to 0. If a journal-header is written into the main journal while
135fd7f0452Sdanielk1977 ** the savepoint is active, then iHdrOffset is set to the byte offset
136fd7f0452Sdanielk1977 ** immediately following the last journal record written into the main
137fd7f0452Sdanielk1977 ** journal before the journal-header. This is required during savepoint
138fd7f0452Sdanielk1977 ** rollback (see pagerPlaybackSavepoint()).
139fd7f0452Sdanielk1977 */
140fd7f0452Sdanielk1977 typedef struct PagerSavepoint PagerSavepoint;
141fd7f0452Sdanielk1977 struct PagerSavepoint {
142fd7f0452Sdanielk1977   i64 iOffset;                 /* Starting offset in main journal */
143fd7f0452Sdanielk1977   i64 iHdrOffset;              /* See above */
144fd7f0452Sdanielk1977   Bitvec *pInSavepoint;        /* Set of pages in this savepoint */
145fd7f0452Sdanielk1977   Pgno nOrig;                  /* Original number of pages in file */
146fd7f0452Sdanielk1977   Pgno iSubRec;                /* Index of first record in sub-journal */
147fd7f0452Sdanielk1977 };
148fd7f0452Sdanielk1977 
149fd7f0452Sdanielk1977 /*
150ed7c855cSdrh ** A open page cache is an instance of the following structure.
151efaaf579Sdanielk1977 **
1524f0ee686Sdrh ** Pager.errCode may be set to SQLITE_IOERR, SQLITE_CORRUPT, or
153efaaf579Sdanielk1977 ** or SQLITE_FULL. Once one of the first three errors occurs, it persists
154efaaf579Sdanielk1977 ** and is returned as the result of every major pager API call.  The
155efaaf579Sdanielk1977 ** SQLITE_FULL return code is slightly different. It persists only until the
156efaaf579Sdanielk1977 ** next successful rollback is performed on the pager cache. Also,
1573b8a05f6Sdanielk1977 ** SQLITE_FULL does not affect the sqlite3PagerGet() and sqlite3PagerLookup()
158efaaf579Sdanielk1977 ** APIs, they may still be used successfully.
1593460d19cSdanielk1977 **
1603460d19cSdanielk1977 ** Managing the size of the database file in pages is a little complicated.
1613460d19cSdanielk1977 ** The variable Pager.dbSize contains the number of pages that the database
1623460d19cSdanielk1977 ** image currently contains. As the database image grows or shrinks this
1633460d19cSdanielk1977 ** variable is updated. The variable Pager.dbFileSize contains the number
1643460d19cSdanielk1977 ** of pages in the database file. This may be different from Pager.dbSize
1653460d19cSdanielk1977 ** if some pages have been appended to the database image but not yet written
1663460d19cSdanielk1977 ** out from the cache to the actual file on disk. Or if the image has been
1673460d19cSdanielk1977 ** truncated by an incremental-vacuum operation. The Pager.dbOrigSize variable
1683460d19cSdanielk1977 ** contains the number of pages in the database image when the current
1693460d19cSdanielk1977 ** transaction was opened. The contents of all three of these variables is
1703460d19cSdanielk1977 ** only guaranteed to be correct if the boolean Pager.dbSizeValid is true.
171ed7c855cSdrh */
172ed7c855cSdrh struct Pager {
173b4b47411Sdanielk1977   sqlite3_vfs *pVfs;          /* OS functions to use for IO */
174603240cfSdrh   u8 journalOpen;             /* True if journal file descriptors is valid */
17534e79ceeSdrh   u8 journalStarted;          /* True if header of journal is synced */
17634e79ceeSdrh   u8 useJournal;              /* Use a rollback journal on this file */
1777bec505eSdrh   u8 noReadlock;              /* Do not bother to obtain readlocks */
178603240cfSdrh   u8 noSync;                  /* Do not sync the journal if true */
179968af52aSdrh   u8 fullSync;                /* Do extra syncs of the journal for robustness */
180f036aef0Sdanielk1977   u8 sync_flags;              /* One of SYNC_NORMAL or SYNC_FULL */
181a6abd041Sdrh   u8 state;                   /* PAGER_UNLOCK, _SHARED, _RESERVED, etc. */
182603240cfSdrh   u8 tempFile;                /* zFilename is a temporary file */
183603240cfSdrh   u8 readOnly;                /* True for a read-only database */
184603240cfSdrh   u8 needSync;                /* True if an fsync() is needed on the journal */
185a6abd041Sdrh   u8 dirtyCache;              /* True if cached pages have changed */
186ac69b05eSdrh   u8 memDb;                   /* True to inhibit all file I/O */
1876d156e46Sdrh   u8 setMaster;               /* True if a m-j name has been written to jrnl */
18880e35f46Sdrh   u8 doNotSync;               /* Boolean. While true, do not spill the cache */
18980e35f46Sdrh   u8 exclusiveMode;           /* Boolean. True if locking_mode==EXCLUSIVE */
190fdc40e91Sdrh   u8 journalMode;             /* On of the PAGER_JOURNALMODE_* values */
191d138c016Sdrh   u8 dbModified;              /* True if there are any changes to the Db */
19280e35f46Sdrh   u8 changeCountDone;         /* Set after incrementing the change-counter */
193d92db531Sdanielk1977   u8 dbSizeValid;             /* Set when dbSize is correct */
1943460d19cSdanielk1977   Pgno dbSize;                /* Number of pages in the database */
1953460d19cSdanielk1977   Pgno dbOrigSize;            /* dbSize before the current transaction */
1963460d19cSdanielk1977   Pgno dbFileSize;            /* Number of pages in the database file */
19733f4e02aSdrh   u32 vfsFlags;               /* Flags for sqlite3_vfs.xOpen() */
198e49f9827Sdrh   int errCode;                /* One of several kinds of errors */
199fcd35c7bSdrh   int nRec;                   /* Number of pages written to the journal */
200fcd35c7bSdrh   u32 cksumInit;              /* Quasi-random value added to every checksum */
201fcd35c7bSdrh   int stmtNRec;               /* Number of records in stmt subjournal */
202fcd35c7bSdrh   int nExtra;                 /* Add this many bytes to each in-memory page */
203fcd35c7bSdrh   int pageSize;               /* Number of bytes in a page */
204fcd35c7bSdrh   int nPage;                  /* Total number of in-memory pages */
205fcd35c7bSdrh   int mxPage;                 /* Maximum number of pages to hold in cache */
206f8e632b6Sdrh   Pgno mxPgno;                /* Maximum allowed size of the database */
207f5e7bb51Sdrh   Bitvec *pInJournal;         /* One bit for each page in the database file */
208a1fa00d9Sdanielk1977   Bitvec *pAlwaysRollback;    /* One bit for each page marked always-rollback */
209fcd35c7bSdrh   char *zFilename;            /* Name of the database file */
210fcd35c7bSdrh   char *zJournal;             /* Name of the journal file */
211fcd35c7bSdrh   char *zDirectory;           /* Directory hold database and journal files */
21262079060Sdanielk1977   sqlite3_file *fd, *jfd;     /* File descriptors for database and journal */
213fd7f0452Sdanielk1977   sqlite3_file *sjfd;         /* File descriptor for the sub-journal*/
2141ceedd37Sdanielk1977   int (*xBusyHandler)(void*); /* Function to call when busy */
2151ceedd37Sdanielk1977   void *pBusyHandlerArg;      /* Context argument for xBusyHandler */
216eb206256Sdrh   i64 journalOff;             /* Current byte offset in the journal file */
217eb206256Sdrh   i64 journalHdr;             /* Byte offset to previous journal header */
21898c58356Sdrh   u32 sectorSize;             /* Assumed sector size during rollback */
219fcd35c7bSdrh #ifdef SQLITE_TEST
2207c4ac0c5Sdrh   int nHit, nMiss;            /* Cache hits and missing */
2216d156e46Sdrh   int nRead, nWrite;          /* Database pages read/written */
222fcd35c7bSdrh #endif
223eaa06f69Sdanielk1977   void (*xReiniter)(DbPage*); /* Call this routine when reloading pages */
2247c4ac0c5Sdrh #ifdef SQLITE_HAS_CODEC
225c001c58aSdrh   void *(*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
2266d156e46Sdrh   void *pCodecArg;            /* First argument to xCodec() */
2277c4ac0c5Sdrh #endif
2288186df86Sdanielk1977   char *pTmpSpace;            /* Pager.pageSize bytes of space for tmp use */
22986a88114Sdrh   char dbFileVers[16];        /* Changes whenever database file changes */
230b53e4960Sdanielk1977   i64 journalSizeLimit;       /* Size limit for persistent journal files */
2318c0a791aSdanielk1977   PCache *pPCache;            /* Pointer to page cache object */
2329f0bbf9cSdrh   PagerSavepoint *aSavepoint; /* Array of active savepoints */
2339f0bbf9cSdrh   int nSavepoint;             /* Number of elements in aSavepoint[] */
234d9b0257aSdrh };
235d9b0257aSdrh 
/*
** Counters used for testing purposes only.  They exist only when
** SQLITE_TEST is defined and they are not thread-safe.
**
** PAGER_INCR(v) increments counter v in a testing build and expands to
** nothing otherwise.
*/
#ifdef SQLITE_TEST
int sqlite3_pager_readdb_count = 0;    /* Number of full pages read from DB */
int sqlite3_pager_writedb_count = 0;   /* Number of full pages written to DB */
int sqlite3_pager_writej_count = 0;    /* Number of pages written to journal */
# define PAGER_INCR(v)  v++
#else
# define PAGER_INCR(v)
#endif
249fcd35c7bSdrh 
250538f570cSdrh 
251538f570cSdrh 
/*
** Journal files begin with the following magic string.  The data
** was obtained from /dev/random.  It is used only as a sanity check.
**
** Since version 2.8.0, the journal format contains additional sanity
** checking information.  If the power fails while the journal is being
** written, semi-random garbage data might appear in the journal file
** after power is restored.  If an attempt is then made to roll the
** journal back, the database could be corrupted.  The additional sanity
** checking data is an attempt to discover the garbage in the journal
** and ignore it.
**
** The sanity checking information for the new journal format consists
** of a 32-bit checksum on each page of data.  The checksum covers both
** the page number and the pPager->pageSize bytes of data for the page.
** This cksum is initialized to a 32-bit random value that appears in
** the journal file right after the header.  The random initializer is
** important, because garbage data that appears at the end of a journal
** is likely data that was once in other files that have now been
** deleted.  If the garbage data came from an obsolete journal file, the
** checksums might be correct.  But by initializing the checksum to a
** random value which is different for every journal, we minimize that
** risk.
*/
static const unsigned char aJournalMagic[] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd7,
};
278ed7c855cSdrh 
/*
** The size in bytes of each page record in the journal: the page
** content (pPager->pageSize bytes) plus 8 additional bytes.  The
** argument is parenthesized so that any pointer-valued expression may
** be passed.
*/
#define JOURNAL_PG_SZ(pPager)  (((pPager)->pageSize) + 8)
284968af52aSdrh 
/*
** The journal header size for this pager. In the future, this could be
** set to some value read from the disk controller. The important
** characteristic is that it is the same size as a disk sector.  The
** argument is parenthesized so that any pointer-valued expression may
** be passed.
*/
#define JOURNAL_HDR_SZ(pPager) ((pPager)->sectorSize)
2917657240aSdanielk1977 
/*
** MEMDB is true when dealing with an in-memory database.  It is a macro
** so that, when SQLITE_OMIT_MEMORYDB is defined, its value is the
** constant 0 and the compiler can optimize out code that would never
** execute.
**
** In the normal build the macro reads pPager->memDb, so a variable
** named pPager must be in scope wherever MEMDB is used.
*/
#ifdef SQLITE_OMIT_MEMORYDB
# define MEMDB 0
#else
# define MEMDB pPager->memDb
#endif
303b7f9164eSdrh 
/*
** Page number PAGER_MJ_PGNO is never used in an SQLite database (it is
** reserved for working around a windows/posix incompatibility).  In the
** journal it signifies that the remainder of the journal file is
** devoted to storing a master journal name - there are no more pages
** to roll back.  See comments for function writeMasterJournal() for
** details.
**
** The value is the 1-based number of the page containing PENDING_BYTE
** for the page size of pager x.
*/
#define PAGER_MJ_PGNO(x) ((Pgno)((PENDING_BYTE/((x)->pageSize))+1))
31313adf8a0Sdanielk1977 
/*
** The largest legal page number: 2^31 - 1 (0x7fffffff).
*/
#define PAGER_MAX_PGNO 2147483647
31826836654Sdanielk1977 
31926836654Sdanielk1977 /*
3203460d19cSdanielk1977 ** Return true if it is necessary to write page *pPg into the sub-journal.
3213460d19cSdanielk1977 ** A page needs to be written into the sub-journal if there exists one
3223460d19cSdanielk1977 ** or more open savepoints for which:
323fd7f0452Sdanielk1977 **
3243460d19cSdanielk1977 **   * The page-number is less than or equal to PagerSavepoint.nOrig, and
3253460d19cSdanielk1977 **   * The bit corresponding to the page-number is not set in
3263460d19cSdanielk1977 **     PagerSavepoint.pInSavepoint.
327f35843b5Sdanielk1977 */
3283460d19cSdanielk1977 static int subjRequiresPage(PgHdr *pPg){
3293460d19cSdanielk1977   Pgno pgno = pPg->pgno;
330f35843b5Sdanielk1977   Pager *pPager = pPg->pPager;
3313460d19cSdanielk1977   int i;
3323460d19cSdanielk1977   for(i=0; i<pPager->nSavepoint; i++){
3333460d19cSdanielk1977     PagerSavepoint *p = &pPager->aSavepoint[i];
3343460d19cSdanielk1977     if( p->nOrig>=pgno && 0==sqlite3BitvecTest(p->pInSavepoint, pgno) ){
335fd7f0452Sdanielk1977       return 1;
336fd7f0452Sdanielk1977     }
3373460d19cSdanielk1977   }
3383460d19cSdanielk1977   return 0;
339f35843b5Sdanielk1977 }
3408ca0c724Sdrh 
3413460d19cSdanielk1977 /*
3423460d19cSdanielk1977 ** Return true if the page is already in the journal file.
3433460d19cSdanielk1977 */
344bc2ca9ebSdanielk1977 static int pageInJournal(PgHdr *pPg){
345bc2ca9ebSdanielk1977   return sqlite3BitvecTest(pPg->pPager->pInJournal, pPg->pgno);
346bc2ca9ebSdanielk1977 }
347bc2ca9ebSdanielk1977 
3488ca0c724Sdrh /*
34934e79ceeSdrh ** Read a 32-bit integer from the given file descriptor.  Store the integer
35034e79ceeSdrh ** that is read in *pRes.  Return SQLITE_OK if everything worked, or an
35134e79ceeSdrh ** error code is something goes wrong.
352726de599Sdrh **
353726de599Sdrh ** All values are stored on disk as big-endian.
35494f3331aSdrh */
35562079060Sdanielk1977 static int read32bits(sqlite3_file *fd, i64 offset, u32 *pRes){
35694f3331aSdrh   unsigned char ac[4];
35762079060Sdanielk1977   int rc = sqlite3OsRead(fd, ac, sizeof(ac), offset);
3583b59a5ccSdrh   if( rc==SQLITE_OK ){
359a3152895Sdrh     *pRes = sqlite3Get4byte(ac);
36094f3331aSdrh   }
36194f3331aSdrh   return rc;
36294f3331aSdrh }
36394f3331aSdrh 
/*
** Write the 32-bit integer B into the buffer at A in big-endian byte
** order.  Both arguments are parenthesized so that the (u8*) cast
** applies to the whole of A even when A is an expression such as
** "p+4" on a non-char pointer.
*/
#define put32bits(A,B)  sqlite3Put4byte((u8*)(A),(B))
36897b57484Sdrh 
36997b57484Sdrh /*
37034e79ceeSdrh ** Write a 32-bit integer into the given file descriptor.  Return SQLITE_OK
37134e79ceeSdrh ** on success or an error code is something goes wrong.
37294f3331aSdrh */
37362079060Sdanielk1977 static int write32bits(sqlite3_file *fd, i64 offset, u32 val){
374bab45c64Sdanielk1977   char ac[4];
37597b57484Sdrh   put32bits(ac, val);
37662079060Sdanielk1977   return sqlite3OsWrite(fd, ac, 4, offset);
37794f3331aSdrh }
37894f3331aSdrh 
3792554f8b0Sdrh /*
3807a2b1eebSdanielk1977 ** If file pFd is open, call sqlite3OsUnlock() on it.
3817a2b1eebSdanielk1977 */
3827a2b1eebSdanielk1977 static int osUnlock(sqlite3_file *pFd, int eLock){
3837a2b1eebSdanielk1977   if( !pFd->pMethods ){
3847a2b1eebSdanielk1977     return SQLITE_OK;
3857a2b1eebSdanielk1977   }
3867a2b1eebSdanielk1977   return sqlite3OsUnlock(pFd, eLock);
3877a2b1eebSdanielk1977 }
3887a2b1eebSdanielk1977 
/*
** Determine whether or not the atomic-write optimization can be used
** with this pager.  The optimization can be used if:
**
**  (a) the value returned by OsDeviceCharacteristics() indicates that
**      a database page may be written atomically, and
**  (b) the sector size is less than or equal to the page size.
**
** If the database file is not open, the optimization is treated as
** usable without consulting the device.
**
** If the optimization cannot be used, 0 is returned.  If it can be
** used, the value returned is the size of the journal file when it
** contains rollback data for exactly one page.
*/
#ifdef SQLITE_ENABLE_ATOMIC_WRITE
static int jrnlBufferSize(Pager *pPager){
  sqlite3_file *pDbFd = pPager->fd;

  /* The (szPage>>8) trick below relies on the SQLITE_IOCAP_ATOMICnnn
  ** flags being equal to nnn>>8. */
  assert( SQLITE_IOCAP_ATOMIC512==(512>>8) );
  assert( SQLITE_IOCAP_ATOMIC64K==(65536>>8) );

  if( pDbFd->pMethods ){
    int devChar = sqlite3OsDeviceCharacteristics(pDbFd);
    int szSector = pPager->sectorSize;
    int pgsz = pPager->pageSize;
    int atomicMask = SQLITE_IOCAP_ATOMIC | (pgsz>>8);

    if( (devChar & atomicMask)==0 || szSector>pgsz ){
      return 0;
    }
  }
  return JOURNAL_HDR_SZ(pPager) + JOURNAL_PG_SZ(pPager);
}
#endif
425c7b6017cSdanielk1977 
426c7b6017cSdanielk1977 /*
427aef0bf64Sdanielk1977 ** This function should be called when an error occurs within the pager
428a96a7103Sdanielk1977 ** code. The first argument is a pointer to the pager structure, the
429a96a7103Sdanielk1977 ** second the error-code about to be returned by a pager API function.
430a96a7103Sdanielk1977 ** The value returned is a copy of the second argument to this function.
431a96a7103Sdanielk1977 **
4324f0ee686Sdrh ** If the second argument is SQLITE_IOERR, SQLITE_CORRUPT, or SQLITE_FULL
433ae72d982Sdanielk1977 ** the error becomes persistent. Until the persisten error is cleared,
434ae72d982Sdanielk1977 ** subsequent API calls on this Pager will immediately return the same
435ae72d982Sdanielk1977 ** error code.
436ae72d982Sdanielk1977 **
437ae72d982Sdanielk1977 ** A persistent error indicates that the contents of the pager-cache
438ae72d982Sdanielk1977 ** cannot be trusted. This state can be cleared by completely discarding
439ae72d982Sdanielk1977 ** the contents of the pager-cache. If a transaction was active when
440ae72d982Sdanielk1977 ** the persistent error occured, then the rollback journal may need
441ae72d982Sdanielk1977 ** to be replayed.
442aef0bf64Sdanielk1977 */
443ae72d982Sdanielk1977 static void pager_unlock(Pager *pPager);
444aef0bf64Sdanielk1977 static int pager_error(Pager *pPager, int rc){
4454ac285a1Sdrh   int rc2 = rc & 0xff;
44634f5621fSdrh   assert(
44734f5621fSdrh        pPager->errCode==SQLITE_FULL ||
44834f5621fSdrh        pPager->errCode==SQLITE_OK ||
44934f5621fSdrh        (pPager->errCode & 0xff)==SQLITE_IOERR
45034f5621fSdrh   );
451efaaf579Sdanielk1977   if(
4524ac285a1Sdrh     rc2==SQLITE_FULL ||
4534ac285a1Sdrh     rc2==SQLITE_IOERR ||
4544f0ee686Sdrh     rc2==SQLITE_CORRUPT
455efaaf579Sdanielk1977   ){
456efaaf579Sdanielk1977     pPager->errCode = rc;
4578c0a791aSdanielk1977     if( pPager->state==PAGER_UNLOCK
4588c0a791aSdanielk1977      && sqlite3PcacheRefCount(pPager->pPCache)==0
4598c0a791aSdanielk1977     ){
460ae72d982Sdanielk1977       /* If the pager is already unlocked, call pager_unlock() now to
461ae72d982Sdanielk1977       ** clear the error state and ensure that the pager-cache is
462ae72d982Sdanielk1977       ** completely empty.
463ae72d982Sdanielk1977       */
464ae72d982Sdanielk1977       pager_unlock(pPager);
465ae72d982Sdanielk1977     }
466aef0bf64Sdanielk1977   }
467aef0bf64Sdanielk1977   return rc;
468aef0bf64Sdanielk1977 }
469aef0bf64Sdanielk1977 
470477731b5Sdrh /*
471477731b5Sdrh ** If SQLITE_CHECK_PAGES is defined then we do some sanity checking
472477731b5Sdrh ** on the cache using a hash function.  This is used for testing
473477731b5Sdrh ** and debugging only.
474477731b5Sdrh */
4753c407374Sdanielk1977 #ifdef SQLITE_CHECK_PAGES
4763c407374Sdanielk1977 /*
4773c407374Sdanielk1977 ** Return a 32-bit hash of the page data for pPage.
4783c407374Sdanielk1977 */
479477731b5Sdrh static u32 pager_datahash(int nByte, unsigned char *pData){
4803c407374Sdanielk1977   u32 hash = 0;
4813c407374Sdanielk1977   int i;
482477731b5Sdrh   for(i=0; i<nByte; i++){
483477731b5Sdrh     hash = (hash*1039) + pData[i];
4843c407374Sdanielk1977   }
4853c407374Sdanielk1977   return hash;
4863c407374Sdanielk1977 }
487477731b5Sdrh static u32 pager_pagehash(PgHdr *pPage){
4888c0a791aSdanielk1977   return pager_datahash(pPage->pPager->pageSize, (unsigned char *)pPage->pData);
4898c0a791aSdanielk1977 }
490bc2ca9ebSdanielk1977 static void pager_set_pagehash(PgHdr *pPage){
4918c0a791aSdanielk1977   pPage->pageHash = pager_pagehash(pPage);
492477731b5Sdrh }
4933c407374Sdanielk1977 
4943c407374Sdanielk1977 /*
4953c407374Sdanielk1977 ** The CHECK_PAGE macro takes a PgHdr* as an argument. If SQLITE_CHECK_PAGES
4963c407374Sdanielk1977 ** is defined, and NDEBUG is not defined, an assert() statement checks
4973c407374Sdanielk1977 ** that the page is either dirty or still matches the calculated page-hash.
4983c407374Sdanielk1977 */
4993c407374Sdanielk1977 #define CHECK_PAGE(x) checkPage(x)
5003c407374Sdanielk1977 static void checkPage(PgHdr *pPg){
5013c407374Sdanielk1977   Pager *pPager = pPg->pPager;
502b3175389Sdanielk1977   assert( !pPg->pageHash || pPager->errCode
5038c0a791aSdanielk1977       || (pPg->flags&PGHDR_DIRTY) || pPg->pageHash==pager_pagehash(pPg) );
5043c407374Sdanielk1977 }
5053c407374Sdanielk1977 
5063c407374Sdanielk1977 #else
/* Page-hash checking is disabled: the hash helpers evaluate to 0 and
** CHECK_PAGE() expands to nothing. */
#define pager_datahash(X,Y)  0
#define pager_pagehash(X)    0
#define CHECK_PAGE(x)
51041d3027cSdrh #endif  /* SQLITE_CHECK_PAGES */
5113c407374Sdanielk1977 
512ed7c855cSdrh /*
5137657240aSdanielk1977 ** When this is called the journal file for pager pPager must be open.
5147657240aSdanielk1977 ** The master journal file name is read from the end of the file and
51565839c6aSdanielk1977 ** written into memory supplied by the caller.
5167657240aSdanielk1977 **
51765839c6aSdanielk1977 ** zMaster must point to a buffer of at least nMaster bytes allocated by
51865839c6aSdanielk1977 ** the caller. This should be sqlite3_vfs.mxPathname+1 (to ensure there is
51965839c6aSdanielk1977 ** enough space to write the master journal name). If the master journal
52065839c6aSdanielk1977 ** name in the journal is longer than nMaster bytes (including a
52165839c6aSdanielk1977 ** nul-terminator), then this is handled as if no master journal name
52265839c6aSdanielk1977 ** were present in the journal.
52365839c6aSdanielk1977 **
52465839c6aSdanielk1977 ** If no master journal file name is present zMaster[0] is set to 0 and
5257657240aSdanielk1977 ** SQLITE_OK returned.
5267657240aSdanielk1977 */
527d92db531Sdanielk1977 static int readMasterJournal(sqlite3_file *pJrnl, char *zMaster, u32 nMaster){
5287657240aSdanielk1977   int rc;
5297657240aSdanielk1977   u32 len;
530eb206256Sdrh   i64 szJ;
531c3e8f5efSdanielk1977   u32 cksum;
5320b8d2766Sshane   u32 u;                   /* Unsigned loop counter */
5337657240aSdanielk1977   unsigned char aMagic[8]; /* A buffer to hold the magic header */
5347657240aSdanielk1977 
53565839c6aSdanielk1977   zMaster[0] = '\0';
5367657240aSdanielk1977 
537054889ecSdrh   rc = sqlite3OsFileSize(pJrnl, &szJ);
538cafadbacSdanielk1977   if( rc!=SQLITE_OK || szJ<16 ) return rc;
5397657240aSdanielk1977 
54062079060Sdanielk1977   rc = read32bits(pJrnl, szJ-16, &len);
5417657240aSdanielk1977   if( rc!=SQLITE_OK ) return rc;
5427657240aSdanielk1977 
54365839c6aSdanielk1977   if( len>=nMaster ){
54465839c6aSdanielk1977     return SQLITE_OK;
54565839c6aSdanielk1977   }
54665839c6aSdanielk1977 
54762079060Sdanielk1977   rc = read32bits(pJrnl, szJ-12, &cksum);
5487657240aSdanielk1977   if( rc!=SQLITE_OK ) return rc;
5497657240aSdanielk1977 
55062079060Sdanielk1977   rc = sqlite3OsRead(pJrnl, aMagic, 8, szJ-8);
5517657240aSdanielk1977   if( rc!=SQLITE_OK || memcmp(aMagic, aJournalMagic, 8) ) return rc;
5527657240aSdanielk1977 
55365839c6aSdanielk1977   rc = sqlite3OsRead(pJrnl, zMaster, len, szJ-16-len);
5547657240aSdanielk1977   if( rc!=SQLITE_OK ){
5557657240aSdanielk1977     return rc;
5567657240aSdanielk1977   }
55765839c6aSdanielk1977   zMaster[len] = '\0';
5587657240aSdanielk1977 
559cafadbacSdanielk1977   /* See if the checksum matches the master journal name */
5600b8d2766Sshane   for(u=0; u<len; u++){
5610b8d2766Sshane     cksum -= zMaster[u];
562cafadbacSdanielk1977    }
5638191bff0Sdanielk1977   if( cksum ){
5648191bff0Sdanielk1977     /* If the checksum doesn't add up, then one or more of the disk sectors
5658191bff0Sdanielk1977     ** containing the master journal filename is corrupted. This means
5668191bff0Sdanielk1977     ** definitely roll back, so just return SQLITE_OK and report a (nul)
5678191bff0Sdanielk1977     ** master-journal filename.
5688191bff0Sdanielk1977     */
56965839c6aSdanielk1977     zMaster[0] = '\0';
570aca790acSdanielk1977   }
571cafadbacSdanielk1977 
5727657240aSdanielk1977   return SQLITE_OK;
5737657240aSdanielk1977 }
5747657240aSdanielk1977 
5757657240aSdanielk1977 /*
5767657240aSdanielk1977 ** Seek the journal file descriptor to the next sector boundary where a
5777657240aSdanielk1977 ** journal header may be read or written. Pager.journalOff is updated with
5787657240aSdanielk1977 ** the new seek offset.
5797657240aSdanielk1977 **
5807657240aSdanielk1977 ** i.e for a sector size of 512:
5817657240aSdanielk1977 **
5827657240aSdanielk1977 ** Input Offset              Output Offset
5837657240aSdanielk1977 ** ---------------------------------------
5847657240aSdanielk1977 ** 0                         0
5857657240aSdanielk1977 ** 512                       512
5867657240aSdanielk1977 ** 100                       512
5877657240aSdanielk1977 ** 2000                      2048
5887657240aSdanielk1977 **
5897657240aSdanielk1977 */
590112f752bSdanielk1977 static i64 journalHdrOffset(Pager *pPager){
591eb206256Sdrh   i64 offset = 0;
592eb206256Sdrh   i64 c = pPager->journalOff;
5937657240aSdanielk1977   if( c ){
5947657240aSdanielk1977     offset = ((c-1)/JOURNAL_HDR_SZ(pPager) + 1) * JOURNAL_HDR_SZ(pPager);
5957657240aSdanielk1977   }
5967657240aSdanielk1977   assert( offset%JOURNAL_HDR_SZ(pPager)==0 );
5977657240aSdanielk1977   assert( offset>=c );
5987657240aSdanielk1977   assert( (offset-c)<JOURNAL_HDR_SZ(pPager) );
599112f752bSdanielk1977   return offset;
600112f752bSdanielk1977 }
601112f752bSdanielk1977 static void seekJournalHdr(Pager *pPager){
602112f752bSdanielk1977   pPager->journalOff = journalHdrOffset(pPager);
6037657240aSdanielk1977 }
6047657240aSdanielk1977 
6057657240aSdanielk1977 /*
606f3a87624Sdrh ** Write zeros over the header of the journal file.  This has the
607f3a87624Sdrh ** effect of invalidating the journal file and committing the
608f3a87624Sdrh ** transaction.
609f3a87624Sdrh */
610df2566a3Sdanielk1977 static int zeroJournalHdr(Pager *pPager, int doTruncate){
611df2566a3Sdanielk1977   int rc = SQLITE_OK;
61255a25a12Sdanielk1977   static const char zeroHdr[28] = {0};
613f3a87624Sdrh 
614df2566a3Sdanielk1977   if( pPager->journalOff ){
615b53e4960Sdanielk1977     i64 iLimit = pPager->journalSizeLimit;
616b53e4960Sdanielk1977 
617f3a87624Sdrh     IOTRACE(("JZEROHDR %p\n", pPager))
618b53e4960Sdanielk1977     if( doTruncate || iLimit==0 ){
619df2566a3Sdanielk1977       rc = sqlite3OsTruncate(pPager->jfd, 0);
620df2566a3Sdanielk1977     }else{
621f3a87624Sdrh       rc = sqlite3OsWrite(pPager->jfd, zeroHdr, sizeof(zeroHdr), 0);
622df2566a3Sdanielk1977     }
6238162054bSdanielk1977     if( rc==SQLITE_OK && !pPager->noSync ){
624a06ecba2Sdrh       rc = sqlite3OsSync(pPager->jfd, SQLITE_SYNC_DATAONLY|pPager->sync_flags);
625a06ecba2Sdrh     }
626b53e4960Sdanielk1977 
627b53e4960Sdanielk1977     /* At this point the transaction is committed but the write lock
628b53e4960Sdanielk1977     ** is still held on the file. If there is a size limit configured for
629b53e4960Sdanielk1977     ** the persistent journal and the journal file currently consumes more
630b53e4960Sdanielk1977     ** space than that limit allows for, truncate it now. There is no need
631b53e4960Sdanielk1977     ** to sync the file following this operation.
632b53e4960Sdanielk1977     */
633b53e4960Sdanielk1977     if( rc==SQLITE_OK && iLimit>0 ){
634b53e4960Sdanielk1977       i64 sz;
635b53e4960Sdanielk1977       rc = sqlite3OsFileSize(pPager->jfd, &sz);
636b53e4960Sdanielk1977       if( rc==SQLITE_OK && sz>iLimit ){
637b53e4960Sdanielk1977         rc = sqlite3OsTruncate(pPager->jfd, iLimit);
638b53e4960Sdanielk1977       }
639b53e4960Sdanielk1977     }
640df2566a3Sdanielk1977   }
641f3a87624Sdrh   return rc;
642f3a87624Sdrh }
643f3a87624Sdrh 
644f3a87624Sdrh /*
6457657240aSdanielk1977 ** The journal file must be open when this routine is called. A journal
6467657240aSdanielk1977 ** header (JOURNAL_HDR_SZ bytes) is written into the journal file at the
6477657240aSdanielk1977 ** current location.
6487657240aSdanielk1977 **
6497657240aSdanielk1977 ** The format for the journal header is as follows:
6507657240aSdanielk1977 ** - 8 bytes: Magic identifying journal format.
6517657240aSdanielk1977 ** - 4 bytes: Number of records in journal, or -1 no-sync mode is on.
6527657240aSdanielk1977 ** - 4 bytes: Random number used for page hash.
6537657240aSdanielk1977 ** - 4 bytes: Initial database page count.
6547657240aSdanielk1977 ** - 4 bytes: Sector size used by the process that wrote this journal.
65567c007bfSdanielk1977 ** - 4 bytes: Database page size.
6567657240aSdanielk1977 **
65767c007bfSdanielk1977 ** Followed by (JOURNAL_HDR_SZ - 28) bytes of unused space.
6587657240aSdanielk1977 */
6597657240aSdanielk1977 static int writeJournalHdr(Pager *pPager){
660a664f8ebSdanielk1977   int rc = SQLITE_OK;
661a664f8ebSdanielk1977   char *zHeader = pPager->pTmpSpace;
662d92db531Sdanielk1977   u32 nHeader = pPager->pageSize;
663d92db531Sdanielk1977   u32 nWrite;
664fd7f0452Sdanielk1977   int ii;
665a664f8ebSdanielk1977 
666a664f8ebSdanielk1977   if( nHeader>JOURNAL_HDR_SZ(pPager) ){
667a664f8ebSdanielk1977     nHeader = JOURNAL_HDR_SZ(pPager);
668a664f8ebSdanielk1977   }
6697657240aSdanielk1977 
670fd7f0452Sdanielk1977   /* If there are active savepoints and any of them were created since the
671fd7f0452Sdanielk1977   ** most recent journal header was written, update the PagerSavepoint.iHdrOff
672fd7f0452Sdanielk1977   ** fields now.
673fd7f0452Sdanielk1977   */
674fd7f0452Sdanielk1977   for(ii=0; ii<pPager->nSavepoint; ii++){
675fd7f0452Sdanielk1977     if( pPager->aSavepoint[ii].iHdrOffset==0 ){
676fd7f0452Sdanielk1977       pPager->aSavepoint[ii].iHdrOffset = pPager->journalOff;
677fd7f0452Sdanielk1977     }
6784099f6e1Sdanielk1977   }
6794099f6e1Sdanielk1977 
68062079060Sdanielk1977   seekJournalHdr(pPager);
6817657240aSdanielk1977   pPager->journalHdr = pPager->journalOff;
6827657240aSdanielk1977 
68397b57484Sdrh   memcpy(zHeader, aJournalMagic, sizeof(aJournalMagic));
6844cd2cd5cSdanielk1977 
6854cd2cd5cSdanielk1977   /*
6864cd2cd5cSdanielk1977   ** Write the nRec Field - the number of page records that follow this
6874cd2cd5cSdanielk1977   ** journal header. Normally, zero is written to this value at this time.
6884cd2cd5cSdanielk1977   ** After the records are added to the journal (and the journal synced,
6894cd2cd5cSdanielk1977   ** if in full-sync mode), the zero is overwritten with the true number
6904cd2cd5cSdanielk1977   ** of records (see syncJournal()).
6914cd2cd5cSdanielk1977   **
6924cd2cd5cSdanielk1977   ** A faster alternative is to write 0xFFFFFFFF to the nRec field. When
6934cd2cd5cSdanielk1977   ** reading the journal this value tells SQLite to assume that the
6944cd2cd5cSdanielk1977   ** rest of the journal file contains valid page records. This assumption
6954cd2cd5cSdanielk1977   ** is dangerous, as if a failure occured whilst writing to the journal
6964cd2cd5cSdanielk1977   ** file it may contain some garbage data. There are two scenarios
6974cd2cd5cSdanielk1977   ** where this risk can be ignored:
6984cd2cd5cSdanielk1977   **
6994cd2cd5cSdanielk1977   **   * When the pager is in no-sync mode. Corruption can follow a
7004cd2cd5cSdanielk1977   **     power failure in this case anyway.
7014cd2cd5cSdanielk1977   **
7024cd2cd5cSdanielk1977   **   * When the SQLITE_IOCAP_SAFE_APPEND flag is set. This guarantees
7034cd2cd5cSdanielk1977   **     that garbage data is never appended to the journal file.
7044cd2cd5cSdanielk1977   */
7054cd2cd5cSdanielk1977   assert(pPager->fd->pMethods||pPager->noSync);
706b3175389Sdanielk1977   if( (pPager->noSync) || (pPager->journalMode==PAGER_JOURNALMODE_MEMORY)
7074cd2cd5cSdanielk1977    || (sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND)
7084cd2cd5cSdanielk1977   ){
7094cd2cd5cSdanielk1977     put32bits(&zHeader[sizeof(aJournalMagic)], 0xffffffff);
7104cd2cd5cSdanielk1977   }else{
7114cd2cd5cSdanielk1977     put32bits(&zHeader[sizeof(aJournalMagic)], 0);
7124cd2cd5cSdanielk1977   }
7134cd2cd5cSdanielk1977 
7147657240aSdanielk1977   /* The random check-hash initialiser */
7152fa1868fSdrh   sqlite3_randomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
71697b57484Sdrh   put32bits(&zHeader[sizeof(aJournalMagic)+4], pPager->cksumInit);
7177657240aSdanielk1977   /* The initial database size */
7183460d19cSdanielk1977   put32bits(&zHeader[sizeof(aJournalMagic)+8], pPager->dbOrigSize);
7197657240aSdanielk1977   /* The assumed sector size for this process */
72097b57484Sdrh   put32bits(&zHeader[sizeof(aJournalMagic)+12], pPager->sectorSize);
72108609ce7Sdrh 
72208609ce7Sdrh   /* Initializing the tail of the buffer is not necessary.  Everything
72308609ce7Sdrh   ** works find if the following memset() is omitted.  But initializing
72408609ce7Sdrh   ** the memory prevents valgrind from complaining, so we are willing to
72508609ce7Sdrh   ** take the performance hit.
72608609ce7Sdrh   */
72708609ce7Sdrh   memset(&zHeader[sizeof(aJournalMagic)+16], 0,
72808609ce7Sdrh          nHeader-(sizeof(aJournalMagic)+16));
72908609ce7Sdrh 
73067c007bfSdanielk1977   if( pPager->journalHdr==0 ){
73167c007bfSdanielk1977     /* The page size */
73267c007bfSdanielk1977     put32bits(&zHeader[sizeof(aJournalMagic)+16], pPager->pageSize);
73367c007bfSdanielk1977   }
7347657240aSdanielk1977 
735a664f8ebSdanielk1977   for(nWrite=0; rc==SQLITE_OK&&nWrite<JOURNAL_HDR_SZ(pPager); nWrite+=nHeader){
736a664f8ebSdanielk1977     IOTRACE(("JHDR %p %lld %d\n", pPager, pPager->journalHdr, nHeader))
737a664f8ebSdanielk1977     rc = sqlite3OsWrite(pPager->jfd, zHeader, nHeader, pPager->journalOff);
738a664f8ebSdanielk1977     pPager->journalOff += nHeader;
739b4746b9eSdrh   }
740a664f8ebSdanielk1977 
7417657240aSdanielk1977   return rc;
7427657240aSdanielk1977 }
7437657240aSdanielk1977 
7447657240aSdanielk1977 /*
7457657240aSdanielk1977 ** The journal file must be open when this is called. A journal header file
7467657240aSdanielk1977 ** (JOURNAL_HDR_SZ bytes) is read from the current location in the journal
747d6e5e098Sdrh ** file. The current location in the journal file is given by
748d6e5e098Sdrh ** pPager->journalOff.  See comments above function writeJournalHdr() for
749d6e5e098Sdrh ** a description of the journal header format.
7507657240aSdanielk1977 **
7517657240aSdanielk1977 ** If the header is read successfully, *nRec is set to the number of
7527657240aSdanielk1977 ** page records following this header and *dbSize is set to the size of the
7537657240aSdanielk1977 ** database before the transaction began, in pages. Also, pPager->cksumInit
7547657240aSdanielk1977 ** is set to the value read from the journal header. SQLITE_OK is returned
7557657240aSdanielk1977 ** in this case.
7567657240aSdanielk1977 **
7577657240aSdanielk1977 ** If the journal header file appears to be corrupted, SQLITE_DONE is
758d6e5e098Sdrh ** returned and *nRec and *dbSize are undefined.  If JOURNAL_HDR_SZ bytes
7597657240aSdanielk1977 ** cannot be read from the journal file an error code is returned.
7607657240aSdanielk1977 */
7617657240aSdanielk1977 static int readJournalHdr(
7627657240aSdanielk1977   Pager *pPager,
763eb206256Sdrh   i64 journalSize,
7647657240aSdanielk1977   u32 *pNRec,
7657657240aSdanielk1977   u32 *pDbSize
7667657240aSdanielk1977 ){
7677657240aSdanielk1977   int rc;
7687657240aSdanielk1977   unsigned char aMagic[8]; /* A buffer to hold the magic header */
76962079060Sdanielk1977   i64 jrnlOff;
7707cbd589dSdanielk1977   u32 iPageSize;
7717cbd589dSdanielk1977   u32 iSectorSize;
7727657240aSdanielk1977 
77362079060Sdanielk1977   seekJournalHdr(pPager);
7747657240aSdanielk1977   if( pPager->journalOff+JOURNAL_HDR_SZ(pPager) > journalSize ){
7757657240aSdanielk1977     return SQLITE_DONE;
7767657240aSdanielk1977   }
77762079060Sdanielk1977   jrnlOff = pPager->journalOff;
7787657240aSdanielk1977 
77962079060Sdanielk1977   rc = sqlite3OsRead(pPager->jfd, aMagic, sizeof(aMagic), jrnlOff);
7807657240aSdanielk1977   if( rc ) return rc;
78162079060Sdanielk1977   jrnlOff += sizeof(aMagic);
7827657240aSdanielk1977 
7837657240aSdanielk1977   if( memcmp(aMagic, aJournalMagic, sizeof(aMagic))!=0 ){
7847657240aSdanielk1977     return SQLITE_DONE;
7857657240aSdanielk1977   }
7867657240aSdanielk1977 
78762079060Sdanielk1977   rc = read32bits(pPager->jfd, jrnlOff, pNRec);
7887657240aSdanielk1977   if( rc ) return rc;
7897657240aSdanielk1977 
79062079060Sdanielk1977   rc = read32bits(pPager->jfd, jrnlOff+4, &pPager->cksumInit);
7917657240aSdanielk1977   if( rc ) return rc;
7927657240aSdanielk1977 
79362079060Sdanielk1977   rc = read32bits(pPager->jfd, jrnlOff+8, pDbSize);
7947657240aSdanielk1977   if( rc ) return rc;
7957657240aSdanielk1977 
7967cbd589dSdanielk1977   if( pPager->journalOff==0 ){
7977cbd589dSdanielk1977     rc = read32bits(pPager->jfd, jrnlOff+16, &iPageSize);
7987cbd589dSdanielk1977     if( rc ) return rc;
7997cbd589dSdanielk1977 
8007cbd589dSdanielk1977     if( iPageSize<512
8017cbd589dSdanielk1977      || iPageSize>SQLITE_MAX_PAGE_SIZE
8027cbd589dSdanielk1977      || ((iPageSize-1)&iPageSize)!=0
80367c007bfSdanielk1977     ){
8047cbd589dSdanielk1977       /* If the page-size in the journal-header is invalid, then the process
8057cbd589dSdanielk1977       ** that wrote the journal-header must have crashed before the header
8067cbd589dSdanielk1977       ** was synced. In this case stop reading the journal file here.
8077cbd589dSdanielk1977       */
8087cbd589dSdanielk1977       rc = SQLITE_DONE;
8097cbd589dSdanielk1977     }else{
8104f21c4afSdrh       u16 pagesize = (u16)iPageSize;
81167c007bfSdanielk1977       rc = sqlite3PagerSetPagesize(pPager, &pagesize);
8127cbd589dSdanielk1977       assert( rc!=SQLITE_OK || pagesize==(u16)iPageSize );
81367c007bfSdanielk1977     }
81467c007bfSdanielk1977     if( rc ) return rc;
81567c007bfSdanielk1977 
8167657240aSdanielk1977     /* Update the assumed sector-size to match the value used by
8177657240aSdanielk1977     ** the process that created this journal. If this journal was
8187657240aSdanielk1977     ** created by a process other than this one, then this routine
8197657240aSdanielk1977     ** is being called from within pager_playback(). The local value
8207657240aSdanielk1977     ** of Pager.sectorSize is restored at the end of that routine.
8217657240aSdanielk1977     */
8227cbd589dSdanielk1977     rc = read32bits(pPager->jfd, jrnlOff+12, &iSectorSize);
8237657240aSdanielk1977     if( rc ) return rc;
8247cbd589dSdanielk1977     if( (iSectorSize&(iSectorSize-1))
8257cbd589dSdanielk1977       || iSectorSize<512
8267cbd589dSdanielk1977       || iSectorSize>MAX_SECTOR_SIZE
8277cbd589dSdanielk1977     ){
82898c58356Sdrh       return SQLITE_DONE;
82998c58356Sdrh     }
8307cbd589dSdanielk1977     pPager->sectorSize = iSectorSize;
8317cbd589dSdanielk1977   }
8327657240aSdanielk1977 
8337657240aSdanielk1977   pPager->journalOff += JOURNAL_HDR_SZ(pPager);
83462079060Sdanielk1977   return SQLITE_OK;
8357657240aSdanielk1977 }
8367657240aSdanielk1977 
8377657240aSdanielk1977 
8387657240aSdanielk1977 /*
8397657240aSdanielk1977 ** Write the supplied master journal name into the journal file for pager
840cafadbacSdanielk1977 ** pPager at the current location. The master journal name must be the last
841cafadbacSdanielk1977 ** thing written to a journal file. If the pager is in full-sync mode, the
842cafadbacSdanielk1977 ** journal file descriptor is advanced to the next sector boundary before
843cafadbacSdanielk1977 ** anything is written. The format is:
844cafadbacSdanielk1977 **
845cafadbacSdanielk1977 ** + 4 bytes: PAGER_MJ_PGNO.
846cafadbacSdanielk1977 ** + N bytes: length of master journal name.
847cafadbacSdanielk1977 ** + 4 bytes: N
848cafadbacSdanielk1977 ** + 4 bytes: Master journal name checksum.
849cafadbacSdanielk1977 ** + 8 bytes: aJournalMagic[].
850cafadbacSdanielk1977 **
851cafadbacSdanielk1977 ** The master journal page checksum is the sum of the bytes in the master
852cafadbacSdanielk1977 ** journal name.
853aef0bf64Sdanielk1977 **
854aef0bf64Sdanielk1977 ** If zMaster is a NULL pointer (occurs for a single database transaction),
855aef0bf64Sdanielk1977 ** this call is a no-op.
8567657240aSdanielk1977 */
8577657240aSdanielk1977 static int writeMasterJournal(Pager *pPager, const char *zMaster){
8587657240aSdanielk1977   int rc;
8597657240aSdanielk1977   int len;
860cafadbacSdanielk1977   int i;
86162079060Sdanielk1977   i64 jrnlOff;
862df2566a3Sdanielk1977   i64 jrnlSize;
863c3e8f5efSdanielk1977   u32 cksum = 0;
86497b57484Sdrh   char zBuf[sizeof(aJournalMagic)+2*4];
8657657240aSdanielk1977 
8667657240aSdanielk1977   if( !zMaster || pPager->setMaster ) return SQLITE_OK;
867b3175389Sdanielk1977   if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY ) return SQLITE_OK;
8687657240aSdanielk1977   pPager->setMaster = 1;
8697657240aSdanielk1977 
870ea678832Sdrh   len = sqlite3Strlen30(zMaster);
871cafadbacSdanielk1977   for(i=0; i<len; i++){
872cafadbacSdanielk1977     cksum += zMaster[i];
873cafadbacSdanielk1977   }
8747657240aSdanielk1977 
8757657240aSdanielk1977   /* If in full-sync mode, advance to the next disk sector before writing
8767657240aSdanielk1977   ** the master journal name. This is in case the previous page written to
8777657240aSdanielk1977   ** the journal has already been synced.
8787657240aSdanielk1977   */
8797657240aSdanielk1977   if( pPager->fullSync ){
88062079060Sdanielk1977     seekJournalHdr(pPager);
8817657240aSdanielk1977   }
88262079060Sdanielk1977   jrnlOff = pPager->journalOff;
883cafadbacSdanielk1977   pPager->journalOff += (len+20);
8847657240aSdanielk1977 
88562079060Sdanielk1977   rc = write32bits(pPager->jfd, jrnlOff, PAGER_MJ_PGNO(pPager));
8867657240aSdanielk1977   if( rc!=SQLITE_OK ) return rc;
88762079060Sdanielk1977   jrnlOff += 4;
8887657240aSdanielk1977 
88962079060Sdanielk1977   rc = sqlite3OsWrite(pPager->jfd, zMaster, len, jrnlOff);
8907657240aSdanielk1977   if( rc!=SQLITE_OK ) return rc;
89162079060Sdanielk1977   jrnlOff += len;
8927657240aSdanielk1977 
89397b57484Sdrh   put32bits(zBuf, len);
89497b57484Sdrh   put32bits(&zBuf[4], cksum);
89597b57484Sdrh   memcpy(&zBuf[8], aJournalMagic, sizeof(aJournalMagic));
89662079060Sdanielk1977   rc = sqlite3OsWrite(pPager->jfd, zBuf, 8+sizeof(aJournalMagic), jrnlOff);
897df2566a3Sdanielk1977   jrnlOff += 8+sizeof(aJournalMagic);
8982c8997b9Sdrh   pPager->needSync = !pPager->noSync;
899df2566a3Sdanielk1977 
900df2566a3Sdanielk1977   /* If the pager is in peristent-journal mode, then the physical
901df2566a3Sdanielk1977   ** journal-file may extend past the end of the master-journal name
902df2566a3Sdanielk1977   ** and 8 bytes of magic data just written to the file. This is
903df2566a3Sdanielk1977   ** dangerous because the code to rollback a hot-journal file
904df2566a3Sdanielk1977   ** will not be able to find the master-journal name to determine
905df2566a3Sdanielk1977   ** whether or not the journal is hot.
906df2566a3Sdanielk1977   **
907df2566a3Sdanielk1977   ** Easiest thing to do in this scenario is to truncate the journal
908df2566a3Sdanielk1977   ** file to the required size.
909df2566a3Sdanielk1977   */
910df2566a3Sdanielk1977   if( (rc==SQLITE_OK)
911df2566a3Sdanielk1977    && (rc = sqlite3OsFileSize(pPager->jfd, &jrnlSize))==SQLITE_OK
912df2566a3Sdanielk1977    && jrnlSize>jrnlOff
913df2566a3Sdanielk1977   ){
914df2566a3Sdanielk1977     rc = sqlite3OsTruncate(pPager->jfd, jrnlOff);
915df2566a3Sdanielk1977   }
9167657240aSdanielk1977   return rc;
9177657240aSdanielk1977 }
9187657240aSdanielk1977 
9197657240aSdanielk1977 /*
920ed7c855cSdrh ** Find a page in the hash table given its page number.  Return
921ed7c855cSdrh ** a pointer to the page or NULL if not found.
922ed7c855cSdrh */
923d9b0257aSdrh static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
9248ca0c724Sdrh   PgHdr *p;
9258c0a791aSdanielk1977   sqlite3PcacheFetch(pPager->pPCache, pgno, 0, &p);
926ed7c855cSdrh   return p;
927ed7c855cSdrh }
928ed7c855cSdrh 
929ed7c855cSdrh /*
930e180dd93Sdanielk1977 ** Clear the in-memory cache.  This routine
931ed7c855cSdrh ** sets the state of the pager back to what it was when it was first
932ed7c855cSdrh ** opened.  Any outstanding pages are invalidated and subsequent attempts
933ed7c855cSdrh ** to access those pages will likely result in a coredump.
934ed7c855cSdrh */
935d9b0257aSdrh static void pager_reset(Pager *pPager){
936efaaf579Sdanielk1977   if( pPager->errCode ) return;
9378c0a791aSdanielk1977   sqlite3PcacheClear(pPager->pPCache);
938e277be05Sdanielk1977 }
939e277be05Sdanielk1977 
94034cf35daSdanielk1977 /*
94134cf35daSdanielk1977 ** Free all structures in the Pager.aSavepoint[] array and set both
94234cf35daSdanielk1977 ** Pager.aSavepoint and Pager.nSavepoint to zero. Close the sub-journal
94334cf35daSdanielk1977 ** if it is open and the pager is not in exclusive mode.
94434cf35daSdanielk1977 */
945fd7f0452Sdanielk1977 static void releaseAllSavepoint(Pager *pPager){
946fd7f0452Sdanielk1977   int ii;
947fd7f0452Sdanielk1977   for(ii=0; ii<pPager->nSavepoint; ii++){
948fd7f0452Sdanielk1977     sqlite3BitvecDestroy(pPager->aSavepoint[ii].pInSavepoint);
949fd7f0452Sdanielk1977   }
950fd7f0452Sdanielk1977   if( !pPager->exclusiveMode ){
951fd7f0452Sdanielk1977     sqlite3OsClose(pPager->sjfd);
952fd7f0452Sdanielk1977   }
953fd7f0452Sdanielk1977   sqlite3_free(pPager->aSavepoint);
954fd7f0452Sdanielk1977   pPager->aSavepoint = 0;
955fd7f0452Sdanielk1977   pPager->nSavepoint = 0;
95667ddef69Sdanielk1977   pPager->stmtNRec = 0;
957fd7f0452Sdanielk1977 }
958fd7f0452Sdanielk1977 
95934cf35daSdanielk1977 /*
96034cf35daSdanielk1977 ** Set the bit number pgno in the PagerSavepoint.pInSavepoint bitvecs of
96134cf35daSdanielk1977 ** all open savepoints.
96234cf35daSdanielk1977 */
963fd7f0452Sdanielk1977 static int addToSavepointBitvecs(Pager *pPager, Pgno pgno){
9647539b6b8Sdrh   int ii;                   /* Loop counter */
9657539b6b8Sdrh   int rc = SQLITE_OK;       /* Result code */
9667539b6b8Sdrh 
967fd7f0452Sdanielk1977   for(ii=0; ii<pPager->nSavepoint; ii++){
968fd7f0452Sdanielk1977     PagerSavepoint *p = &pPager->aSavepoint[ii];
969fd7f0452Sdanielk1977     if( pgno<=p->nOrig ){
9707539b6b8Sdrh       rc |= sqlite3BitvecSet(p->pInSavepoint, pgno);
9717539b6b8Sdrh       assert( rc==SQLITE_OK || rc==SQLITE_NOMEM );
972fd7f0452Sdanielk1977     }
973fd7f0452Sdanielk1977   }
9747539b6b8Sdrh   return rc;
975fd7f0452Sdanielk1977 }
976fd7f0452Sdanielk1977 
977e277be05Sdanielk1977 /*
978ae72d982Sdanielk1977 ** Unlock the database file.
979ae72d982Sdanielk1977 **
980ae72d982Sdanielk1977 ** If the pager is currently in error state, discard the contents of
981ae72d982Sdanielk1977 ** the cache and reset the Pager structure internal state. If there is
982ae72d982Sdanielk1977 ** an open journal-file, then the next time a shared-lock is obtained
983ae72d982Sdanielk1977 ** on the pager file (by this or any other process), it will be
984ae72d982Sdanielk1977 ** treated as a hot-journal and rolled back.
985ae72d982Sdanielk1977 */
986ae72d982Sdanielk1977 static void pager_unlock(Pager *pPager){
987ae72d982Sdanielk1977   if( !pPager->exclusiveMode ){
9885f2d46b3Sdanielk1977     int rc;
989ae72d982Sdanielk1977 
99016e45a43Sdrh     /* Always close the journal file when dropping the database lock.
99116e45a43Sdrh     ** Otherwise, another connection with journal_mode=delete might
99216e45a43Sdrh     ** delete the file out from under us.
99316e45a43Sdrh     */
99416e45a43Sdrh     if( pPager->journalOpen ){
99516e45a43Sdrh       sqlite3OsClose(pPager->jfd);
99616e45a43Sdrh       pPager->journalOpen = 0;
99716e45a43Sdrh       sqlite3BitvecDestroy(pPager->pInJournal);
99816e45a43Sdrh       pPager->pInJournal = 0;
999a1fa00d9Sdanielk1977       sqlite3BitvecDestroy(pPager->pAlwaysRollback);
1000a1fa00d9Sdanielk1977       pPager->pAlwaysRollback = 0;
100116e45a43Sdrh     }
100216e45a43Sdrh 
10035f2d46b3Sdanielk1977     rc = osUnlock(pPager->fd, NO_LOCK);
10045f2d46b3Sdanielk1977     if( rc ) pPager->errCode = rc;
10055f2d46b3Sdanielk1977     pPager->dbSizeValid = 0;
10065f2d46b3Sdanielk1977     IOTRACE(("UNLOCK %p\n", pPager))
10075f2d46b3Sdanielk1977 
1008ae72d982Sdanielk1977     /* If Pager.errCode is set, the contents of the pager cache cannot be
1009ae72d982Sdanielk1977     ** trusted. Now that the pager file is unlocked, the contents of the
1010ae72d982Sdanielk1977     ** cache can be discarded and the error code safely cleared.
1011ae72d982Sdanielk1977     */
1012ae72d982Sdanielk1977     if( pPager->errCode ){
10131aa5af11Sdrh       if( rc==SQLITE_OK ) pPager->errCode = SQLITE_OK;
1014ae72d982Sdanielk1977       pager_reset(pPager);
1015fd7f0452Sdanielk1977       releaseAllSavepoint(pPager);
1016ae72d982Sdanielk1977       pPager->journalOff = 0;
1017ae72d982Sdanielk1977       pPager->journalStarted = 0;
10183460d19cSdanielk1977       pPager->dbOrigSize = 0;
1019ae72d982Sdanielk1977     }
1020ae72d982Sdanielk1977 
1021ae72d982Sdanielk1977     pPager->state = PAGER_UNLOCK;
1022ae72d982Sdanielk1977     pPager->changeCountDone = 0;
1023ae72d982Sdanielk1977   }
1024ae72d982Sdanielk1977 }
1025ae72d982Sdanielk1977 
1026ae72d982Sdanielk1977 /*
1027ae72d982Sdanielk1977 ** Execute a rollback if a transaction is active and unlock the
1028ae72d982Sdanielk1977 ** database file. If the pager has already entered the error state,
1029ae72d982Sdanielk1977 ** do not attempt the rollback.
1030ae72d982Sdanielk1977 */
1031ae72d982Sdanielk1977 static void pagerUnlockAndRollback(Pager *p){
1032ae72d982Sdanielk1977   if( p->errCode==SQLITE_OK && p->state>=PAGER_RESERVED ){
10332d1d86fbSdanielk1977     sqlite3BeginBenignMalloc();
1034ae72d982Sdanielk1977     sqlite3PagerRollback(p);
10352d1d86fbSdanielk1977     sqlite3EndBenignMalloc();
1036ae72d982Sdanielk1977   }
1037ae72d982Sdanielk1977   pager_unlock(p);
1038ae72d982Sdanielk1977 }
1039ae72d982Sdanielk1977 
1040ae72d982Sdanielk1977 /*
104180e35f46Sdrh ** This routine ends a transaction.  A transaction is ended by either
104280e35f46Sdrh ** a COMMIT or a ROLLBACK.
104380e35f46Sdrh **
1044ed7c855cSdrh ** When this routine is called, the pager has the journal file open and
104580e35f46Sdrh ** a RESERVED or EXCLUSIVE lock on the database.  This routine will release
104680e35f46Sdrh ** the database lock and acquires a SHARED lock in its place if that is
104780e35f46Sdrh ** the appropriate thing to do.  Release locks usually is appropriate,
104880e35f46Sdrh ** unless we are in exclusive access mode or unless this is a
104980e35f46Sdrh ** COMMIT AND BEGIN or ROLLBACK AND BEGIN operation.
105080e35f46Sdrh **
105180e35f46Sdrh ** The journal file is either deleted or truncated.
105250457896Sdrh **
105350457896Sdrh ** TODO: Consider keeping the journal file open for temporary databases.
105450457896Sdrh ** This might give a performance improvement on windows where opening
105550457896Sdrh ** a file is an expensive operation.
1056ed7c855cSdrh */
1057df2566a3Sdanielk1977 static int pager_end_transaction(Pager *pPager, int hasMaster){
105841483468Sdanielk1977   int rc = SQLITE_OK;
1059979f38e5Sdanielk1977   int rc2 = SQLITE_OK;
1060a6abd041Sdrh   if( pPager->state<PAGER_RESERVED ){
1061a6abd041Sdrh     return SQLITE_OK;
1062a6abd041Sdrh   }
1063fd7f0452Sdanielk1977   releaseAllSavepoint(pPager);
1064da47d774Sdrh   if( pPager->journalOpen ){
1065b3175389Sdanielk1977     if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY ){
1066b3175389Sdanielk1977       int isMemoryJournal = sqlite3IsMemJournal(pPager->jfd);
1067b3175389Sdanielk1977       sqlite3OsClose(pPager->jfd);
1068b3175389Sdanielk1977       pPager->journalOpen = 0;
1069b3175389Sdanielk1977       if( !isMemoryJournal ){
1070b3175389Sdanielk1977         rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0);
1071b3175389Sdanielk1977       }
1072b3175389Sdanielk1977     }else if( pPager->journalMode==PAGER_JOURNALMODE_TRUNCATE
107304335886Sdrh          && (rc = sqlite3OsTruncate(pPager->jfd, 0))==SQLITE_OK ){
107404335886Sdrh       pPager->journalOff = 0;
107504335886Sdrh       pPager->journalStarted = 0;
107604335886Sdrh     }else if( pPager->exclusiveMode
107793f7af97Sdanielk1977      || pPager->journalMode==PAGER_JOURNALMODE_PERSIST
107893f7af97Sdanielk1977     ){
107993f7af97Sdanielk1977       rc = zeroJournalHdr(pPager, hasMaster);
108093f7af97Sdanielk1977       pager_error(pPager, rc);
108141483468Sdanielk1977       pPager->journalOff = 0;
1082334cdb63Sdanielk1977       pPager->journalStarted = 0;
108341483468Sdanielk1977     }else{
108404335886Sdrh       assert( pPager->journalMode==PAGER_JOURNALMODE_DELETE || rc );
1085b4b47411Sdanielk1977       sqlite3OsClose(pPager->jfd);
10868cfbf08fSdrh       pPager->journalOpen = 0;
10870f01fdaeSdanielk1977       if( rc==SQLITE_OK && !pPager->tempFile ){
1088fee2d25aSdanielk1977         rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0);
108941483468Sdanielk1977       }
10907152de8dSdanielk1977     }
1091f5e7bb51Sdrh     sqlite3BitvecDestroy(pPager->pInJournal);
1092f5e7bb51Sdrh     pPager->pInJournal = 0;
1093a1fa00d9Sdanielk1977     sqlite3BitvecDestroy(pPager->pAlwaysRollback);
1094a1fa00d9Sdanielk1977     pPager->pAlwaysRollback = 0;
10953c407374Sdanielk1977 #ifdef SQLITE_CHECK_PAGES
1096bc2ca9ebSdanielk1977     sqlite3PcacheIterateDirty(pPager->pPCache, pager_set_pagehash);
10973c407374Sdanielk1977 #endif
1098bc2ca9ebSdanielk1977     sqlite3PcacheCleanAll(pPager->pPCache);
1099ef317ab5Sdanielk1977     pPager->dirtyCache = 0;
1100ef317ab5Sdanielk1977     pPager->nRec = 0;
1101da47d774Sdrh   }else{
1102f5e7bb51Sdrh     assert( pPager->pInJournal==0 );
1103da47d774Sdrh   }
1104979f38e5Sdanielk1977 
110541483468Sdanielk1977   if( !pPager->exclusiveMode ){
11067a2b1eebSdanielk1977     rc2 = osUnlock(pPager->fd, SHARED_LOCK);
1107a6abd041Sdrh     pPager->state = PAGER_SHARED;
1108*104f1fefSdanielk1977     pPager->changeCountDone = 0;
1109334cdb63Sdanielk1977   }else if( pPager->state==PAGER_SYNCED ){
1110334cdb63Sdanielk1977     pPager->state = PAGER_EXCLUSIVE;
1111334cdb63Sdanielk1977   }
11123460d19cSdanielk1977   pPager->dbOrigSize = 0;
11137657240aSdanielk1977   pPager->setMaster = 0;
1114c4da5b9fSdanielk1977   pPager->needSync = 0;
11158c0a791aSdanielk1977   /* lruListSetFirstSynced(pPager); */
1116f90b7260Sdanielk1977   sqlite3PcacheTruncate(pPager->pPCache, pPager->dbSize);
1117b3175389Sdanielk1977   if( !MEMDB ){
1118d92db531Sdanielk1977     pPager->dbSizeValid = 0;
1119b3175389Sdanielk1977   }
1120d138c016Sdrh   pPager->dbModified = 0;
1121979f38e5Sdanielk1977 
1122979f38e5Sdanielk1977   return (rc==SQLITE_OK?rc2:rc);
1123ed7c855cSdrh }
1124ed7c855cSdrh 
1125ed7c855cSdrh /*
1126968af52aSdrh ** Compute and return a checksum for the page of data.
112734e79ceeSdrh **
112834e79ceeSdrh ** This is not a real checksum.  It is really just the sum of the
1129726de599Sdrh ** random initial value and the page number.  We experimented with
1130726de599Sdrh ** a checksum of the entire data, but that was found to be too slow.
1131726de599Sdrh **
1132726de599Sdrh ** Note that the page number is stored at the beginning of data and
1133726de599Sdrh ** the checksum is stored at the end.  This is important.  If journal
1134726de599Sdrh ** corruption occurs due to a power failure, the most likely scenario
1135726de599Sdrh ** is that one end or the other of the record will be changed.  It is
1136726de599Sdrh ** much less likely that the two ends of the journal record will be
1137726de599Sdrh ** correct and the middle be corrupt.  Thus, this "checksum" scheme,
1138726de599Sdrh ** though fast and simple, catches the mostly likely kind of corruption.
1139726de599Sdrh **
1140726de599Sdrh ** FIX ME:  Consider adding every 200th (or so) byte of the data to the
1141726de599Sdrh ** checksum.  That way if a single page spans 3 or more disk sectors and
1142726de599Sdrh ** only the middle sector is corrupt, we will still have a reasonable
1143726de599Sdrh ** chance of failing the checksum and thus detecting the problem.
1144968af52aSdrh */
114574161705Sdrh static u32 pager_cksum(Pager *pPager, const u8 *aData){
1146ef317ab5Sdanielk1977   u32 cksum = pPager->cksumInit;
1147ef317ab5Sdanielk1977   int i = pPager->pageSize-200;
1148ef317ab5Sdanielk1977   while( i>0 ){
1149ef317ab5Sdanielk1977     cksum += aData[i];
1150ef317ab5Sdanielk1977     i -= 200;
1151ef317ab5Sdanielk1977   }
1152968af52aSdrh   return cksum;
1153968af52aSdrh }
1154968af52aSdrh 
1155968af52aSdrh /*
1156d6e5e098Sdrh ** Read a single page from either the journal file (if isMainJrnl==1) or
1157d6e5e098Sdrh ** from the sub-journal (if isMainJrnl==0) and playback that page.
1158d6e5e098Sdrh ** The page begins at offset *pOffset into the file.  The  *pOffset
1159d6e5e098Sdrh ** value is increased to the start of the next page in the journal.
1160968af52aSdrh **
1161c13148ffSdrh ** The isMainJrnl flag is true if this is the main rollback journal and
1162c13148ffSdrh ** false for the statement journal.  The main rollback journal uses
1163c13148ffSdrh ** checksums - the statement journal does not.
1164d6e5e098Sdrh **
1165d6e5e098Sdrh ** If pDone is not NULL, then it is a record of pages that have already
1166d6e5e098Sdrh ** been played back.  If the page at *pOffset has already been played back
1167d6e5e098Sdrh ** (if the corresponding pDone bit is set) then skip the playback.
1168d6e5e098Sdrh ** Make sure the pDone bit corresponding to the *pOffset page is set
1169d6e5e098Sdrh ** prior to returning.
1170fa86c412Sdrh */
117162079060Sdanielk1977 static int pager_playback_one_page(
1172c13148ffSdrh   Pager *pPager,                /* The pager being played back */
1173fd7f0452Sdanielk1977   int isMainJrnl,               /* 1 -> main journal. 0 -> sub-journal. */
1174d6e5e098Sdrh   i64 *pOffset,                 /* Offset of record to playback */
1175ecfef985Sdanielk1977   int isSavepnt,                /* True for a savepoint rollback */
1176fd7f0452Sdanielk1977   Bitvec *pDone                 /* Bitvec of pages already played back */
117762079060Sdanielk1977 ){
1178fa86c412Sdrh   int rc;
1179fa86c412Sdrh   PgHdr *pPg;                   /* An existing page in the cache */
1180ae2b40c4Sdrh   Pgno pgno;                    /* The page number of a page in journal */
1181ae2b40c4Sdrh   u32 cksum;                    /* Checksum used for sanity checking */
1182d6e5e098Sdrh   u8 *aData;                    /* Temporary storage for the page */
1183d6e5e098Sdrh   sqlite3_file *jfd;            /* The file descriptor for the journal file */
1184fa86c412Sdrh 
1185d6e5e098Sdrh   assert( (isMainJrnl&~1)==0 );      /* isMainJrnl is 0 or 1 */
1186d6e5e098Sdrh   assert( (isSavepnt&~1)==0 );       /* isSavepnt is 0 or 1 */
1187d6e5e098Sdrh   assert( isMainJrnl || pDone );     /* pDone always used on sub-journals */
1188d6e5e098Sdrh   assert( isSavepnt || pDone==0 );   /* pDone never used on non-savepoint */
11899636284eSdrh 
1190d6e5e098Sdrh   aData = (u8*)pPager->pTmpSpace;
1191d6e5e098Sdrh   assert( aData );         /* Temp storage must have already been allocated */
1192d6e5e098Sdrh 
1193d6e5e098Sdrh   jfd = isMainJrnl ? pPager->jfd : pPager->sjfd;
1194d6e5e098Sdrh 
1195d6e5e098Sdrh   rc = read32bits(jfd, *pOffset, &pgno);
119699ee3600Sdrh   if( rc!=SQLITE_OK ) return rc;
1197d6e5e098Sdrh   rc = sqlite3OsRead(jfd, aData, pPager->pageSize, (*pOffset)+4);
119899ee3600Sdrh   if( rc!=SQLITE_OK ) return rc;
1199d6e5e098Sdrh   *pOffset += pPager->pageSize + 4 + isMainJrnl*4;
1200fa86c412Sdrh 
1201968af52aSdrh   /* Sanity checking on the page.  This is more important that I originally
1202968af52aSdrh   ** thought.  If a power failure occurs while the journal is being written,
1203968af52aSdrh   ** it could cause invalid data to be written into the journal.  We need to
1204968af52aSdrh   ** detect this invalid data (with high probability) and ignore it.
1205968af52aSdrh   */
120675edc16fSdanielk1977   if( pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){
1207968af52aSdrh     return SQLITE_DONE;
1208968af52aSdrh   }
1209fd7f0452Sdanielk1977   if( pgno>(Pgno)pPager->dbSize || sqlite3BitvecTest(pDone, pgno) ){
1210968af52aSdrh     return SQLITE_OK;
1211968af52aSdrh   }
1212c13148ffSdrh   if( isMainJrnl ){
1213d6e5e098Sdrh     rc = read32bits(jfd, (*pOffset)-4, &cksum);
121499ee3600Sdrh     if( rc ) return rc;
1215ecfef985Sdanielk1977     if( !isSavepnt && pager_cksum(pPager, aData)!=cksum ){
1216968af52aSdrh       return SQLITE_DONE;
1217968af52aSdrh     }
1218968af52aSdrh   }
1219fd7f0452Sdanielk1977   if( pDone && (rc = sqlite3BitvecSet(pDone, pgno)) ){
1220fd7f0452Sdanielk1977     return rc;
1221fd7f0452Sdanielk1977   }
1222fa86c412Sdrh 
1223aa5ccdf5Sdanielk1977   assert( pPager->state==PAGER_RESERVED || pPager->state>=PAGER_EXCLUSIVE );
1224a3f3a5f3Sdanielk1977 
1225a3f3a5f3Sdanielk1977   /* If the pager is in RESERVED state, then there must be a copy of this
1226a3f3a5f3Sdanielk1977   ** page in the pager cache. In this case just update the pager cache,
12270de0bb33Sdanielk1977   ** not the database file. The page is left marked dirty in this case.
12280de0bb33Sdanielk1977   **
12292df71c74Sdanielk1977   ** An exception to the above rule: If the database is in no-sync mode
12302df71c74Sdanielk1977   ** and a page is moved during an incremental vacuum then the page may
1231369f3a05Sdanielk1977   ** not be in the pager cache. Later: if a malloc() or IO error occurs
1232369f3a05Sdanielk1977   ** during a Movepage() call, then the page may not be in the cache
1233369f3a05Sdanielk1977   ** either. So the condition described in the above paragraph is not
1234369f3a05Sdanielk1977   ** assert()able.
12352df71c74Sdanielk1977   **
1236a3f3a5f3Sdanielk1977   ** If in EXCLUSIVE state, then we update the pager cache if it exists
1237a3f3a5f3Sdanielk1977   ** and the main file. The page is then marked not dirty.
12389636284eSdrh   **
12399636284eSdrh   ** Ticket #1171:  The statement journal might contain page content that is
12409636284eSdrh   ** different from the page content at the start of the transaction.
12419636284eSdrh   ** This occurs when a page is changed prior to the start of a statement
12429636284eSdrh   ** then changed again within the statement.  When rolling back such a
12439636284eSdrh   ** statement we must not write to the original database unless we know
12445e385311Sdrh   ** for certain that original page contents are synced into the main rollback
12455e385311Sdrh   ** journal.  Otherwise, a power loss might leave modified data in the
12465e385311Sdrh   ** database file without an entry in the rollback journal that can
12475e385311Sdrh   ** restore the database to its original form.  Two conditions must be
12485e385311Sdrh   ** met before writing to the database files. (1) the database must be
12495e385311Sdrh   ** locked.  (2) we know that the original page content is fully synced
12505e385311Sdrh   ** in the main journal either because the page is not in cache or else
12515e385311Sdrh   ** the page is marked as needSync==0.
12524c02a235Sdrh   **
12534c02a235Sdrh   ** 2008-04-14:  When attempting to vacuum a corrupt database file, it
12544c02a235Sdrh   ** is possible to fail a statement on a database that does not yet exist.
12554c02a235Sdrh   ** Do not attempt to write if database file has never been opened.
1256fa86c412Sdrh   */
1257ae2b40c4Sdrh   pPg = pager_lookup(pPager, pgno);
125830d53701Sdrh   PAGERTRACE(("PLAYBACK %d page %d hash(%08x) %s\n",
1259ecfef985Sdanielk1977                PAGERID(pPager), pgno, pager_datahash(pPager->pageSize, aData),
1260ecfef985Sdanielk1977                (isMainJrnl?"main-journal":"sub-journal")
126130d53701Sdrh   ));
12628c0a791aSdanielk1977   if( (pPager->state>=PAGER_EXCLUSIVE)
12638c0a791aSdanielk1977    && (pPg==0 || 0==(pPg->flags&PGHDR_NEED_SYNC))
12648c0a791aSdanielk1977    && (pPager->fd->pMethods)
12658c0a791aSdanielk1977   ){
1266281b21daSdrh     i64 ofst = (pgno-1)*(i64)pPager->pageSize;
1267281b21daSdrh     rc = sqlite3OsWrite(pPager->fd, aData, pPager->pageSize, ofst);
12683460d19cSdanielk1977     if( pgno>pPager->dbFileSize ){
12693460d19cSdanielk1977       pPager->dbFileSize = pgno;
12703460d19cSdanielk1977     }
1271f2c31ad8Sdanielk1977   }else if( !isMainJrnl && pPg==0 ){
1272f2c31ad8Sdanielk1977     /* If this is a rollback of a savepoint and data was not written to
1273f2c31ad8Sdanielk1977     ** the database and the page is not in-memory, there is a potential
1274f2c31ad8Sdanielk1977     ** problem. When the page is next fetched by the b-tree layer, it
1275f2c31ad8Sdanielk1977     ** will be read from the database file, which may or may not be
1276f2c31ad8Sdanielk1977     ** current.
1277f2c31ad8Sdanielk1977     **
1278f2c31ad8Sdanielk1977     ** There are a couple of different ways this can happen. All are quite
1279401b65edSdanielk1977     ** obscure. When running in synchronous mode, this can only happen
1280f2c31ad8Sdanielk1977     ** if the page is on the free-list at the start of the transaction, then
1281f2c31ad8Sdanielk1977     ** populated, then moved using sqlite3PagerMovepage().
1282f2c31ad8Sdanielk1977     **
1283f2c31ad8Sdanielk1977     ** The solution is to add an in-memory page to the cache containing
1284f2c31ad8Sdanielk1977     ** the data just read from the sub-journal. Mark the page as dirty
1285f2c31ad8Sdanielk1977     ** and if the pager requires a journal-sync, then mark the page as
1286f2c31ad8Sdanielk1977     ** requiring a journal-sync before it is written.
1287f2c31ad8Sdanielk1977     */
1288f2c31ad8Sdanielk1977     assert( isSavepnt );
1289f2c31ad8Sdanielk1977     if( (rc = sqlite3PagerAcquire(pPager, pgno, &pPg, 1)) ){
1290f2c31ad8Sdanielk1977       return rc;
1291f2c31ad8Sdanielk1977     }
1292f2c31ad8Sdanielk1977     pPg->flags &= ~PGHDR_NEED_READ;
1293f2c31ad8Sdanielk1977     sqlite3PcacheMakeDirty(pPg);
1294a3f3a5f3Sdanielk1977   }
1295fa86c412Sdrh   if( pPg ){
12962812956bSdanielk1977     /* No page should ever be explicitly rolled back that is in use, except
12972812956bSdanielk1977     ** for page 1 which is held in use in order to keep the lock on the
12982812956bSdanielk1977     ** database active. However such a page may be rolled back as a result
12992812956bSdanielk1977     ** of an internal error resulting in an automatic call to
13003b8a05f6Sdanielk1977     ** sqlite3PagerRollback().
13013a84069dSdrh     */
1302b6f41486Sdrh     void *pData;
13038c0a791aSdanielk1977     pData = pPg->pData;
1304ae2b40c4Sdrh     memcpy(pData, aData, pPager->pageSize);
13059038bb64Sdanielk1977     if( pPager->xReiniter ){
1306eaa06f69Sdanielk1977       pPager->xReiniter(pPg);
1307de647130Sdrh     }
1308ecfef985Sdanielk1977     if( isMainJrnl && (!isSavepnt || pPager->journalOff<=pPager->journalHdr) ){
1309488af099Sdanielk1977       /* If the contents of this page were just restored from the main
1310488af099Sdanielk1977       ** journal file, then its content must be as they were when the
1311488af099Sdanielk1977       ** transaction was first opened. In this case we can mark the page
1312488af099Sdanielk1977       ** as clean, since there will be no need to write it out to the.
1313488af099Sdanielk1977       **
1314488af099Sdanielk1977       ** There is one exception to this rule. If the page is being rolled
1315488af099Sdanielk1977       ** back as part of a savepoint (or statement) rollback from an
1316488af099Sdanielk1977       ** unsynced portion of the main journal file, then it is not safe
1317488af099Sdanielk1977       ** to mark the page as clean. This is because marking the page as
1318488af099Sdanielk1977       ** clean will clear the PGHDR_NEED_SYNC flag. Since the page is
1319488af099Sdanielk1977       ** already in the journal file (recorded in Pager.pInJournal) and
1320488af099Sdanielk1977       ** the PGHDR_NEED_SYNC flag is cleared, if the page is written to
1321488af099Sdanielk1977       ** again within this transaction, it will be marked as dirty but
1322488af099Sdanielk1977       ** the PGHDR_NEED_SYNC flag will not be set. It could then potentially
1323488af099Sdanielk1977       ** be written out into the database file before its journal file
1324488af099Sdanielk1977       ** segment is synced. If a crash occurs during or following this,
1325488af099Sdanielk1977       ** database corruption may ensue.
1326488af099Sdanielk1977       */
1327c047b9f7Sdrh       sqlite3PcacheMakeClean(pPg);
1328c047b9f7Sdrh     }
13293c407374Sdanielk1977 #ifdef SQLITE_CHECK_PAGES
13303c407374Sdanielk1977     pPg->pageHash = pager_pagehash(pPg);
13313c407374Sdanielk1977 #endif
133286a88114Sdrh     /* If this was page 1, then restore the value of Pager.dbFileVers.
133386a88114Sdrh     ** Do this before any decoding. */
133441483468Sdanielk1977     if( pgno==1 ){
133586a88114Sdrh       memcpy(&pPager->dbFileVers, &((u8*)pData)[24],sizeof(pPager->dbFileVers));
133641483468Sdanielk1977     }
133786a88114Sdrh 
133886a88114Sdrh     /* Decode the page just read from disk */
133986a88114Sdrh     CODEC1(pPager, pData, pPg->pgno, 3);
13408c0a791aSdanielk1977     sqlite3PcacheRelease(pPg);
1341fa86c412Sdrh   }
1342fa86c412Sdrh   return rc;
1343fa86c412Sdrh }
1344fa86c412Sdrh 
1345ee03d629Sdrh #if !defined(NDEBUG) || defined(SQLITE_COVERAGE_TEST)
1346d6e5e098Sdrh /*
1347d6e5e098Sdrh ** This routine looks ahead into the main journal file and determines
1348d6e5e098Sdrh ** whether or not the next record (the record that begins at file
1349d6e5e098Sdrh ** offset pPager->journalOff) is a well-formed page record consisting
1350d6e5e098Sdrh ** of a valid page number, pPage->pageSize bytes of content, followed
1351d6e5e098Sdrh ** by a valid checksum.
1352d6e5e098Sdrh **
1353d6e5e098Sdrh ** The pager never needs to know this in order to do its job.   This
1354d6e5e098Sdrh ** routine is only used from with assert() and testcase() macros.
1355d6e5e098Sdrh */
1356d6e5e098Sdrh static int pagerNextJournalPageIsValid(Pager *pPager){
1357d6e5e098Sdrh   Pgno pgno;           /* The page number of the page */
1358d6e5e098Sdrh   u32 cksum;           /* The page checksum */
1359d6e5e098Sdrh   int rc;              /* Return code from read operations */
1360d6e5e098Sdrh   sqlite3_file *fd;    /* The file descriptor from which we are reading */
1361d6e5e098Sdrh   u8 *aData;           /* Content of the page */
1362d6e5e098Sdrh 
1363d6e5e098Sdrh   /* Read the page number header */
1364d6e5e098Sdrh   fd = pPager->jfd;
1365d6e5e098Sdrh   rc = read32bits(fd, pPager->journalOff, &pgno);
1366d6e5e098Sdrh   if( rc!=SQLITE_OK ){ return 0; }                                  /*NO_TEST*/
1367d6e5e098Sdrh   if( pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){ return 0; }         /*NO_TEST*/
1368d6e5e098Sdrh   if( pgno>(Pgno)pPager->dbSize ){ return 0; }                      /*NO_TEST*/
1369d6e5e098Sdrh 
1370d6e5e098Sdrh   /* Read the checksum */
1371d6e5e098Sdrh   rc = read32bits(fd, pPager->journalOff+pPager->pageSize+4, &cksum);
1372d6e5e098Sdrh   if( rc!=SQLITE_OK ){ return 0; }                                  /*NO_TEST*/
1373d6e5e098Sdrh 
1374d6e5e098Sdrh   /* Read the data and verify the checksum */
1375d6e5e098Sdrh   aData = (u8*)pPager->pTmpSpace;
1376d6e5e098Sdrh   rc = sqlite3OsRead(fd, aData, pPager->pageSize, pPager->journalOff+4);
1377d6e5e098Sdrh   if( rc!=SQLITE_OK ){ return 0; }                                  /*NO_TEST*/
1378d6e5e098Sdrh   if( pager_cksum(pPager, aData)!=cksum ){ return 0; }              /*NO_TEST*/
1379d6e5e098Sdrh 
1380d6e5e098Sdrh   /* Reach this point only if the page is valid */
1381d6e5e098Sdrh   return 1;
1382d6e5e098Sdrh }
1383d6e5e098Sdrh #endif /* !defined(NDEBUG) || defined(SQLITE_COVERAGE_TEST) */
1384d6e5e098Sdrh 
1385fa86c412Sdrh /*
138613adf8a0Sdanielk1977 ** Parameter zMaster is the name of a master journal file. A single journal
138713adf8a0Sdanielk1977 ** file that referred to the master journal file has just been rolled back.
138813adf8a0Sdanielk1977 ** This routine checks if it is possible to delete the master journal file,
138913adf8a0Sdanielk1977 ** and does so if it is.
1390726de599Sdrh **
139165839c6aSdanielk1977 ** Argument zMaster may point to Pager.pTmpSpace. So that buffer is not
139265839c6aSdanielk1977 ** available for use within this function.
139365839c6aSdanielk1977 **
139465839c6aSdanielk1977 **
1395726de599Sdrh ** The master journal file contains the names of all child journals.
1396726de599Sdrh ** To tell if a master journal can be deleted, check to each of the
1397726de599Sdrh ** children.  If all children are either missing or do not refer to
1398726de599Sdrh ** a different master journal, then this master journal can be deleted.
139913adf8a0Sdanielk1977 */
1400b4b47411Sdanielk1977 static int pager_delmaster(Pager *pPager, const char *zMaster){
1401b4b47411Sdanielk1977   sqlite3_vfs *pVfs = pPager->pVfs;
140213adf8a0Sdanielk1977   int rc;
140313adf8a0Sdanielk1977   int master_open = 0;
1404b4b47411Sdanielk1977   sqlite3_file *pMaster;
1405b4b47411Sdanielk1977   sqlite3_file *pJournal;
140613adf8a0Sdanielk1977   char *zMasterJournal = 0; /* Contents of master journal file */
1407eb206256Sdrh   i64 nMasterJournal;       /* Size of master journal file */
140813adf8a0Sdanielk1977 
140913adf8a0Sdanielk1977   /* Open the master journal file exclusively in case some other process
141013adf8a0Sdanielk1977   ** is running this routine also. Not that it makes too much difference.
141113adf8a0Sdanielk1977   */
1412e5ae5735Sdrh   pMaster = (sqlite3_file *)sqlite3Malloc(pVfs->szOsFile * 2);
1413fee2d25aSdanielk1977   pJournal = (sqlite3_file *)(((u8 *)pMaster) + pVfs->szOsFile);
1414b4b47411Sdanielk1977   if( !pMaster ){
1415b4b47411Sdanielk1977     rc = SQLITE_NOMEM;
1416b4b47411Sdanielk1977   }else{
1417fee2d25aSdanielk1977     int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MASTER_JOURNAL);
1418fee2d25aSdanielk1977     rc = sqlite3OsOpen(pVfs, zMaster, pMaster, flags, 0);
1419b4b47411Sdanielk1977   }
142013adf8a0Sdanielk1977   if( rc!=SQLITE_OK ) goto delmaster_out;
142113adf8a0Sdanielk1977   master_open = 1;
1422b4b47411Sdanielk1977 
1423b4b47411Sdanielk1977   rc = sqlite3OsFileSize(pMaster, &nMasterJournal);
142413adf8a0Sdanielk1977   if( rc!=SQLITE_OK ) goto delmaster_out;
142513adf8a0Sdanielk1977 
142613adf8a0Sdanielk1977   if( nMasterJournal>0 ){
14275865e3d5Sdanielk1977     char *zJournal;
14287657240aSdanielk1977     char *zMasterPtr = 0;
142965839c6aSdanielk1977     int nMasterPtr = pPager->pVfs->mxPathname+1;
14305865e3d5Sdanielk1977 
14315865e3d5Sdanielk1977     /* Load the entire master journal file into space obtained from
143217435752Sdrh     ** sqlite3_malloc() and pointed to by zMasterJournal.
14335865e3d5Sdanielk1977     */
14344f21c4afSdrh     zMasterJournal = (char *)sqlite3Malloc((int)nMasterJournal + nMasterPtr);
143513adf8a0Sdanielk1977     if( !zMasterJournal ){
143613adf8a0Sdanielk1977       rc = SQLITE_NOMEM;
143713adf8a0Sdanielk1977       goto delmaster_out;
143813adf8a0Sdanielk1977     }
143965839c6aSdanielk1977     zMasterPtr = &zMasterJournal[nMasterJournal];
14404f21c4afSdrh     rc = sqlite3OsRead(pMaster, zMasterJournal, (int)nMasterJournal, 0);
144113adf8a0Sdanielk1977     if( rc!=SQLITE_OK ) goto delmaster_out;
144213adf8a0Sdanielk1977 
14435865e3d5Sdanielk1977     zJournal = zMasterJournal;
14445865e3d5Sdanielk1977     while( (zJournal-zMasterJournal)<nMasterJournal ){
1445861f7456Sdanielk1977       int exists;
1446861f7456Sdanielk1977       rc = sqlite3OsAccess(pVfs, zJournal, SQLITE_ACCESS_EXISTS, &exists);
1447861f7456Sdanielk1977       if( rc!=SQLITE_OK ){
144819db9352Sdrh         goto delmaster_out;
144919db9352Sdrh       }
1450861f7456Sdanielk1977       if( exists ){
145113adf8a0Sdanielk1977         /* One of the journals pointed to by the master journal exists.
145213adf8a0Sdanielk1977         ** Open it and check if it points at the master journal. If
145313adf8a0Sdanielk1977         ** so, return without deleting the master journal file.
145413adf8a0Sdanielk1977         */
14553b7b78b3Sdrh         int c;
1456fee2d25aSdanielk1977         int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MAIN_JOURNAL);
1457fee2d25aSdanielk1977         rc = sqlite3OsOpen(pVfs, zJournal, pJournal, flags, 0);
145813adf8a0Sdanielk1977         if( rc!=SQLITE_OK ){
145913adf8a0Sdanielk1977           goto delmaster_out;
146013adf8a0Sdanielk1977         }
14619eed5057Sdanielk1977 
146265839c6aSdanielk1977         rc = readMasterJournal(pJournal, zMasterPtr, nMasterPtr);
1463b4b47411Sdanielk1977         sqlite3OsClose(pJournal);
14649eed5057Sdanielk1977         if( rc!=SQLITE_OK ){
14659eed5057Sdanielk1977           goto delmaster_out;
14669eed5057Sdanielk1977         }
146713adf8a0Sdanielk1977 
146865839c6aSdanielk1977         c = zMasterPtr[0]!=0 && strcmp(zMasterPtr, zMaster)==0;
14693b7b78b3Sdrh         if( c ){
147013adf8a0Sdanielk1977           /* We have a match. Do not delete the master journal file. */
147113adf8a0Sdanielk1977           goto delmaster_out;
147213adf8a0Sdanielk1977         }
147313adf8a0Sdanielk1977       }
1474ea678832Sdrh       zJournal += (sqlite3Strlen30(zJournal)+1);
147513adf8a0Sdanielk1977     }
147613adf8a0Sdanielk1977   }
147713adf8a0Sdanielk1977 
1478fee2d25aSdanielk1977   rc = sqlite3OsDelete(pVfs, zMaster, 0);
147913adf8a0Sdanielk1977 
148013adf8a0Sdanielk1977 delmaster_out:
148113adf8a0Sdanielk1977   if( zMasterJournal ){
148217435752Sdrh     sqlite3_free(zMasterJournal);
148313adf8a0Sdanielk1977   }
148413adf8a0Sdanielk1977   if( master_open ){
1485b4b47411Sdanielk1977     sqlite3OsClose(pMaster);
148613adf8a0Sdanielk1977   }
1487b4b47411Sdanielk1977   sqlite3_free(pMaster);
148813adf8a0Sdanielk1977   return rc;
148913adf8a0Sdanielk1977 }
149013adf8a0Sdanielk1977 
1491a6abd041Sdrh 
1492a6abd041Sdrh /*
1493f90b7260Sdanielk1977 ** If the main database file is open and an exclusive lock is held,
1494f90b7260Sdanielk1977 ** truncate the main file of the given pager to the specified number
1495f90b7260Sdanielk1977 ** of pages.
14967fe3f7e9Sdrh **
1497f90b7260Sdanielk1977 ** It might might be the case that the file on disk is smaller than nPage.
14987fe3f7e9Sdrh ** This can happen, for example, if we are in the middle of a transaction
14997fe3f7e9Sdrh ** which has extended the file size and the new pages are still all held
15007fe3f7e9Sdrh ** in cache, then an INSERT or UPDATE does a statement rollback.  Some
15017fe3f7e9Sdrh ** operating system implementations can get confused if you try to
15027fe3f7e9Sdrh ** truncate a file to some size that is larger than it currently is,
150306e11af9Sdanielk1977 ** so detect this case and write a single zero byte to the end of the new
150406e11af9Sdanielk1977 ** file instead.
1505cb4c40baSdrh */
1506d92db531Sdanielk1977 static int pager_truncate(Pager *pPager, Pgno nPage){
1507e180dd93Sdanielk1977   int rc = SQLITE_OK;
15087a2b1eebSdanielk1977   if( pPager->state>=PAGER_EXCLUSIVE && pPager->fd->pMethods ){
15097fe3f7e9Sdrh     i64 currentSize, newSize;
15107fe3f7e9Sdrh     rc = sqlite3OsFileSize(pPager->fd, &currentSize);
15117fe3f7e9Sdrh     newSize = pPager->pageSize*(i64)nPage;
151206e11af9Sdanielk1977     if( rc==SQLITE_OK && currentSize!=newSize ){
151306e11af9Sdanielk1977       if( currentSize>newSize ){
15147fe3f7e9Sdrh         rc = sqlite3OsTruncate(pPager->fd, newSize);
151506e11af9Sdanielk1977       }else{
151606e11af9Sdanielk1977         rc = sqlite3OsWrite(pPager->fd, "", 1, newSize-1);
151706e11af9Sdanielk1977       }
15183460d19cSdanielk1977       if( rc==SQLITE_OK ){
15193460d19cSdanielk1977         pPager->dbFileSize = nPage;
15203460d19cSdanielk1977       }
15217fe3f7e9Sdrh     }
1522e180dd93Sdanielk1977   }
1523e180dd93Sdanielk1977   return rc;
1524cb4c40baSdrh }
1525cb4c40baSdrh 
1526cb4c40baSdrh /*
1527c80f058dSdrh ** Set the sectorSize for the given pager.
1528c80f058dSdrh **
1529334c80d6Sdrh ** The sector size is at least as big as the sector size reported
1530334c80d6Sdrh ** by sqlite3OsSectorSize(). The minimum sector size is 512.
1531c80f058dSdrh */
1532c80f058dSdrh static void setSectorSize(Pager *pPager){
15337a2b1eebSdanielk1977   assert(pPager->fd->pMethods||pPager->tempFile);
15347a2b1eebSdanielk1977   if( !pPager->tempFile ){
15357a2b1eebSdanielk1977     /* Sector size doesn't matter for temporary files. Also, the file
15367a2b1eebSdanielk1977     ** may not have been opened yet, in whcih case the OsSectorSize()
15377a2b1eebSdanielk1977     ** call will segfault.
15387a2b1eebSdanielk1977     */
1539c80f058dSdrh     pPager->sectorSize = sqlite3OsSectorSize(pPager->fd);
15407a2b1eebSdanielk1977   }
1541334c80d6Sdrh   if( pPager->sectorSize<512 ){
1542334c80d6Sdrh     pPager->sectorSize = 512;
1543c80f058dSdrh   }
15447cbd589dSdanielk1977   if( pPager->sectorSize>MAX_SECTOR_SIZE ){
15457cbd589dSdanielk1977     pPager->sectorSize = MAX_SECTOR_SIZE;
15467cbd589dSdanielk1977   }
1547c80f058dSdrh }
1548c80f058dSdrh 
1549c80f058dSdrh /*
1550ed7c855cSdrh ** Playback the journal and thus restore the database file to
1551ed7c855cSdrh ** the state it was in before we started making changes.
1552ed7c855cSdrh **
155334e79ceeSdrh ** The journal file format is as follows:
155434e79ceeSdrh **
1555ae2b40c4Sdrh **  (1)  8 byte prefix.  A copy of aJournalMagic[].
1556ae2b40c4Sdrh **  (2)  4 byte big-endian integer which is the number of valid page records
155734e79ceeSdrh **       in the journal.  If this value is 0xffffffff, then compute the
1558ae2b40c4Sdrh **       number of page records from the journal size.
1559ae2b40c4Sdrh **  (3)  4 byte big-endian integer which is the initial value for the
1560ae2b40c4Sdrh **       sanity checksum.
1561ae2b40c4Sdrh **  (4)  4 byte integer which is the number of pages to truncate the
156234e79ceeSdrh **       database to during a rollback.
1563334c80d6Sdrh **  (5)  4 byte big-endian integer which is the sector size.  The header
1564334c80d6Sdrh **       is this many bytes in size.
1565334c80d6Sdrh **  (6)  4 byte big-endian integer which is the page size.
1566334c80d6Sdrh **  (7)  4 byte integer which is the number of bytes in the master journal
1567ae2b40c4Sdrh **       name.  The value may be zero (indicate that there is no master
1568ae2b40c4Sdrh **       journal.)
1569334c80d6Sdrh **  (8)  N bytes of the master journal name.  The name will be nul-terminated
1570ae2b40c4Sdrh **       and might be shorter than the value read from (7).  If the first byte
1571ae2b40c4Sdrh **       of the name is \000 then there is no master journal.  The master
1572ae2b40c4Sdrh **       journal name is stored in UTF-8.
1573334c80d6Sdrh **  (9)  Zero or more pages instances, each as follows:
157434e79ceeSdrh **        +  4 byte page number.
1575ae2b40c4Sdrh **        +  pPager->pageSize bytes of data.
1576ae2b40c4Sdrh **        +  4 byte checksum
157734e79ceeSdrh **
1578334c80d6Sdrh ** When we speak of the journal header, we mean the first 8 items above.
1579334c80d6Sdrh ** Each entry in the journal is an instance of the 9th item.
158034e79ceeSdrh **
158134e79ceeSdrh ** Call the value from the second bullet "nRec".  nRec is the number of
158234e79ceeSdrh ** valid page entries in the journal.  In most cases, you can compute the
158334e79ceeSdrh ** value of nRec from the size of the journal file.  But if a power
158434e79ceeSdrh ** failure occurred while the journal was being written, it could be the
158534e79ceeSdrh ** case that the size of the journal file had already been increased but
158634e79ceeSdrh ** the extra entries had not yet made it safely to disk.  In such a case,
158734e79ceeSdrh ** the value of nRec computed from the file size would be too large.  For
158834e79ceeSdrh ** that reason, we always use the nRec value in the header.
158934e79ceeSdrh **
159034e79ceeSdrh ** If the nRec value is 0xffffffff it means that nRec should be computed
159134e79ceeSdrh ** from the file size.  This value is used when the user selects the
159234e79ceeSdrh ** no-sync option for the journal.  A power failure could lead to corruption
159334e79ceeSdrh ** in this case.  But for things like temporary table (which will be
159434e79ceeSdrh ** deleted when the power is restored) we don't care.
159534e79ceeSdrh **
1596d9b0257aSdrh ** If the file opened as the journal file is not a well-formed
1597ece80f1eSdanielk1977 ** journal file then all pages up to the first corrupted page are rolled
1598ece80f1eSdanielk1977 ** back (or no pages if the journal header is corrupted). The journal file
1599ece80f1eSdanielk1977 ** is then deleted and SQLITE_OK returned, just as if no corruption had
1600ece80f1eSdanielk1977 ** been encountered.
1601ece80f1eSdanielk1977 **
1602ece80f1eSdanielk1977 ** If an I/O or malloc() error occurs, the journal-file is not deleted
1603ece80f1eSdanielk1977 ** and an error code is returned.
1604ed7c855cSdrh */
1605e277be05Sdanielk1977 static int pager_playback(Pager *pPager, int isHot){
1606b4b47411Sdanielk1977   sqlite3_vfs *pVfs = pPager->pVfs;
1607eb206256Sdrh   i64 szJ;                 /* Size of the journal file in bytes */
1608c3e8f5efSdanielk1977   u32 nRec;                /* Number of Records in the journal */
16090b8d2766Sshane   u32 u;                   /* Unsigned loop counter */
1610ed7c855cSdrh   Pgno mxPg = 0;           /* Size of the original file in pages */
1611ae2b40c4Sdrh   int rc;                  /* Result code of a subroutine */
1612861f7456Sdanielk1977   int res = 1;             /* Value returned by sqlite3OsAccess() */
161313adf8a0Sdanielk1977   char *zMaster = 0;       /* Name of master journal file if any */
1614ed7c855cSdrh 
1615c3a64ba0Sdrh   /* Figure out how many records are in the journal.  Abort early if
1616c3a64ba0Sdrh   ** the journal is empty.
1617ed7c855cSdrh   */
16188cfbf08fSdrh   assert( pPager->journalOpen );
1619054889ecSdrh   rc = sqlite3OsFileSize(pPager->jfd, &szJ);
1620334cdb63Sdanielk1977   if( rc!=SQLITE_OK || szJ==0 ){
1621c3a64ba0Sdrh     goto end_playback;
1622c3a64ba0Sdrh   }
1623240c5795Sdrh 
16247657240aSdanielk1977   /* Read the master journal name from the journal, if it is present.
16257657240aSdanielk1977   ** If a master journal file name is specified, but the file is not
16267657240aSdanielk1977   ** present on disk, then the journal is not hot and does not need to be
16277657240aSdanielk1977   ** played back.
1628240c5795Sdrh   */
162965839c6aSdanielk1977   zMaster = pPager->pTmpSpace;
163065839c6aSdanielk1977   rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);
1631861f7456Sdanielk1977   if( rc==SQLITE_OK && zMaster[0] ){
1632861f7456Sdanielk1977     rc = sqlite3OsAccess(pVfs, zMaster, SQLITE_ACCESS_EXISTS, &res);
16337657240aSdanielk1977   }
163465839c6aSdanielk1977   zMaster = 0;
1635861f7456Sdanielk1977   if( rc!=SQLITE_OK || !res ){
1636ce98bba2Sdanielk1977     goto end_playback;
1637ce98bba2Sdanielk1977   }
1638ce98bba2Sdanielk1977   pPager->journalOff = 0;
16397657240aSdanielk1977 
16407657240aSdanielk1977   /* This loop terminates either when the readJournalHdr() call returns
16417657240aSdanielk1977   ** SQLITE_DONE or an IO error occurs. */
16427657240aSdanielk1977   while( 1 ){
16437657240aSdanielk1977 
16447657240aSdanielk1977     /* Read the next journal header from the journal file.  If there are
16457657240aSdanielk1977     ** not enough bytes left in the journal file for a complete header, or
16467657240aSdanielk1977     ** it is corrupted, then a process must of failed while writing it.
16477657240aSdanielk1977     ** This indicates nothing more needs to be rolled back.
16487657240aSdanielk1977     */
16497657240aSdanielk1977     rc = readJournalHdr(pPager, szJ, &nRec, &mxPg);
16507657240aSdanielk1977     if( rc!=SQLITE_OK ){
16517657240aSdanielk1977       if( rc==SQLITE_DONE ){
16527657240aSdanielk1977         rc = SQLITE_OK;
16537657240aSdanielk1977       }
1654c3a64ba0Sdrh       goto end_playback;
1655c3a64ba0Sdrh     }
1656c3a64ba0Sdrh 
16577657240aSdanielk1977     /* If nRec is 0xffffffff, then this journal was created by a process
16587657240aSdanielk1977     ** working in no-sync mode. This means that the rest of the journal
16597657240aSdanielk1977     ** file consists of pages, there are no more journal headers. Compute
16607657240aSdanielk1977     ** the value of nRec based on this assumption.
16617657240aSdanielk1977     */
16627657240aSdanielk1977     if( nRec==0xffffffff ){
16637657240aSdanielk1977       assert( pPager->journalOff==JOURNAL_HDR_SZ(pPager) );
16644f21c4afSdrh       nRec = (int)((szJ - JOURNAL_HDR_SZ(pPager))/JOURNAL_PG_SZ(pPager));
166513adf8a0Sdanielk1977     }
166613adf8a0Sdanielk1977 
1667e277be05Sdanielk1977     /* If nRec is 0 and this rollback is of a transaction created by this
16688940f4eeSdrh     ** process and if this is the final header in the journal, then it means
16698940f4eeSdrh     ** that this part of the journal was being filled but has not yet been
16708940f4eeSdrh     ** synced to disk.  Compute the number of pages based on the remaining
16718940f4eeSdrh     ** size of the file.
16728940f4eeSdrh     **
16738940f4eeSdrh     ** The third term of the test was added to fix ticket #2565.
1674d6e5e098Sdrh     ** When rolling back a hot journal, nRec==0 always means that the next
1675d6e5e098Sdrh     ** chunk of the journal contains zero pages to be rolled back.  But
1676d6e5e098Sdrh     ** when doing a ROLLBACK and the nRec==0 chunk is the last chunk in
1677d6e5e098Sdrh     ** the journal, it means that the journal might contain additional
1678d6e5e098Sdrh     ** pages that need to be rolled back and that the number of pages
1679d6e5e098Sdrh     ** should be computed based on the journal file size.
1680e277be05Sdanielk1977     */
16814fd18c4bSdrh     testcase( nRec==0 && !isHot
1682d6e5e098Sdrh          && pPager->journalHdr+JOURNAL_HDR_SZ(pPager)!=pPager->journalOff
1683d6e5e098Sdrh          && ((szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager))>0
16844fd18c4bSdrh          && pagerNextJournalPageIsValid(pPager)
1685d6e5e098Sdrh     );
16868940f4eeSdrh     if( nRec==0 && !isHot &&
16878940f4eeSdrh         pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff ){
16884f21c4afSdrh       nRec = (int)((szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager));
1689e277be05Sdanielk1977     }
1690e277be05Sdanielk1977 
16917657240aSdanielk1977     /* If this is the first header read from the journal, truncate the
169285b623f2Sdrh     ** database file back to its original size.
16937657240aSdanielk1977     */
1694e180dd93Sdanielk1977     if( pPager->journalOff==JOURNAL_HDR_SZ(pPager) ){
1695cb4c40baSdrh       rc = pager_truncate(pPager, mxPg);
169681a20f21Sdrh       if( rc!=SQLITE_OK ){
169781a20f21Sdrh         goto end_playback;
169881a20f21Sdrh       }
1699f90b7260Sdanielk1977       pPager->dbSize = mxPg;
17007657240aSdanielk1977     }
17017657240aSdanielk1977 
1702fa86c412Sdrh     /* Copy original pages out of the journal and back into the database file.
1703ed7c855cSdrh     */
17040b8d2766Sshane     for(u=0; u<nRec; u++){
1705d6e5e098Sdrh       rc = pager_playback_one_page(pPager, 1, &pPager->journalOff, 0, 0);
1706968af52aSdrh       if( rc!=SQLITE_OK ){
1707968af52aSdrh         if( rc==SQLITE_DONE ){
1708968af52aSdrh           rc = SQLITE_OK;
17097657240aSdanielk1977           pPager->journalOff = szJ;
1710968af52aSdrh           break;
17117657240aSdanielk1977         }else{
1712a9625eaeSdrh           /* If we are unable to rollback, then the database is probably
1713a9625eaeSdrh           ** going to end up being corrupt.  It is corrupt to us, anyhow.
1714a9625eaeSdrh           ** Perhaps the next process to come along can fix it....
1715a9625eaeSdrh           */
171698c21903Sdanielk1977           rc = SQLITE_CORRUPT_BKPT;
17177657240aSdanielk1977           goto end_playback;
17187657240aSdanielk1977         }
17197657240aSdanielk1977       }
1720968af52aSdrh     }
1721d9b0257aSdrh   }
1722580eeaf3Sdrh   /*NOTREACHED*/
1723580eeaf3Sdrh   assert( 0 );
17244a0681efSdrh 
17254a0681efSdrh end_playback:
17268191bff0Sdanielk1977   if( rc==SQLITE_OK ){
172765839c6aSdanielk1977     zMaster = pPager->pTmpSpace;
172865839c6aSdanielk1977     rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);
172965839c6aSdanielk1977   }
173065839c6aSdanielk1977   if( rc==SQLITE_OK ){
1731df2566a3Sdanielk1977     rc = pager_end_transaction(pPager, zMaster[0]!='\0');
17328191bff0Sdanielk1977   }
1733c56774e2Sdanielk1977   if( rc==SQLITE_OK && zMaster[0] && res ){
1734979f38e5Sdanielk1977     /* If there was a master journal and this routine will return success,
173532554c10Sdanielk1977     ** see if it is possible to delete the master journal.
173613adf8a0Sdanielk1977     */
1737b4b47411Sdanielk1977     rc = pager_delmaster(pPager, zMaster);
173813adf8a0Sdanielk1977   }
17397657240aSdanielk1977 
17407657240aSdanielk1977   /* The Pager.sectorSize variable may have been updated while rolling
17413ceeb756Sdrh   ** back a journal created by a process with a different sector size
17427657240aSdanielk1977   ** value. Reset it to the correct value for this process.
17437657240aSdanielk1977   */
1744c80f058dSdrh   setSectorSize(pPager);
1745d9b0257aSdrh   return rc;
1746ed7c855cSdrh }
1747ed7c855cSdrh 
1748ed7c855cSdrh /*
1749d6e5e098Sdrh ** Playback savepoint pSavepoint.  Or, if pSavepoint==NULL, then playback
1750d6e5e098Sdrh ** the entire master journal file.
1751d6e5e098Sdrh **
1752d6e5e098Sdrh ** The case pSavepoint==NULL occurs when a ROLLBACK TO command is invoked
1753d6e5e098Sdrh ** on a SAVEPOINT that is a transaction savepoint.
1754fa86c412Sdrh */
1755fd7f0452Sdanielk1977 static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){
1756d6e5e098Sdrh   i64 szJ;                 /* Effective size of the main journal */
1757fd7f0452Sdanielk1977   i64 iHdrOff;             /* End of first segment of main-journal records */
1758fd7f0452Sdanielk1977   Pgno ii;                 /* Loop counter */
1759f2c31ad8Sdanielk1977   int rc = SQLITE_OK;      /* Return code */
1760fd7f0452Sdanielk1977   Bitvec *pDone = 0;       /* Bitvec to ensure pages played back only once */
1761fa86c412Sdrh 
1762fd7f0452Sdanielk1977   /* Allocate a bitvec to use to store the set of pages rolled back */
1763fd7f0452Sdanielk1977   if( pSavepoint ){
1764fd7f0452Sdanielk1977     pDone = sqlite3BitvecCreate(pSavepoint->nOrig);
1765fd7f0452Sdanielk1977     if( !pDone ){
1766fd7f0452Sdanielk1977       return SQLITE_NOMEM;
1767fd7f0452Sdanielk1977     }
17687657240aSdanielk1977   }
17697657240aSdanielk1977 
1770fd7f0452Sdanielk1977   /* Truncate the database back to the size it was before the
1771fd7f0452Sdanielk1977   ** savepoint being reverted was opened.
1772fa86c412Sdrh   */
1773f2c31ad8Sdanielk1977   pPager->dbSize = pSavepoint ? pSavepoint->nOrig : pPager->dbOrigSize;
17741aa2d8b5Sdrh   assert( pPager->state>=PAGER_SHARED );
1775fa86c412Sdrh 
1776d6e5e098Sdrh   /* Use pPager->journalOff as the effective size of the main rollback
1777d6e5e098Sdrh   ** journal.  The actual file might be larger than this in
1778d6e5e098Sdrh   ** PAGER_JOURNALMODE_TRUNCATE or PAGER_JOURNALMODE_PERSIST.  But anything
1779d6e5e098Sdrh   ** past pPager->journalOff is off-limits to us.
1780fa86c412Sdrh   */
1781fd7f0452Sdanielk1977   szJ = pPager->journalOff;
1782d6e5e098Sdrh 
1783d6e5e098Sdrh   /* Begin by rolling back records from the main journal starting at
1784d6e5e098Sdrh   ** PagerSavepoint.iOffset and continuing to the next journal header.
1785d6e5e098Sdrh   ** There might be records in the main journal that have a page number
1786d6e5e098Sdrh   ** greater than the current database size (pPager->dbSize) but those
1787d6e5e098Sdrh   ** will be skipped automatically.  Pages are added to pDone as they
1788d6e5e098Sdrh   ** are played back.
1789d6e5e098Sdrh   */
1790fd7f0452Sdanielk1977   if( pSavepoint ){
1791fd7f0452Sdanielk1977     iHdrOff = pSavepoint->iHdrOffset ? pSavepoint->iHdrOffset : szJ;
1792fd7f0452Sdanielk1977     pPager->journalOff = pSavepoint->iOffset;
1793fd7f0452Sdanielk1977     while( rc==SQLITE_OK && pPager->journalOff<iHdrOff ){
1794d6e5e098Sdrh       rc = pager_playback_one_page(pPager, 1, &pPager->journalOff, 1, pDone);
1795968af52aSdrh       assert( rc!=SQLITE_DONE );
1796fa86c412Sdrh     }
1797fd7f0452Sdanielk1977   }else{
1798fd7f0452Sdanielk1977     pPager->journalOff = 0;
17997657240aSdanielk1977   }
1800d6e5e098Sdrh 
1801d6e5e098Sdrh   /* Continue rolling back records out of the main journal starting at
1802d6e5e098Sdrh   ** the first journal header seen and continuing until the effective end
1803d6e5e098Sdrh   ** of the main journal file.  Continue to skip out-of-range pages and
1804d6e5e098Sdrh   ** continue adding pages rolled back to pDone.
1805d6e5e098Sdrh   */
1806fd7f0452Sdanielk1977   while( rc==SQLITE_OK && pPager->journalOff<szJ ){
1807c81806f3Sdanielk1977     u32 nJRec = 0;     /* Number of Journal Records */
18087657240aSdanielk1977     u32 dummy;
1809f0113000Sdanielk1977     rc = readJournalHdr(pPager, szJ, &nJRec, &dummy);
1810968af52aSdrh     assert( rc!=SQLITE_DONE );
1811d6e5e098Sdrh 
1812d6e5e098Sdrh     /*
1813d6e5e098Sdrh     ** The "pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff"
1814d6e5e098Sdrh     ** test is related to ticket #2565.  See the discussion in the
1815d6e5e098Sdrh     ** pager_playback() function for additional information.
1816d6e5e098Sdrh     */
1817ee03d629Sdrh     assert( !(nJRec==0
1818d6e5e098Sdrh          && pPager->journalHdr+JOURNAL_HDR_SZ(pPager)!=pPager->journalOff
1819d6e5e098Sdrh          && ((szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager))>0
1820ee03d629Sdrh          && pagerNextJournalPageIsValid(pPager))
1821d6e5e098Sdrh     );
1822d6e5e098Sdrh     if( nJRec==0
1823d6e5e098Sdrh      && pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff
1824d6e5e098Sdrh     ){
1825d6e5e098Sdrh       nJRec = (szJ - pPager->journalOff)/JOURNAL_PG_SZ(pPager);
182675edc16fSdanielk1977     }
182712dd5496Sdanielk1977     for(ii=0; rc==SQLITE_OK && ii<nJRec && pPager->journalOff<szJ; ii++){
1828d6e5e098Sdrh       rc = pager_playback_one_page(pPager, 1, &pPager->journalOff, 1, pDone);
18297657240aSdanielk1977       assert( rc!=SQLITE_DONE );
1830fd7f0452Sdanielk1977     }
1831fd7f0452Sdanielk1977   }
1832fd7f0452Sdanielk1977   assert( rc!=SQLITE_OK || pPager->journalOff==szJ );
1833fd7f0452Sdanielk1977 
1834d6e5e098Sdrh   /* Finally,  rollback pages from the sub-journal.  Page that were
1835d6e5e098Sdrh   ** previously rolled back out of the main journal (and are hence in pDone)
1836d6e5e098Sdrh   ** will be skipped.  Out-of-range pages are also skipped.
1837d6e5e098Sdrh   */
1838fd7f0452Sdanielk1977   if( pSavepoint ){
1839d6e5e098Sdrh     i64 offset = pSavepoint->iSubRec*(4+pPager->pageSize);
184049b9d338Sdrh     for(ii=pSavepoint->iSubRec; rc==SQLITE_OK&&ii<(u32)pPager->stmtNRec; ii++){
1841d6e5e098Sdrh       assert( offset == ii*(4+pPager->pageSize) );
1842d6e5e098Sdrh       rc = pager_playback_one_page(pPager, 0, &offset, 1, pDone);
1843fd7f0452Sdanielk1977       assert( rc!=SQLITE_DONE );
1844968af52aSdrh     }
18457657240aSdanielk1977   }
18467657240aSdanielk1977 
1847fd7f0452Sdanielk1977   sqlite3BitvecDestroy(pDone);
18488a7aea3bSdanielk1977   if( rc==SQLITE_OK ){
184975edc16fSdanielk1977     pPager->journalOff = szJ;
1850fa86c412Sdrh   }
1851fa86c412Sdrh   return rc;
1852fa86c412Sdrh }
1853fa86c412Sdrh 
1854fa86c412Sdrh /*
1855f57b14a6Sdrh ** Change the maximum number of in-memory pages that are allowed.
1856f57b14a6Sdrh */
18573b8a05f6Sdanielk1977 void sqlite3PagerSetCachesize(Pager *pPager, int mxPage){
18588c0a791aSdanielk1977   sqlite3PcacheSetCachesize(pPager->pPCache, mxPage);
1859f57b14a6Sdrh }
1860f57b14a6Sdrh 
1861f57b14a6Sdrh /*
1862973b6e33Sdrh ** Adjust the robustness of the database to damage due to OS crashes
1863973b6e33Sdrh ** or power failures by changing the number of syncs()s when writing
1864973b6e33Sdrh ** the rollback journal.  There are three levels:
1865973b6e33Sdrh **
1866054889ecSdrh **    OFF       sqlite3OsSync() is never called.  This is the default
1867973b6e33Sdrh **              for temporary and transient files.
1868973b6e33Sdrh **
1869973b6e33Sdrh **    NORMAL    The journal is synced once before writes begin on the
1870973b6e33Sdrh **              database.  This is normally adequate protection, but
1871973b6e33Sdrh **              it is theoretically possible, though very unlikely,
1872973b6e33Sdrh **              that an inopertune power failure could leave the journal
1873973b6e33Sdrh **              in a state which would cause damage to the database
1874973b6e33Sdrh **              when it is rolled back.
1875973b6e33Sdrh **
1876973b6e33Sdrh **    FULL      The journal is synced twice before writes begin on the
187734e79ceeSdrh **              database (with some additional information - the nRec field
187834e79ceeSdrh **              of the journal header - being written in between the two
187934e79ceeSdrh **              syncs).  If we assume that writing a
1880973b6e33Sdrh **              single disk sector is atomic, then this mode provides
1881973b6e33Sdrh **              assurance that the journal will not be corrupted to the
1882973b6e33Sdrh **              point of causing damage to the database during rollback.
1883973b6e33Sdrh **
1884973b6e33Sdrh ** Numeric values associated with these states are OFF==1, NORMAL=2,
1885973b6e33Sdrh ** and FULL=3.
1886973b6e33Sdrh */
188793758c8dSdanielk1977 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
1888281b21daSdrh void sqlite3PagerSetSafetyLevel(Pager *pPager, int level, int bFullFsync){
18894f21c4afSdrh   pPager->noSync =  (level==1 || pPager->tempFile) ?1:0;
18904f21c4afSdrh   pPager->fullSync = (level==3 && !pPager->tempFile) ?1:0;
1891281b21daSdrh   pPager->sync_flags = (bFullFsync?SQLITE_SYNC_FULL:SQLITE_SYNC_NORMAL);
18921d850a72Sdanielk1977   if( pPager->noSync ) pPager->needSync = 0;
1893973b6e33Sdrh }
189493758c8dSdanielk1977 #endif
1895973b6e33Sdrh 
1896973b6e33Sdrh /*
1897af6df11fSdrh ** The following global variable is incremented whenever the library
1898af6df11fSdrh ** attempts to open a temporary file.  This information is used for
1899af6df11fSdrh ** testing and analysis only.
1900af6df11fSdrh */
19010f7eb611Sdrh #ifdef SQLITE_TEST
1902af6df11fSdrh int sqlite3_opentemp_count = 0;
19030f7eb611Sdrh #endif
1904af6df11fSdrh 
1905af6df11fSdrh /*
19063f56e6ebSdrh ** Open a temporary file.
19073f56e6ebSdrh **
19083f56e6ebSdrh ** Write the file descriptor into *fd.  Return SQLITE_OK on success or some
1909fee2d25aSdanielk1977 ** other error code if we fail. The OS will automatically delete the temporary
1910fee2d25aSdanielk1977 ** file when it is closed.
1911fa86c412Sdrh */
1912b4b47411Sdanielk1977 static int sqlite3PagerOpentemp(
191317b90b53Sdanielk1977   Pager *pPager,        /* The pager object */
191433f4e02aSdrh   sqlite3_file *pFile,  /* Write the file descriptor here */
191533f4e02aSdrh   int vfsFlags          /* Flags passed through to the VFS */
1916b4b47411Sdanielk1977 ){
1917fa86c412Sdrh   int rc;
19183f56e6ebSdrh 
19190f7eb611Sdrh #ifdef SQLITE_TEST
1920af6df11fSdrh   sqlite3_opentemp_count++;  /* Used for testing and analysis only */
19210f7eb611Sdrh #endif
1922b4b47411Sdanielk1977 
192333f4e02aSdrh   vfsFlags |=  SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
192433f4e02aSdrh             SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE;
192517b90b53Sdanielk1977   rc = sqlite3OsOpen(pPager->pVfs, 0, pFile, vfsFlags, 0);
1926b4b47411Sdanielk1977   assert( rc!=SQLITE_OK || pFile->pMethods );
1927fa86c412Sdrh   return rc;
1928fa86c412Sdrh }
1929fa86c412Sdrh 
1930a858aa2eSdanielk1977 static int pagerStress(void *,PgHdr *);
19318c0a791aSdanielk1977 
1932fa86c412Sdrh /*
1933ed7c855cSdrh ** Create a new page cache and put a pointer to the page cache in *ppPager.
19345e00f6c7Sdrh ** The file to be cached need not exist.  The file is not locked until
19353b8a05f6Sdanielk1977 ** the first call to sqlite3PagerGet() and is only held open until the
19363b8a05f6Sdanielk1977 ** last page is released using sqlite3PagerUnref().
1937382c0247Sdrh **
19386446c4dcSdrh ** If zFilename is NULL then a randomly-named temporary file is created
19391cc8c448Sdrh ** and used as the file to be cached.  The file will be deleted
19406446c4dcSdrh ** automatically when it is closed.
194190f5ecb3Sdrh **
194290f5ecb3Sdrh ** If zFilename is ":memory:" then all information is held in cache.
194390f5ecb3Sdrh ** It is never written to disk.  This can be used to implement an
194490f5ecb3Sdrh ** in-memory database.
1945ed7c855cSdrh */
19463b8a05f6Sdanielk1977 int sqlite3PagerOpen(
194786f8c197Sdrh   sqlite3_vfs *pVfs,       /* The virtual file system to use */
19487e3b0a07Sdrh   Pager **ppPager,         /* Return the Pager structure here */
19497e3b0a07Sdrh   const char *zFilename,   /* Name of the database file to open */
1950da47d774Sdrh   int nExtra,              /* Extra bytes append to each in-memory page */
195133f4e02aSdrh   int flags,               /* flags controlling this file */
195233f4e02aSdrh   int vfsFlags             /* flags passed through to sqlite3_vfs.xOpen() */
19537e3b0a07Sdrh ){
1954b4b47411Sdanielk1977   u8 *pPtr;
1955aef0bf64Sdanielk1977   Pager *pPager = 0;
1956cfe9a69fSdanielk1977   int rc = SQLITE_OK;
1957cfe9a69fSdanielk1977   int i;
19588def5ea2Sdanielk1977   int tempFile = 0;
1959ac69b05eSdrh   int memDb = 0;
19605e00f6c7Sdrh   int readOnly = 0;
19617bec505eSdrh   int useJournal = (flags & PAGER_OMIT_JOURNAL)==0;
19627bec505eSdrh   int noReadlock = (flags & PAGER_NO_READLOCK)!=0;
1963b3175389Sdanielk1977   int journalFileSize;
19648c0a791aSdanielk1977   int pcacheSize = sqlite3PcacheSize();
1965facf0307Sdrh   int szPageDflt = SQLITE_DEFAULT_PAGE_SIZE;
196617b90b53Sdanielk1977   char *zPathname = 0;
196717b90b53Sdanielk1977   int nPathname = 0;
1968b4b47411Sdanielk1977 
1969b3175389Sdanielk1977   if( sqlite3JournalSize(pVfs)>sqlite3MemJournalSize() ){
1970b3175389Sdanielk1977     journalFileSize = sqlite3JournalSize(pVfs);
1971b3175389Sdanielk1977   }else{
1972b3175389Sdanielk1977     journalFileSize = sqlite3MemJournalSize();
1973b3175389Sdanielk1977   }
1974b3175389Sdanielk1977 
197586f8c197Sdrh   /* The default return is a NULL pointer */
1976d9b0257aSdrh   *ppPager = 0;
1977aef0bf64Sdanielk1977 
197817b90b53Sdanielk1977   /* Compute and store the full pathname in an allocated buffer pointed
197917b90b53Sdanielk1977   ** to by zPathname, length nPathname. Or, if this is a temporary file,
198017b90b53Sdanielk1977   ** leave both nPathname and zPathname set to 0.
198117b90b53Sdanielk1977   */
198217b90b53Sdanielk1977   if( zFilename && zFilename[0] ){
1983adfb9b05Sdanielk1977     nPathname = pVfs->mxPathname+1;
1984e5ae5735Sdrh     zPathname = sqlite3Malloc(nPathname*2);
19851cc8c448Sdrh     if( zPathname==0 ){
19861cc8c448Sdrh       return SQLITE_NOMEM;
19871cc8c448Sdrh     }
19881cc8c448Sdrh #ifndef SQLITE_OMIT_MEMORYDB
19891cc8c448Sdrh     if( strcmp(zFilename,":memory:")==0 ){
19901cc8c448Sdrh       memDb = 1;
19911cc8c448Sdrh       zPathname[0] = 0;
19921cc8c448Sdrh     }else
19931cc8c448Sdrh #endif
19941cc8c448Sdrh     {
1995adfb9b05Sdanielk1977       rc = sqlite3OsFullPathname(pVfs, zFilename, nPathname, zPathname);
1996ae28c01aSdrh     }
19971cc8c448Sdrh     if( rc!=SQLITE_OK ){
19981cc8c448Sdrh       sqlite3_free(zPathname);
19991cc8c448Sdrh       return rc;
20001cc8c448Sdrh     }
2001ea678832Sdrh     nPathname = sqlite3Strlen30(zPathname);
200299b90c3fSdrh   }
200399b90c3fSdrh 
2004b4b47411Sdanielk1977   /* Allocate memory for the pager structure */
2005b4b47411Sdanielk1977   pPager = sqlite3MallocZero(
2006b4b47411Sdanielk1977     sizeof(*pPager) +           /* Pager structure */
20078c0a791aSdanielk1977     pcacheSize      +           /* PCache object */
2008c7b6017cSdanielk1977     journalFileSize +           /* The journal file structure */
2009b3175389Sdanielk1977     pVfs->szOsFile  +           /* The main db file */
2010b3175389Sdanielk1977     journalFileSize * 2 +       /* The two journal files */
201117b90b53Sdanielk1977     3*nPathname + 40            /* zFilename, zDirectory, zJournal */
2012b4b47411Sdanielk1977   );
2013b4b47411Sdanielk1977   if( !pPager ){
20141cc8c448Sdrh     sqlite3_free(zPathname);
2015b4b47411Sdanielk1977     return SQLITE_NOMEM;
2016b4b47411Sdanielk1977   }
20178c0a791aSdanielk1977   pPager->pPCache = (PCache *)&pPager[1];
20188c0a791aSdanielk1977   pPtr = ((u8 *)&pPager[1]) + pcacheSize;
201933f4e02aSdrh   pPager->vfsFlags = vfsFlags;
2020b4b47411Sdanielk1977   pPager->fd = (sqlite3_file*)&pPtr[pVfs->szOsFile*0];
2021fd7f0452Sdanielk1977   pPager->sjfd = (sqlite3_file*)&pPtr[pVfs->szOsFile];
2022b3175389Sdanielk1977   pPager->jfd = (sqlite3_file*)&pPtr[pVfs->szOsFile+journalFileSize];
2023b3175389Sdanielk1977   pPager->zFilename = (char*)&pPtr[pVfs->szOsFile+2*journalFileSize];
20241cc8c448Sdrh   pPager->zDirectory = &pPager->zFilename[nPathname+1];
20251cc8c448Sdrh   pPager->zJournal = &pPager->zDirectory[nPathname+1];
2026b4b47411Sdanielk1977   pPager->pVfs = pVfs;
202717b90b53Sdanielk1977   if( zPathname ){
20281cc8c448Sdrh     memcpy(pPager->zFilename, zPathname, nPathname+1);
20291cc8c448Sdrh     sqlite3_free(zPathname);
203017b90b53Sdanielk1977   }
2031b4b47411Sdanielk1977 
2032153c62c4Sdrh   /* Open the pager file.
2033aef0bf64Sdanielk1977   */
2034ae28c01aSdrh   if( zFilename && zFilename[0] && !memDb ){
2035d92db531Sdanielk1977     if( nPathname>(pVfs->mxPathname - (int)sizeof("-journal")) ){
2036b4b47411Sdanielk1977       rc = SQLITE_CANTOPEN;
2037b4b47411Sdanielk1977     }else{
2038b4b47411Sdanielk1977       int fout = 0;
203933f4e02aSdrh       rc = sqlite3OsOpen(pVfs, pPager->zFilename, pPager->fd,
204033f4e02aSdrh                          pPager->vfsFlags, &fout);
2041b4b47411Sdanielk1977       readOnly = (fout&SQLITE_OPEN_READONLY);
20429663b8f9Sdanielk1977 
20439663b8f9Sdanielk1977       /* If the file was successfully opened for read/write access,
20449663b8f9Sdanielk1977       ** choose a default page size in case we have to create the
20459663b8f9Sdanielk1977       ** database file. The default page size is the maximum of:
20469663b8f9Sdanielk1977       **
20479663b8f9Sdanielk1977       **    + SQLITE_DEFAULT_PAGE_SIZE,
20489663b8f9Sdanielk1977       **    + The value returned by sqlite3OsSectorSize()
20499663b8f9Sdanielk1977       **    + The largest page size that can be written atomically.
20509663b8f9Sdanielk1977       */
20519663b8f9Sdanielk1977       if( rc==SQLITE_OK && !readOnly ){
20527cbd589dSdanielk1977         setSectorSize(pPager);
20537cbd589dSdanielk1977         if( szPageDflt<pPager->sectorSize ){
20547cbd589dSdanielk1977           szPageDflt = pPager->sectorSize;
20559663b8f9Sdanielk1977         }
20569663b8f9Sdanielk1977 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
20579663b8f9Sdanielk1977         {
20589663b8f9Sdanielk1977           int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
20599663b8f9Sdanielk1977           int ii;
20609663b8f9Sdanielk1977           assert(SQLITE_IOCAP_ATOMIC512==(512>>8));
20619663b8f9Sdanielk1977           assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8));
20629663b8f9Sdanielk1977           assert(SQLITE_MAX_DEFAULT_PAGE_SIZE<=65536);
2063facf0307Sdrh           for(ii=szPageDflt; ii<=SQLITE_MAX_DEFAULT_PAGE_SIZE; ii=ii*2){
2064facf0307Sdrh             if( iDc&(SQLITE_IOCAP_ATOMIC|(ii>>8)) ) szPageDflt = ii;
20659663b8f9Sdanielk1977           }
20669663b8f9Sdanielk1977         }
20679663b8f9Sdanielk1977 #endif
2068facf0307Sdrh         if( szPageDflt>SQLITE_MAX_DEFAULT_PAGE_SIZE ){
2069facf0307Sdrh           szPageDflt = SQLITE_MAX_DEFAULT_PAGE_SIZE;
20709663b8f9Sdanielk1977         }
20719663b8f9Sdanielk1977       }
2072b4b47411Sdanielk1977     }
2073b3175389Sdanielk1977   }else{
20747a2b1eebSdanielk1977     /* If a temporary file is requested, it is not opened immediately.
20757a2b1eebSdanielk1977     ** In this case we accept the default page size and delay actually
20767a2b1eebSdanielk1977     ** opening the file until the first call to OsWrite().
2077b3175389Sdanielk1977     **
2078b3175389Sdanielk1977     ** This branch is also run for an in-memory database. An in-memory
2079b3175389Sdanielk1977     ** database is the same as a temp-file that is never written out to
2080b3175389Sdanielk1977     ** disk and uses an in-memory rollback journal.
20817a2b1eebSdanielk1977     */
20825e00f6c7Sdrh     tempFile = 1;
20837a2b1eebSdanielk1977     pPager->state = PAGER_EXCLUSIVE;
20848def5ea2Sdanielk1977   }
2085aef0bf64Sdanielk1977 
20868186df86Sdanielk1977   if( pPager && rc==SQLITE_OK ){
2087facf0307Sdrh     pPager->pTmpSpace = sqlite3PageMalloc(szPageDflt);
2088aef0bf64Sdanielk1977   }
2089aef0bf64Sdanielk1977 
2090153c62c4Sdrh   /* If an error occured in either of the blocks above.
2091153c62c4Sdrh   ** Free the Pager structure and close the file.
2092153c62c4Sdrh   ** Since the pager is not allocated there is no need to set
2093aef0bf64Sdanielk1977   ** any Pager.errMask variables.
2094aef0bf64Sdanielk1977   */
2095b4b47411Sdanielk1977   if( !pPager || !pPager->pTmpSpace ){
2096b4b47411Sdanielk1977     sqlite3OsClose(pPager->fd);
209717435752Sdrh     sqlite3_free(pPager);
2098aef0bf64Sdanielk1977     return ((rc==SQLITE_OK)?SQLITE_NOMEM:rc);
2099d9b0257aSdrh   }
21008c0a791aSdanielk1977   nExtra = FORCE_ALIGNMENT(nExtra);
210171d5d2cdSdanielk1977   sqlite3PcacheOpen(szPageDflt, nExtra, !memDb,
210241d3027cSdrh                     !memDb?pagerStress:0, (void *)pPager, pPager->pPCache);
2103aef0bf64Sdanielk1977 
210430d53701Sdrh   PAGERTRACE(("OPEN %d %s\n", FILEHANDLEID(pPager->fd), pPager->zFilename));
2105153c62c4Sdrh   IOTRACE(("OPEN %p %s\n", pPager, pPager->zFilename))
2106aef0bf64Sdanielk1977 
2107b4b47411Sdanielk1977   /* Fill in Pager.zDirectory[] */
21081cc8c448Sdrh   memcpy(pPager->zDirectory, pPager->zFilename, nPathname+1);
2109ea678832Sdrh   for(i=sqlite3Strlen30(pPager->zDirectory);
2110ea678832Sdrh       i>0 && pPager->zDirectory[i-1]!='/'; i--){}
2111a76c82ebSdrh   if( i>0 ) pPager->zDirectory[i-1] = 0;
2112b4b47411Sdanielk1977 
211399b90c3fSdrh   /* Fill in Pager.zJournal[] */
211417b90b53Sdanielk1977   if( zPathname ){
21151cc8c448Sdrh     memcpy(pPager->zJournal, pPager->zFilename, nPathname);
21161cc8c448Sdrh     memcpy(&pPager->zJournal[nPathname], "-journal", 9);
211717b90b53Sdanielk1977   }else{
211817b90b53Sdanielk1977     pPager->zJournal = 0;
211917b90b53Sdanielk1977   }
2120b4b47411Sdanielk1977 
21213b59a5ccSdrh   /* pPager->journalOpen = 0; */
21224f21c4afSdrh   pPager->useJournal = (u8)useJournal;
21234f21c4afSdrh   pPager->noReadlock = (noReadlock && readOnly) ?1:0;
21243b59a5ccSdrh   /* pPager->stmtOpen = 0; */
21253b59a5ccSdrh   /* pPager->stmtInUse = 0; */
21263b59a5ccSdrh   /* pPager->nRef = 0; */
21274f21c4afSdrh   pPager->dbSizeValid = (u8)memDb;
2128facf0307Sdrh   pPager->pageSize = szPageDflt;
21293b59a5ccSdrh   /* pPager->stmtSize = 0; */
21303b59a5ccSdrh   /* pPager->stmtJSize = 0; */
21313b59a5ccSdrh   /* pPager->nPage = 0; */
213290f5ecb3Sdrh   pPager->mxPage = 100;
2133f8e632b6Sdrh   pPager->mxPgno = SQLITE_MAX_PAGE_COUNT;
21343b59a5ccSdrh   /* pPager->state = PAGER_UNLOCK; */
21351cc8c448Sdrh   assert( pPager->state == (tempFile ? PAGER_EXCLUSIVE : PAGER_UNLOCK) );
21363b59a5ccSdrh   /* pPager->errMask = 0; */
21374f21c4afSdrh   pPager->tempFile = (u8)tempFile;
2138369339dbSdrh   assert( tempFile==PAGER_LOCKINGMODE_NORMAL
2139369339dbSdrh           || tempFile==PAGER_LOCKINGMODE_EXCLUSIVE );
2140369339dbSdrh   assert( PAGER_LOCKINGMODE_EXCLUSIVE==1 );
21414f21c4afSdrh   pPager->exclusiveMode = (u8)tempFile;
21424f21c4afSdrh   pPager->memDb = (u8)memDb;
21434f21c4afSdrh   pPager->readOnly = (u8)readOnly;
21443b59a5ccSdrh   /* pPager->needSync = 0; */
21454f21c4afSdrh   pPager->noSync = (pPager->tempFile || !useJournal) ?1:0;
21464f21c4afSdrh   pPager->fullSync = pPager->noSync ?0:1;
2147f036aef0Sdanielk1977   pPager->sync_flags = SQLITE_SYNC_NORMAL;
21483b59a5ccSdrh   /* pPager->pFirst = 0; */
21493b59a5ccSdrh   /* pPager->pFirstSynced = 0; */
21503b59a5ccSdrh   /* pPager->pLast = 0; */
21518c0a791aSdanielk1977   pPager->nExtra = nExtra;
2152b53e4960Sdanielk1977   pPager->journalSizeLimit = SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT;
2153b3175389Sdanielk1977   assert(pPager->fd->pMethods||tempFile);
2154c80f058dSdrh   setSectorSize(pPager);
2155b3175389Sdanielk1977   if( memDb ){
2156b3175389Sdanielk1977     pPager->journalMode = PAGER_JOURNALMODE_MEMORY;
2157b472117cSdanielk1977   }
21581ceedd37Sdanielk1977   /* pPager->xBusyHandler = 0; */
21591ceedd37Sdanielk1977   /* pPager->pBusyHandlerArg = 0; */
21603b59a5ccSdrh   /* memset(pPager->aHash, 0, sizeof(pPager->aHash)); */
2161ed7c855cSdrh   *ppPager = pPager;
2162ed7c855cSdrh   return SQLITE_OK;
2163ed7c855cSdrh }
2164ed7c855cSdrh 
2165ed7c855cSdrh /*
216690f5ecb3Sdrh ** Set the busy handler function.
216790f5ecb3Sdrh */
21681ceedd37Sdanielk1977 void sqlite3PagerSetBusyhandler(
21691ceedd37Sdanielk1977   Pager *pPager,
21701ceedd37Sdanielk1977   int (*xBusyHandler)(void *),
21711ceedd37Sdanielk1977   void *pBusyHandlerArg
21721ceedd37Sdanielk1977 ){
21731ceedd37Sdanielk1977   pPager->xBusyHandler = xBusyHandler;
21741ceedd37Sdanielk1977   pPager->pBusyHandlerArg = pBusyHandlerArg;
217590f5ecb3Sdrh }
217690f5ecb3Sdrh 
217790f5ecb3Sdrh /*
2178a6abd041Sdrh ** Set the reinitializer for this pager.  If not NULL, the reinitializer
2179a6abd041Sdrh ** is called when the content of a page in cache is restored to its original
2180a6abd041Sdrh ** value as a result of a rollback.  The callback gives higher-level code
2181a6abd041Sdrh ** an opportunity to restore the EXTRA section to agree with the restored
2182a6abd041Sdrh ** page data.
2183a6abd041Sdrh */
2184eaa06f69Sdanielk1977 void sqlite3PagerSetReiniter(Pager *pPager, void (*xReinit)(DbPage*)){
2185a6abd041Sdrh   pPager->xReiniter = xReinit;
2186a6abd041Sdrh }
2187a6abd041Sdrh 
2188a6abd041Sdrh /*
2189a1644fd8Sdanielk1977 ** Set the page size to *pPageSize. If the suggest new page size is
2190a1644fd8Sdanielk1977 ** inappropriate, then an alternative page size is set to that
2191a1644fd8Sdanielk1977 ** value before returning.
219290f5ecb3Sdrh */
2193a1644fd8Sdanielk1977 int sqlite3PagerSetPagesize(Pager *pPager, u16 *pPageSize){
21941357d9f5Sdanielk1977   int rc = pPager->errCode;
21951357d9f5Sdanielk1977   if( rc==SQLITE_OK ){
2196a1644fd8Sdanielk1977     u16 pageSize = *pPageSize;
21979663b8f9Sdanielk1977     assert( pageSize==0 || (pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE) );
2198a1644fd8Sdanielk1977     if( pageSize && pageSize!=pPager->pageSize
21997426f864Sdrh      && (pPager->memDb==0 || pPager->dbSize==0)
22007426f864Sdrh      && sqlite3PcacheRefCount(pPager->pPCache)==0
2201a1644fd8Sdanielk1977     ){
2202facf0307Sdrh       char *pNew = (char *)sqlite3PageMalloc(pageSize);
2203a1644fd8Sdanielk1977       if( !pNew ){
2204a1644fd8Sdanielk1977         rc = SQLITE_NOMEM;
2205a1644fd8Sdanielk1977       }else{
2206c7c7e623Sdanielk1977         pager_reset(pPager);
220790f5ecb3Sdrh         pPager->pageSize = pageSize;
22087426f864Sdrh         if( !pPager->memDb ) setSectorSize(pPager);
2209facf0307Sdrh         sqlite3PageFree(pPager->pTmpSpace);
2210a1644fd8Sdanielk1977         pPager->pTmpSpace = pNew;
22118c0a791aSdanielk1977         sqlite3PcacheSetPageSize(pPager->pPCache, pageSize);
22121c7880e5Sdrh       }
2213a1644fd8Sdanielk1977     }
22144f21c4afSdrh     *pPageSize = (u16)pPager->pageSize;
22151357d9f5Sdanielk1977   }
2216a1644fd8Sdanielk1977   return rc;
221790f5ecb3Sdrh }
221890f5ecb3Sdrh 
221990f5ecb3Sdrh /*
222026b7994aSdrh ** Return a pointer to the "temporary page" buffer held internally
222126b7994aSdrh ** by the pager.  This is a buffer that is big enough to hold the
222226b7994aSdrh ** entire content of a database page.  This buffer is used internally
222326b7994aSdrh ** during rollback and will be overwritten whenever a rollback
222426b7994aSdrh ** occurs.  But other modules are free to use it too, as long as
222526b7994aSdrh ** no rollbacks are happening.
222626b7994aSdrh */
222726b7994aSdrh void *sqlite3PagerTempSpace(Pager *pPager){
222826b7994aSdrh   return pPager->pTmpSpace;
222926b7994aSdrh }
223026b7994aSdrh 
223126b7994aSdrh /*
2232f8e632b6Sdrh ** Attempt to set the maximum database page count if mxPage is positive.
2233f8e632b6Sdrh ** Make no changes if mxPage is zero or negative.  And never reduce the
2234f8e632b6Sdrh ** maximum page count below the current size of the database.
2235f8e632b6Sdrh **
2236f8e632b6Sdrh ** Regardless of mxPage, return the current maximum page count.
2237f8e632b6Sdrh */
2238f8e632b6Sdrh int sqlite3PagerMaxPageCount(Pager *pPager, int mxPage){
2239f8e632b6Sdrh   if( mxPage>0 ){
2240f8e632b6Sdrh     pPager->mxPgno = mxPage;
2241f8e632b6Sdrh   }
2242ad0132dfSdanielk1977   sqlite3PagerPagecount(pPager, 0);
2243f8e632b6Sdrh   return pPager->mxPgno;
2244f8e632b6Sdrh }
2245f8e632b6Sdrh 
/*
** Helpers that temporarily switch the simulated I/O error mechanism off
** and back on again.  They bracket code in which injected errors are of
** no interest.
**
** The helpers only exist as real functions when SQLITE_TEST is defined.
** In all other builds they are empty macros and generate no code.
*/
#ifdef SQLITE_TEST
extern int sqlite3_io_error_pending;
extern int sqlite3_io_error_hit;
static int saved_cnt;   /* sqlite3_io_error_pending as of the last disable */

/* Remember the pending-error counter, then set it to -1. */
void disable_simulated_io_errors(void){
  saved_cnt = sqlite3_io_error_pending;
  sqlite3_io_error_pending = -1;
}

/* Put back the counter saved by disable_simulated_io_errors(). */
void enable_simulated_io_errors(void){
  sqlite3_io_error_pending = saved_cnt;
}
#else
# define disable_simulated_io_errors()
# define enable_simulated_io_errors()
#endif
2269c9ac5caaSdrh 
2270c9ac5caaSdrh /*
227190f5ecb3Sdrh ** Read the first N bytes from the beginning of the file into memory
2272aef0bf64Sdanielk1977 ** that pDest points to.
2273aef0bf64Sdanielk1977 **
2274aef0bf64Sdanielk1977 ** No error checking is done. The rational for this is that this function
2275aef0bf64Sdanielk1977 ** may be called even if the file does not exist or contain a header. In
2276aef0bf64Sdanielk1977 ** these cases sqlite3OsRead() will return an error, to which the correct
2277aef0bf64Sdanielk1977 ** response is to zero the memory at pDest and continue.  A real IO error
2278aef0bf64Sdanielk1977 ** will presumably recur and be picked up later (Todo: Think about this).
227990f5ecb3Sdrh */
22803b8a05f6Sdanielk1977 int sqlite3PagerReadFileheader(Pager *pPager, int N, unsigned char *pDest){
2281551b7736Sdrh   int rc = SQLITE_OK;
228290f5ecb3Sdrh   memset(pDest, 0, N);
2283b3175389Sdanielk1977   assert(pPager->fd->pMethods||pPager->tempFile);
22847a2b1eebSdanielk1977   if( pPager->fd->pMethods ){
2285b0603416Sdrh     IOTRACE(("DBHDR %p 0 %d\n", pPager, N))
228662079060Sdanielk1977     rc = sqlite3OsRead(pPager->fd, pDest, N, 0);
2287551b7736Sdrh     if( rc==SQLITE_IOERR_SHORT_READ ){
2288551b7736Sdrh       rc = SQLITE_OK;
228990f5ecb3Sdrh     }
229090f5ecb3Sdrh   }
2291551b7736Sdrh   return rc;
2292551b7736Sdrh }
229390f5ecb3Sdrh 
229490f5ecb3Sdrh /*
22955e00f6c7Sdrh ** Return the total number of pages in the disk file associated with
22965e00f6c7Sdrh ** pPager.
229715f411dbSdanielk1977 **
229815f411dbSdanielk1977 ** If the PENDING_BYTE lies on the page directly after the end of the
229915f411dbSdanielk1977 ** file, then consider this page part of the file too. For example, if
230015f411dbSdanielk1977 ** PENDING_BYTE is byte 4096 (the first byte of page 5) and the size of the
230115f411dbSdanielk1977 ** file is 4096 bytes, 5 is returned instead of 4.
2302ed7c855cSdrh */
2303ad0132dfSdanielk1977 int sqlite3PagerPagecount(Pager *pPager, int *pnPage){
23047a2b1eebSdanielk1977   i64 n = 0;
2305e49f9827Sdrh   int rc;
2306d9b0257aSdrh   assert( pPager!=0 );
2307a7aea3ddSdrh   if( pPager->errCode ){
23088c0a791aSdanielk1977     rc = pPager->errCode;
23098c0a791aSdanielk1977     return rc;
2310a7aea3ddSdrh   }
2311d92db531Sdanielk1977   if( pPager->dbSizeValid ){
231215f411dbSdanielk1977     n = pPager->dbSize;
231315f411dbSdanielk1977   } else {
23147a2b1eebSdanielk1977     assert(pPager->fd->pMethods||pPager->tempFile);
23157a2b1eebSdanielk1977     if( (pPager->fd->pMethods)
23167a2b1eebSdanielk1977      && (rc = sqlite3OsFileSize(pPager->fd, &n))!=SQLITE_OK ){
2317e49f9827Sdrh       pager_error(pPager, rc);
2318ad0132dfSdanielk1977       return rc;
2319ed7c855cSdrh     }
2320992f2d78Sdrh     if( n>0 && n<pPager->pageSize ){
2321992f2d78Sdrh       n = 1;
2322992f2d78Sdrh     }else{
232390f5ecb3Sdrh       n /= pPager->pageSize;
2324992f2d78Sdrh     }
2325a6abd041Sdrh     if( pPager->state!=PAGER_UNLOCK ){
23263460d19cSdanielk1977       pPager->dbSize = (Pgno)n;
23273460d19cSdanielk1977       pPager->dbFileSize = (Pgno)n;
2328d92db531Sdanielk1977       pPager->dbSizeValid = 1;
2329ed7c855cSdrh     }
233015f411dbSdanielk1977   }
233115f411dbSdanielk1977   if( n==(PENDING_BYTE/pPager->pageSize) ){
233215f411dbSdanielk1977     n++;
233315f411dbSdanielk1977   }
2334f8e632b6Sdrh   if( n>pPager->mxPgno ){
23354f21c4afSdrh     pPager->mxPgno = (Pgno)n;
2336f8e632b6Sdrh   }
2337ad0132dfSdanielk1977   if( pnPage ){
23384f21c4afSdrh     *pnPage = (int)n;
2339ad0132dfSdanielk1977   }
2340ad0132dfSdanielk1977   return SQLITE_OK;
2341ed7c855cSdrh }
2342ed7c855cSdrh 
2343ed7c855cSdrh /*
2344f7c57531Sdrh ** Forward declaration
2345f7c57531Sdrh */
23467657240aSdanielk1977 static int syncJournal(Pager*);
2347ac69b05eSdrh 
2348ac69b05eSdrh /*
234917221813Sdanielk1977 ** Try to obtain a lock on a file.  Invoke the busy callback if the lock
2350a4afb65cSdrh ** is currently not available.  Repeat until the busy callback returns
235117221813Sdanielk1977 ** false or until the lock succeeds.
235217221813Sdanielk1977 **
235317221813Sdanielk1977 ** Return SQLITE_OK on success and an error code if we cannot obtain
235417221813Sdanielk1977 ** the lock.
235517221813Sdanielk1977 */
235617221813Sdanielk1977 static int pager_wait_on_lock(Pager *pPager, int locktype){
235717221813Sdanielk1977   int rc;
23581aa2d8b5Sdrh 
23591aa2d8b5Sdrh   /* The OS lock values must be the same as the Pager lock values */
236017221813Sdanielk1977   assert( PAGER_SHARED==SHARED_LOCK );
236117221813Sdanielk1977   assert( PAGER_RESERVED==RESERVED_LOCK );
236217221813Sdanielk1977   assert( PAGER_EXCLUSIVE==EXCLUSIVE_LOCK );
23631aa2d8b5Sdrh 
23641aa2d8b5Sdrh   /* If the file is currently unlocked then the size must be unknown */
2365d92db531Sdanielk1977   assert( pPager->state>=PAGER_SHARED || pPager->dbSizeValid==0 );
23661aa2d8b5Sdrh 
236717221813Sdanielk1977   if( pPager->state>=locktype ){
236817221813Sdanielk1977     rc = SQLITE_OK;
236917221813Sdanielk1977   }else{
237017221813Sdanielk1977     do {
2371054889ecSdrh       rc = sqlite3OsLock(pPager->fd, locktype);
23721ceedd37Sdanielk1977     }while( rc==SQLITE_BUSY && pPager->xBusyHandler(pPager->pBusyHandlerArg) );
237317221813Sdanielk1977     if( rc==SQLITE_OK ){
23744f21c4afSdrh       pPager->state = (u8)locktype;
2375b0603416Sdrh       IOTRACE(("LOCK %p %d\n", pPager, locktype))
237617221813Sdanielk1977     }
237717221813Sdanielk1977   }
237817221813Sdanielk1977   return rc;
237917221813Sdanielk1977 }
238017221813Sdanielk1977 
23813460d19cSdanielk1977 #ifndef SQLITE_OMIT_AUTOVACUUM
23823460d19cSdanielk1977 /*
2383f90b7260Sdanielk1977 ** Truncate the in-memory database file image to nPage pages. This
2384f90b7260Sdanielk1977 ** function does not actually modify the database file on disk. It
2385f90b7260Sdanielk1977 ** just sets the internal state of the pager object so that the
2386f90b7260Sdanielk1977 ** truncation will be done when the current transaction is committed.
23873460d19cSdanielk1977 */
23883460d19cSdanielk1977 void sqlite3PagerTruncateImage(Pager *pPager, Pgno nPage){
23893460d19cSdanielk1977   assert( pPager->dbSizeValid );
23903460d19cSdanielk1977   assert( pPager->dbSize>=nPage );
23913460d19cSdanielk1977   pPager->dbSize = nPage;
23923460d19cSdanielk1977 }
23933460d19cSdanielk1977 
23943460d19cSdanielk1977 /*
23953460d19cSdanielk1977 ** Return the current size of the database file image in pages. This
23963460d19cSdanielk1977 ** function differs from sqlite3PagerPagecount() in two ways:
23973460d19cSdanielk1977 **
23983460d19cSdanielk1977 **  a) It may only be called when at least one reference to a database
23993460d19cSdanielk1977 **     page is held. This guarantees that the database size is already
24003460d19cSdanielk1977 **     known and a call to sqlite3OsFileSize() is not required.
24013460d19cSdanielk1977 **
24023460d19cSdanielk1977 **  b) The return value is not adjusted for the locking page.
24033460d19cSdanielk1977 */
24043460d19cSdanielk1977 Pgno sqlite3PagerImageSize(Pager *pPager){
24053460d19cSdanielk1977   assert( pPager->dbSizeValid );
24063460d19cSdanielk1977   return pPager->dbSize;
24073460d19cSdanielk1977 }
24083460d19cSdanielk1977 #endif  /* ifndef SQLITE_OMIT_AUTOVACUUM */
24093460d19cSdanielk1977 
2410f7c57531Sdrh /*
2411ed7c855cSdrh ** Shutdown the page cache.  Free all memory and close all files.
2412ed7c855cSdrh **
2413ed7c855cSdrh ** If a transaction was in progress when this routine is called, that
2414ed7c855cSdrh ** transaction is rolled back.  All outstanding pages are invalidated
2415ed7c855cSdrh ** and their memory is freed.  Any attempt to use a page associated
2416ed7c855cSdrh ** with this page cache after this function returns will likely
2417ed7c855cSdrh ** result in a coredump.
2418aef0bf64Sdanielk1977 **
2419aef0bf64Sdanielk1977 ** This function always succeeds. If a transaction is active an attempt
2420aef0bf64Sdanielk1977 ** is made to roll it back. If an error occurs during the rollback
2421aef0bf64Sdanielk1977 ** a hot journal may be left in the filesystem but no error is returned
2422aef0bf64Sdanielk1977 ** to the caller.
2423ed7c855cSdrh */
24243b8a05f6Sdanielk1977 int sqlite3PagerClose(Pager *pPager){
242513f7299bSdanielk1977 
2426c9ac5caaSdrh   disable_simulated_io_errors();
24272d1d86fbSdanielk1977   sqlite3BeginBenignMalloc();
2428c2ee76cbSdrh   pPager->errCode = 0;
242941483468Sdanielk1977   pPager->exclusiveMode = 0;
2430bafda096Sdrh   pager_reset(pPager);
2431b3175389Sdanielk1977   if( !MEMDB ){
2432f2c31ad8Sdanielk1977     /* Set Pager.journalHdr to -1 for the benefit of the pager_playback()
2433f2c31ad8Sdanielk1977     ** call which may be made from within pagerUnlockAndRollback(). If it
2434f2c31ad8Sdanielk1977     ** is not -1, then the unsynced portion of an open journal file may
2435f2c31ad8Sdanielk1977     ** be played back into the database. If a power failure occurs while
2436f2c31ad8Sdanielk1977     ** this is happening, the database may become corrupt.
2437f2c31ad8Sdanielk1977     */
2438f2c31ad8Sdanielk1977     pPager->journalHdr = -1;
2439e277be05Sdanielk1977     pagerUnlockAndRollback(pPager);
2440b3175389Sdanielk1977   }
2441c9ac5caaSdrh   enable_simulated_io_errors();
24422d1d86fbSdanielk1977   sqlite3EndBenignMalloc();
244330d53701Sdrh   PAGERTRACE(("CLOSE %d\n", PAGERID(pPager)));
2444b0603416Sdrh   IOTRACE(("CLOSE %p\n", pPager))
2445e94ddc9eSdanielk1977   if( pPager->journalOpen ){
2446b4b47411Sdanielk1977     sqlite3OsClose(pPager->jfd);
2447e94ddc9eSdanielk1977   }
2448f5e7bb51Sdrh   sqlite3BitvecDestroy(pPager->pInJournal);
2449a1fa00d9Sdanielk1977   sqlite3BitvecDestroy(pPager->pAlwaysRollback);
2450fd7f0452Sdanielk1977   releaseAllSavepoint(pPager);
2451b4b47411Sdanielk1977   sqlite3OsClose(pPager->fd);
24520f89253eSdrh   /* Temp files are automatically deleted by the OS
24530f89253eSdrh   ** if( pPager->tempFile ){
245466560adaSdrh   **   sqlite3OsDelete(pPager->zFilename);
24550f89253eSdrh   ** }
24560f89253eSdrh   */
2457aca790acSdanielk1977 
2458facf0307Sdrh   sqlite3PageFree(pPager->pTmpSpace);
24598c0a791aSdanielk1977   sqlite3PcacheClose(pPager->pPCache);
246017435752Sdrh   sqlite3_free(pPager);
2461ed7c855cSdrh   return SQLITE_OK;
2462ed7c855cSdrh }
2463ed7c855cSdrh 
246487cc3b31Sdrh #if !defined(NDEBUG) || defined(SQLITE_TEST)
2465ed7c855cSdrh /*
24665e00f6c7Sdrh ** Return the page number for the given page data.
2467ed7c855cSdrh */
24683b8a05f6Sdanielk1977 Pgno sqlite3PagerPagenumber(DbPage *p){
2469ed7c855cSdrh   return p->pgno;
2470ed7c855cSdrh }
247187cc3b31Sdrh #endif
2472ed7c855cSdrh 
2473ed7c855cSdrh /*
2474df0b3b09Sdrh ** Increment the reference count for a page.  The input pointer is
2475df0b3b09Sdrh ** a reference to the page data.
2476df0b3b09Sdrh */
24773b8a05f6Sdanielk1977 int sqlite3PagerRef(DbPage *pPg){
24788c0a791aSdanielk1977   sqlite3PcacheRef(pPg);
24798c42ca93Sdrh   return SQLITE_OK;
24807e3b0a07Sdrh }
24817e3b0a07Sdrh 
24827e3b0a07Sdrh /*
248334e79ceeSdrh ** Sync the journal.  In other words, make sure all the pages that have
248434e79ceeSdrh ** been written to the journal have actually reached the surface of the
248534e79ceeSdrh ** disk.  It is not safe to modify the original database file until after
248634e79ceeSdrh ** the journal has been synced.  If the original database is modified before
248734e79ceeSdrh ** the journal is synced and a power failure occurs, the unsynced journal
248834e79ceeSdrh ** data would be lost and we would be unable to completely rollback the
248934e79ceeSdrh ** database changes.  Database corruption would occur.
2490b19a2bc6Sdrh **
249134e79ceeSdrh ** This routine also updates the nRec field in the header of the journal.
249234e79ceeSdrh ** (See comments on the pager_playback() routine for additional information.)
249334e79ceeSdrh ** If the sync mode is FULL, two syncs will occur.  First the whole journal
249434e79ceeSdrh ** is synced, then the nRec field is updated, then a second sync occurs.
2495fa86c412Sdrh **
249634e79ceeSdrh ** For temporary databases, we do not care if we are able to rollback
24974cd2cd5cSdanielk1977 ** after a power failure, so no sync occurs.
24984cd2cd5cSdanielk1977 **
24994cd2cd5cSdanielk1977 ** If the IOCAP_SEQUENTIAL flag is set for the persistent media on which
25004cd2cd5cSdanielk1977 ** the database is stored, then OsSync() is never called on the journal
25014cd2cd5cSdanielk1977 ** file. In this case all that is required is to update the nRec field in
25024cd2cd5cSdanielk1977 ** the journal header.
250334e79ceeSdrh **
250434e79ceeSdrh ** This routine clears the needSync field of every page current held in
250534e79ceeSdrh ** memory.
250650e5dadfSdrh */
25077657240aSdanielk1977 static int syncJournal(Pager *pPager){
250850e5dadfSdrh   int rc = SQLITE_OK;
250903eb96a7Sdrh 
251003eb96a7Sdrh   /* Sync the journal before modifying the main database
251103eb96a7Sdrh   ** (assuming there is a journal and it needs to be synced.)
251203eb96a7Sdrh   */
25137657240aSdanielk1977   if( pPager->needSync ){
2514b3175389Sdanielk1977     assert( !pPager->tempFile );
2515b3175389Sdanielk1977     if( pPager->journalMode!=PAGER_JOURNALMODE_MEMORY ){
25164cd2cd5cSdanielk1977       int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
2517db48ee02Sdrh       assert( pPager->journalOpen );
25184cd2cd5cSdanielk1977 
25194cd2cd5cSdanielk1977       if( 0==(iDc&SQLITE_IOCAP_SAFE_APPEND) ){
2520112f752bSdanielk1977         i64 jrnlOff = journalHdrOffset(pPager);
2521112f752bSdanielk1977         u8 zMagic[8];
2522112f752bSdanielk1977 
2523112f752bSdanielk1977         /* This block deals with an obscure problem. If the last connection
2524112f752bSdanielk1977         ** that wrote to this database was operating in persistent-journal
2525112f752bSdanielk1977         ** mode, then the journal file may at this point actually be larger
2526112f752bSdanielk1977         ** than Pager.journalOff bytes. If the next thing in the journal
2527112f752bSdanielk1977         ** file happens to be a journal-header (written as part of the
2528112f752bSdanielk1977         ** previous connections transaction), and a crash or power-failure
2529112f752bSdanielk1977         ** occurs after nRec is updated but before this connection writes
2530112f752bSdanielk1977         ** anything else to the journal file (or commits/rolls back its
2531112f752bSdanielk1977         ** transaction), then SQLite may become confused when doing the
2532112f752bSdanielk1977         ** hot-journal rollback following recovery. It may roll back all
2533112f752bSdanielk1977         ** of this connections data, then proceed to rolling back the old,
2534112f752bSdanielk1977         ** out-of-date data that follows it. Database corruption.
2535112f752bSdanielk1977         **
2536112f752bSdanielk1977         ** To work around this, if the journal file does appear to contain
2537112f752bSdanielk1977         ** a valid header following Pager.journalOff, then write a 0x00
2538112f752bSdanielk1977         ** byte to the start of it to prevent it from being recognized.
2539112f752bSdanielk1977         */
2540112f752bSdanielk1977         rc = sqlite3OsRead(pPager->jfd, zMagic, 8, jrnlOff);
2541112f752bSdanielk1977         if( rc==SQLITE_OK && 0==memcmp(zMagic, aJournalMagic, 8) ){
2542112f752bSdanielk1977           static const u8 zerobyte = 0;
2543112f752bSdanielk1977           rc = sqlite3OsWrite(pPager->jfd, &zerobyte, 1, jrnlOff);
2544112f752bSdanielk1977         }
2545112f752bSdanielk1977         if( rc!=SQLITE_OK && rc!=SQLITE_IOERR_SHORT_READ ){
2546112f752bSdanielk1977           return rc;
2547112f752bSdanielk1977         }
2548112f752bSdanielk1977 
25497657240aSdanielk1977         /* Write the nRec value into the journal file header. If in
25507657240aSdanielk1977         ** full-synchronous mode, sync the journal first. This ensures that
25517657240aSdanielk1977         ** all data has really hit the disk before nRec is updated to mark
25527657240aSdanielk1977         ** it as a candidate for rollback.
25534cd2cd5cSdanielk1977         **
25544cd2cd5cSdanielk1977         ** This is not required if the persistent media supports the
25554cd2cd5cSdanielk1977         ** SAFE_APPEND property. Because in this case it is not possible
25564cd2cd5cSdanielk1977         ** for garbage data to be appended to the file, the nRec field
25574cd2cd5cSdanielk1977         ** is populated with 0xFFFFFFFF when the journal header is written
25584cd2cd5cSdanielk1977         ** and never needs to be updated.
25597657240aSdanielk1977         */
25604cd2cd5cSdanielk1977         if( pPager->fullSync && 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){
256130d53701Sdrh           PAGERTRACE(("SYNC journal of %d\n", PAGERID(pPager)));
2562b0603416Sdrh           IOTRACE(("JSYNC %p\n", pPager))
2563f036aef0Sdanielk1977           rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags);
256450e5dadfSdrh           if( rc!=0 ) return rc;
2565968af52aSdrh         }
256613adf8a0Sdanielk1977 
256762079060Sdanielk1977         jrnlOff = pPager->journalHdr + sizeof(aJournalMagic);
256862079060Sdanielk1977         IOTRACE(("JHDR %p %lld %d\n", pPager, jrnlOff, 4));
256962079060Sdanielk1977         rc = write32bits(pPager->jfd, jrnlOff, pPager->nRec);
2570b4746b9eSdrh         if( rc ) return rc;
2571d8d66e8cSdrh       }
25724cd2cd5cSdanielk1977       if( 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){
257330d53701Sdrh         PAGERTRACE(("SYNC journal of %d\n", PAGERID(pPager)));
2574126afe6bSdrh         IOTRACE(("JSYNC %p\n", pPager))
2575f036aef0Sdanielk1977         rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags|
2576f036aef0Sdanielk1977           (pPager->sync_flags==SQLITE_SYNC_FULL?SQLITE_SYNC_DATAONLY:0)
2577f036aef0Sdanielk1977         );
2578db48ee02Sdrh         if( rc!=0 ) return rc;
25794cd2cd5cSdanielk1977       }
2580db48ee02Sdrh       pPager->journalStarted = 1;
2581fa86c412Sdrh     }
258250e5dadfSdrh     pPager->needSync = 0;
258303eb96a7Sdrh 
2584db48ee02Sdrh     /* Erase the needSync flag from every page.
258503eb96a7Sdrh     */
2586bc2ca9ebSdanielk1977     sqlite3PcacheClearSyncFlags(pPager->pPCache);
2587341eae8dSdrh   }
2588341eae8dSdrh 
258981a20f21Sdrh   return rc;
259050e5dadfSdrh }
259150e5dadfSdrh 
259250e5dadfSdrh /*
25932554f8b0Sdrh ** Given a list of pages (connected by the PgHdr.pDirty pointer) write
2594a858aa2eSdanielk1977 ** every one of those pages out to the database file. No calls are made
2595a858aa2eSdanielk1977 ** to the page-cache to mark the pages as clean. It is the responsibility
2596a858aa2eSdanielk1977 ** of the caller to use PcacheCleanAll() or PcacheMakeClean() to mark
2597a858aa2eSdanielk1977 ** the pages as clean.
25982554f8b0Sdrh */
25992554f8b0Sdrh static int pager_write_pagelist(PgHdr *pList){
26002554f8b0Sdrh   Pager *pPager;
26012554f8b0Sdrh   int rc;
26022554f8b0Sdrh 
26032554f8b0Sdrh   if( pList==0 ) return SQLITE_OK;
26042554f8b0Sdrh   pPager = pList->pPager;
26059eed5057Sdanielk1977 
26069eed5057Sdanielk1977   /* At this point there may be either a RESERVED or EXCLUSIVE lock on the
26079eed5057Sdanielk1977   ** database file. If there is already an EXCLUSIVE lock, the following
2608054889ecSdrh   ** calls to sqlite3OsLock() are no-ops.
26099eed5057Sdanielk1977   **
2610a6abd041Sdrh   ** Moving the lock from RESERVED to EXCLUSIVE actually involves going
2611a6abd041Sdrh   ** through an intermediate state PENDING.   A PENDING lock prevents new
2612a6abd041Sdrh   ** readers from attaching to the database but is unsufficient for us to
2613a6abd041Sdrh   ** write.  The idea of a PENDING lock is to prevent new readers from
2614a6abd041Sdrh   ** coming in while we wait for existing readers to clear.
26159eed5057Sdanielk1977   **
2616a6abd041Sdrh   ** While the pager is in the RESERVED state, the original database file
2617a6abd041Sdrh   ** is unchanged and we can rollback without having to playback the
2618a6abd041Sdrh   ** journal into the original database file.  Once we transition to
2619a6abd041Sdrh   ** EXCLUSIVE, it means the database file has been changed and any rollback
2620a6abd041Sdrh   ** will require a journal playback.
26219eed5057Sdanielk1977   */
2622684917c2Sdrh   rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
26239eed5057Sdanielk1977   if( rc!=SQLITE_OK ){
26249eed5057Sdanielk1977     return rc;
26259eed5057Sdanielk1977   }
26269eed5057Sdanielk1977 
26272554f8b0Sdrh   while( pList ){
26287a2b1eebSdanielk1977 
26297a2b1eebSdanielk1977     /* If the file has not yet been opened, open it now. */
26307a2b1eebSdanielk1977     if( !pPager->fd->pMethods ){
26317a2b1eebSdanielk1977       assert(pPager->tempFile);
263217b90b53Sdanielk1977       rc = sqlite3PagerOpentemp(pPager, pPager->fd, pPager->vfsFlags);
26337a2b1eebSdanielk1977       if( rc ) return rc;
26347a2b1eebSdanielk1977     }
26357a2b1eebSdanielk1977 
2636687566d7Sdanielk1977     /* If there are dirty pages in the page cache with page numbers greater
2637f90b7260Sdanielk1977     ** than Pager.dbSize, this means sqlite3PagerTruncateImage() was called to
2638687566d7Sdanielk1977     ** make the file smaller (presumably by auto-vacuum code). Do not write
2639687566d7Sdanielk1977     ** any such pages to the file.
2640687566d7Sdanielk1977     */
264133e3216aSdanielk1977     if( pList->pgno<=pPager->dbSize && 0==(pList->flags&PGHDR_DONT_WRITE) ){
264262079060Sdanielk1977       i64 offset = (pList->pgno-1)*(i64)pPager->pageSize;
26438c0a791aSdanielk1977       char *pData = CODEC2(pPager, pList->pData, pList->pgno, 6);
264412dd5496Sdanielk1977 
264530d53701Sdrh       PAGERTRACE(("STORE %d page %d hash(%08x)\n",
264630d53701Sdrh                    PAGERID(pPager), pList->pgno, pager_pagehash(pList)));
2647538f570cSdrh       IOTRACE(("PGOUT %p %d\n", pPager, pList->pgno));
264862079060Sdanielk1977       rc = sqlite3OsWrite(pPager->fd, pData, pPager->pageSize, offset);
2649538f570cSdrh       PAGER_INCR(sqlite3_pager_writedb_count);
2650538f570cSdrh       PAGER_INCR(pPager->nWrite);
265186a88114Sdrh       if( pList->pgno==1 ){
265286a88114Sdrh         memcpy(&pPager->dbFileVers, &pData[24], sizeof(pPager->dbFileVers));
265386a88114Sdrh       }
26543460d19cSdanielk1977       if( pList->pgno>pPager->dbFileSize ){
26553460d19cSdanielk1977         pPager->dbFileSize = pList->pgno;
26563460d19cSdanielk1977       }
2657687566d7Sdanielk1977     }
2658687566d7Sdanielk1977 #ifndef NDEBUG
2659687566d7Sdanielk1977     else{
266030d53701Sdrh       PAGERTRACE(("NOSTORE %d page %d\n", PAGERID(pPager), pList->pgno));
2661687566d7Sdanielk1977     }
2662687566d7Sdanielk1977 #endif
26632554f8b0Sdrh     if( rc ) return rc;
26643c407374Sdanielk1977 #ifdef SQLITE_CHECK_PAGES
26653c407374Sdanielk1977     pList->pageHash = pager_pagehash(pList);
26663c407374Sdanielk1977 #endif
26672554f8b0Sdrh     pList = pList->pDirty;
26682554f8b0Sdrh   }
26698c0a791aSdanielk1977 
26702554f8b0Sdrh   return SQLITE_OK;
26712554f8b0Sdrh }
26722554f8b0Sdrh 
26732554f8b0Sdrh /*
2674f2c31ad8Sdanielk1977 ** Add the page to the sub-journal. It is the callers responsibility to
2675f2c31ad8Sdanielk1977 ** use subjRequiresPage() to check that it is really required before
2676f2c31ad8Sdanielk1977 ** calling this function.
2677f2c31ad8Sdanielk1977 */
2678f2c31ad8Sdanielk1977 static int subjournalPage(PgHdr *pPg){
2679f2c31ad8Sdanielk1977   int rc;
2680f2c31ad8Sdanielk1977   void *pData = pPg->pData;
2681f2c31ad8Sdanielk1977   Pager *pPager = pPg->pPager;
2682f2c31ad8Sdanielk1977   i64 offset = pPager->stmtNRec*(4+pPager->pageSize);
2683f2c31ad8Sdanielk1977   char *pData2 = CODEC2(pPager, pData, pPg->pgno, 7);
2684f2c31ad8Sdanielk1977 
268530d53701Sdrh   PAGERTRACE(("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno));
2686f2c31ad8Sdanielk1977 
2687f2c31ad8Sdanielk1977   assert( pageInJournal(pPg) || pPg->pgno>pPager->dbOrigSize );
2688f2c31ad8Sdanielk1977   rc = write32bits(pPager->sjfd, offset, pPg->pgno);
2689f2c31ad8Sdanielk1977   if( rc==SQLITE_OK ){
2690f2c31ad8Sdanielk1977     rc = sqlite3OsWrite(pPager->sjfd, pData2, pPager->pageSize, offset+4);
2691f2c31ad8Sdanielk1977   }
2692f2c31ad8Sdanielk1977   if( rc==SQLITE_OK ){
2693f2c31ad8Sdanielk1977     pPager->stmtNRec++;
2694f2c31ad8Sdanielk1977     assert( pPager->nSavepoint>0 );
2695f2c31ad8Sdanielk1977     rc = addToSavepointBitvecs(pPager, pPg->pgno);
2696f2c31ad8Sdanielk1977   }
2697f2c31ad8Sdanielk1977   return rc;
2698f2c31ad8Sdanielk1977 }
2699f2c31ad8Sdanielk1977 
2700f2c31ad8Sdanielk1977 
2701f2c31ad8Sdanielk1977 /*
27028c0a791aSdanielk1977 ** This function is called by the pcache layer when it has reached some
27038c0a791aSdanielk1977 ** soft memory limit. The argument is a pointer to a purgeable Pager
27048c0a791aSdanielk1977 ** object. This function attempts to make a single dirty page that has no
27058c0a791aSdanielk1977 ** outstanding references (if one exists) clean so that it can be recycled
27068c0a791aSdanielk1977 ** by the pcache layer.
27072554f8b0Sdrh */
2708a858aa2eSdanielk1977 static int pagerStress(void *p, PgHdr *pPg){
27098c0a791aSdanielk1977   Pager *pPager = (Pager *)p;
27108c0a791aSdanielk1977   int rc = SQLITE_OK;
27118f2e9a1aSdrh 
27128c20014aSdanielk1977   if( pPager->doNotSync ){
27138c20014aSdanielk1977     return SQLITE_OK;
27148c20014aSdanielk1977   }
27158c20014aSdanielk1977 
27168c0a791aSdanielk1977   assert( pPg->flags&PGHDR_DIRTY );
271767e3da7aSdanielk1977   if( pPager->errCode==SQLITE_OK ){
27188c0a791aSdanielk1977     if( pPg->flags&PGHDR_NEED_SYNC ){
27198c0a791aSdanielk1977       rc = syncJournal(pPager);
272067e3da7aSdanielk1977       if( rc==SQLITE_OK && pPager->fullSync &&
2721b3175389Sdanielk1977         !(pPager->journalMode==PAGER_JOURNALMODE_MEMORY) &&
272267e3da7aSdanielk1977         !(sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND)
27238c0a791aSdanielk1977       ){
27248c0a791aSdanielk1977         pPager->nRec = 0;
27258c0a791aSdanielk1977         rc = writeJournalHdr(pPager);
27262554f8b0Sdrh       }
27278c0a791aSdanielk1977     }
27288c0a791aSdanielk1977     if( rc==SQLITE_OK ){
2729a858aa2eSdanielk1977       pPg->pDirty = 0;
2730f2c31ad8Sdanielk1977       if( pPg->pgno>pPager->dbSize && subjRequiresPage(pPg) ){
2731f2c31ad8Sdanielk1977         rc = subjournalPage(pPg);
2732f2c31ad8Sdanielk1977       }
2733f2c31ad8Sdanielk1977       if( rc==SQLITE_OK ){
27348c0a791aSdanielk1977         rc = pager_write_pagelist(pPg);
27358c0a791aSdanielk1977       }
2736f2c31ad8Sdanielk1977     }
27378c0a791aSdanielk1977     if( rc!=SQLITE_OK ){
27388c0a791aSdanielk1977       pager_error(pPager, rc);
27398c0a791aSdanielk1977     }
274067e3da7aSdanielk1977   }
2741a858aa2eSdanielk1977 
2742a858aa2eSdanielk1977   if( rc==SQLITE_OK ){
274330d53701Sdrh     PAGERTRACE(("STRESS %d page %d\n", PAGERID(pPager), pPg->pgno));
2744a858aa2eSdanielk1977     sqlite3PcacheMakeClean(pPg);
27458c0a791aSdanielk1977   }
27468c0a791aSdanielk1977   return rc;
27478c0a791aSdanielk1977 }
27488c0a791aSdanielk1977 
27492554f8b0Sdrh 
27502554f8b0Sdrh /*
275119db9352Sdrh ** Return 1 if there is a hot journal on the given pager.
2752165ffe97Sdrh ** A hot journal is one that needs to be played back.
2753165ffe97Sdrh **
2754165ffe97Sdrh ** If the current size of the database file is 0 but a journal file
2755165ffe97Sdrh ** exists, that is probably an old journal left over from a prior
2756165ffe97Sdrh ** database with the same name.  Just delete the journal.
275719db9352Sdrh **
275819db9352Sdrh ** Return negative if unable to determine the status of the journal.
275982ed1e5bSdrh **
276082ed1e5bSdrh ** This routine does not open the journal file to examine its
276182ed1e5bSdrh ** content.  Hence, the journal might contain the name of a master
276282ed1e5bSdrh ** journal file that has been deleted, and hence not be hot.  Or
276382ed1e5bSdrh ** the header of the journal might be zeroed out.  This routine
276482ed1e5bSdrh ** does not discover these cases of a non-hot journal - if the
276582ed1e5bSdrh ** journal file exists and is not empty this routine assumes it
276682ed1e5bSdrh ** is hot.  The pager_playback() routine will discover that the
276782ed1e5bSdrh ** journal file is not really hot and will no-op.
2768165ffe97Sdrh */
2769d300b8a3Sdanielk1977 static int hasHotJournal(Pager *pPager, int *pExists){
2770b4b47411Sdanielk1977   sqlite3_vfs *pVfs = pPager->pVfs;
2771d300b8a3Sdanielk1977   int rc = SQLITE_OK;
2772ea678832Sdrh   int exists = 0;
2773ea678832Sdrh   int locked = 0;
27740a846f96Sdrh   assert( pPager!=0 );
27750a846f96Sdrh   assert( pPager->useJournal );
27760a846f96Sdrh   assert( pPager->fd->pMethods );
27770a846f96Sdrh   *pExists = 0;
2778861f7456Sdanielk1977   rc = sqlite3OsAccess(pVfs, pPager->zJournal, SQLITE_ACCESS_EXISTS, &exists);
2779861f7456Sdanielk1977   if( rc==SQLITE_OK && exists ){
2780861f7456Sdanielk1977     rc = sqlite3OsCheckReservedLock(pPager->fd, &locked);
2781bb5f18d2Sdrh   }
2782861f7456Sdanielk1977   if( rc==SQLITE_OK && exists && !locked ){
2783ad0132dfSdanielk1977     int nPage;
2784ad0132dfSdanielk1977     rc = sqlite3PagerPagecount(pPager, &nPage);
2785d300b8a3Sdanielk1977     if( rc==SQLITE_OK ){
2786d300b8a3Sdanielk1977      if( nPage==0 ){
2787fee2d25aSdanielk1977         sqlite3OsDelete(pVfs, pPager->zJournal, 0);
2788d300b8a3Sdanielk1977       }else{
2789d300b8a3Sdanielk1977         *pExists = 1;
2790d300b8a3Sdanielk1977       }
2791d300b8a3Sdanielk1977     }
2792165ffe97Sdrh   }
2793d300b8a3Sdanielk1977   return rc;
2794861f7456Sdanielk1977 }
2795861f7456Sdanielk1977 
2796165ffe97Sdrh /*
2797e180dd93Sdanielk1977 ** Read the content of page pPg out of the database file.
2798e180dd93Sdanielk1977 */
2799e180dd93Sdanielk1977 static int readDbPage(Pager *pPager, PgHdr *pPg, Pgno pgno){
2800e180dd93Sdanielk1977   int rc;
280162079060Sdanielk1977   i64 offset;
2802e180dd93Sdanielk1977   assert( MEMDB==0 );
28037a2b1eebSdanielk1977   assert(pPager->fd->pMethods||pPager->tempFile);
28047a2b1eebSdanielk1977   if( !pPager->fd->pMethods ){
28057a2b1eebSdanielk1977     return SQLITE_IOERR_SHORT_READ;
28067a2b1eebSdanielk1977   }
280762079060Sdanielk1977   offset = (pgno-1)*(i64)pPager->pageSize;
28088c0a791aSdanielk1977   rc = sqlite3OsRead(pPager->fd, pPg->pData, pPager->pageSize, offset);
2809538f570cSdrh   PAGER_INCR(sqlite3_pager_readdb_count);
2810538f570cSdrh   PAGER_INCR(pPager->nRead);
2811538f570cSdrh   IOTRACE(("PGIN %p %d\n", pPager, pgno));
281286a88114Sdrh   if( pgno==1 ){
28138c0a791aSdanielk1977     memcpy(&pPager->dbFileVers, &((u8*)pPg->pData)[24],
281486a88114Sdrh                                               sizeof(pPager->dbFileVers));
281586a88114Sdrh   }
28163084952aSdanielk1977   CODEC1(pPager, pPg->pData, pPg->pgno, 3);
281730d53701Sdrh   PAGERTRACE(("FETCH %d page %d hash(%08x)\n",
281830d53701Sdrh                PAGERID(pPager), pPg->pgno, pager_pagehash(pPg)));
2819e180dd93Sdanielk1977   return rc;
2820e180dd93Sdanielk1977 }
2821e180dd93Sdanielk1977 
2822e180dd93Sdanielk1977 
2823e180dd93Sdanielk1977 /*
2824e277be05Sdanielk1977 ** This function is called to obtain the shared lock required before
2825e277be05Sdanielk1977 ** data may be read from the pager cache. If the shared lock has already
2826e277be05Sdanielk1977 ** been obtained, this function is a no-op.
2827393f0689Sdanielk1977 **
2828393f0689Sdanielk1977 ** Immediately after obtaining the shared lock (if required), this function
2829393f0689Sdanielk1977 ** checks for a hot-journal file. If one is found, an emergency rollback
2830393f0689Sdanielk1977 ** is performed immediately.
2831ed7c855cSdrh */
2832e277be05Sdanielk1977 static int pagerSharedLock(Pager *pPager){
2833e277be05Sdanielk1977   int rc = SQLITE_OK;
2834d300b8a3Sdanielk1977   int isErrorReset = 0;
2835ed7c855cSdrh 
2836ae72d982Sdanielk1977   /* If this database is opened for exclusive access, has no outstanding
2837ae72d982Sdanielk1977   ** page references and is in an error-state, now is the chance to clear
2838ae72d982Sdanielk1977   ** the error. Discard the contents of the pager-cache and treat any
2839ae72d982Sdanielk1977   ** open journal file as a hot-journal.
2840ae72d982Sdanielk1977   */
28418c0a791aSdanielk1977   if( !MEMDB && pPager->exclusiveMode
28428c0a791aSdanielk1977    && sqlite3PcacheRefCount(pPager->pPCache)==0 && pPager->errCode
28438c0a791aSdanielk1977   ){
2844ae72d982Sdanielk1977     if( pPager->journalOpen ){
2845d300b8a3Sdanielk1977       isErrorReset = 1;
2846ae72d982Sdanielk1977     }
2847ae72d982Sdanielk1977     pPager->errCode = SQLITE_OK;
284893f7af97Sdanielk1977     pager_reset(pPager);
2849ae72d982Sdanielk1977   }
2850ae72d982Sdanielk1977 
2851ae72d982Sdanielk1977   /* If the pager is still in an error state, do not proceed. The error
2852ae72d982Sdanielk1977   ** state will be cleared at some point in the future when all page
2853ae72d982Sdanielk1977   ** references are dropped and the cache can be discarded.
2854ae72d982Sdanielk1977   */
2855ae72d982Sdanielk1977   if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
2856ae72d982Sdanielk1977     return pPager->errCode;
2857ae72d982Sdanielk1977   }
2858ae72d982Sdanielk1977 
2859d300b8a3Sdanielk1977   if( pPager->state==PAGER_UNLOCK || isErrorReset ){
2860b4b47411Sdanielk1977     sqlite3_vfs *pVfs = pPager->pVfs;
28614f21c4afSdrh     int isHotJournal = 0;
2862049fc21dSshane     assert( !MEMDB );
28638c0a791aSdanielk1977     assert( sqlite3PcacheRefCount(pPager->pPCache)==0 );
28647bec505eSdrh     if( !pPager->noReadlock ){
2865684917c2Sdrh       rc = pager_wait_on_lock(pPager, SHARED_LOCK);
28668766c343Sdrh       if( rc!=SQLITE_OK ){
286752b472aeSdanielk1977         assert( pPager->state==PAGER_UNLOCK );
2868aef0bf64Sdanielk1977         return pager_error(pPager, rc);
2869ed7c855cSdrh       }
28700371f1b2Sdanielk1977     }else if( pPager->state==PAGER_UNLOCK ){
28710371f1b2Sdanielk1977       pPager->state = PAGER_SHARED;
28727bec505eSdrh     }
28730371f1b2Sdanielk1977     assert( pPager->state>=SHARED_LOCK );
2874ed7c855cSdrh 
287513adf8a0Sdanielk1977     /* If a journal file exists, and there is no RESERVED lock on the
287613adf8a0Sdanielk1977     ** database file, then it either needs to be played back or deleted.
2877ed7c855cSdrh     */
2878d300b8a3Sdanielk1977     if( !isErrorReset ){
2879d300b8a3Sdanielk1977       rc = hasHotJournal(pPager, &isHotJournal);
2880d300b8a3Sdanielk1977       if( rc!=SQLITE_OK ){
288152b472aeSdanielk1977         goto failed;
288219db9352Sdrh       }
2883d300b8a3Sdanielk1977     }
2884d300b8a3Sdanielk1977     if( isErrorReset || isHotJournal ){
288590ba3bd0Sdanielk1977       /* Get an EXCLUSIVE lock on the database file. At this point it is
288690ba3bd0Sdanielk1977       ** important that a RESERVED lock is not obtained on the way to the
288790ba3bd0Sdanielk1977       ** EXCLUSIVE lock. If it were, another process might open the
288890ba3bd0Sdanielk1977       ** database file, detect the RESERVED lock, and conclude that the
288990ba3bd0Sdanielk1977       ** database is safe to read while this process is still rolling it
289090ba3bd0Sdanielk1977       ** back.
289190ba3bd0Sdanielk1977       **
289290ba3bd0Sdanielk1977       ** Because the intermediate RESERVED lock is not requested, the
289390ba3bd0Sdanielk1977       ** second process will get to this point in the code and fail to
289485b623f2Sdrh       ** obtain its own EXCLUSIVE lock on the database file.
289590ba3bd0Sdanielk1977       */
2896ae72d982Sdanielk1977       if( pPager->state<EXCLUSIVE_LOCK ){
2897054889ecSdrh         rc = sqlite3OsLock(pPager->fd, EXCLUSIVE_LOCK);
2898a7fcb059Sdrh         if( rc!=SQLITE_OK ){
289952b472aeSdanielk1977           rc = pager_error(pPager, rc);
290052b472aeSdanielk1977           goto failed;
2901a7fcb059Sdrh         }
2902a6abd041Sdrh         pPager->state = PAGER_EXCLUSIVE;
2903ae72d982Sdanielk1977       }
2904a7fcb059Sdrh 
290516e45a43Sdrh       /* Open the journal for read/write access. This is because in
2906979f38e5Sdanielk1977       ** exclusive-access mode the file descriptor will be kept open and
2907979f38e5Sdanielk1977       ** possibly used for a transaction later on. On some systems, the
2908979f38e5Sdanielk1977       ** OsTruncate() call used in exclusive-access mode also requires
2909979f38e5Sdanielk1977       ** a read/write file handle.
2910ed7c855cSdrh       */
2911d300b8a3Sdanielk1977       if( !isErrorReset && pPager->journalOpen==0 ){
2912861f7456Sdanielk1977         int res;
2913861f7456Sdanielk1977         rc = sqlite3OsAccess(pVfs,pPager->zJournal,SQLITE_ACCESS_EXISTS,&res);
2914861f7456Sdanielk1977         if( rc==SQLITE_OK ){
2915861f7456Sdanielk1977           if( res ){
2916b4b47411Sdanielk1977             int fout = 0;
2917ae72d982Sdanielk1977             int f = SQLITE_OPEN_READWRITE|SQLITE_OPEN_MAIN_JOURNAL;
29187152de8dSdanielk1977             assert( !pPager->tempFile );
2919ae72d982Sdanielk1977             rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, f, &fout);
2920b4b47411Sdanielk1977             assert( rc!=SQLITE_OK || pPager->jfd->pMethods );
2921281d8bd3Sdanielk1977             if( rc==SQLITE_OK && fout&SQLITE_OPEN_READONLY ){
2922281d8bd3Sdanielk1977               rc = SQLITE_CANTOPEN;
2923b4b47411Sdanielk1977               sqlite3OsClose(pPager->jfd);
2924979f38e5Sdanielk1977             }
2925861f7456Sdanielk1977           }else{
292616e45a43Sdrh             /* If the journal does not exist, that means some other process
292716e45a43Sdrh             ** has already rolled it back */
292816e45a43Sdrh             rc = SQLITE_BUSY;
2929861f7456Sdanielk1977           }
2930979f38e5Sdanielk1977         }
2931ae72d982Sdanielk1977       }
2932a7fcb059Sdrh       if( rc!=SQLITE_OK ){
293352b472aeSdanielk1977         goto failed;
2934ed7c855cSdrh       }
2935a7fcb059Sdrh       pPager->journalOpen = 1;
2936db48ee02Sdrh       pPager->journalStarted = 0;
29377657240aSdanielk1977       pPager->journalOff = 0;
29387657240aSdanielk1977       pPager->setMaster = 0;
29397657240aSdanielk1977       pPager->journalHdr = 0;
2940ed7c855cSdrh 
2941ed7c855cSdrh       /* Playback and delete the journal.  Drop the database write
2942112f752bSdanielk1977       ** lock and reacquire the read lock. Purge the cache before
2943112f752bSdanielk1977       ** playing back the hot-journal so that we don't end up with
2944ad0ea228Sdanielk1977       ** an inconsistent cache.
2945ed7c855cSdrh       */
2946112f752bSdanielk1977       sqlite3PcacheClear(pPager->pPCache);
2947e277be05Sdanielk1977       rc = pager_playback(pPager, 1);
2948ed7c855cSdrh       if( rc!=SQLITE_OK ){
294952b472aeSdanielk1977         rc = pager_error(pPager, rc);
295052b472aeSdanielk1977         goto failed;
2951ed7c855cSdrh       }
2952c5859718Sdanielk1977       assert(pPager->state==PAGER_SHARED ||
2953c5859718Sdanielk1977           (pPager->exclusiveMode && pPager->state>PAGER_SHARED)
2954c5859718Sdanielk1977       );
2955ed7c855cSdrh     }
2956e277be05Sdanielk1977 
29578c0a791aSdanielk1977     if( sqlite3PcachePagecount(pPager->pPCache)>0 ){
295824168728Sdanielk1977       /* The shared-lock has just been acquired on the database file
295924168728Sdanielk1977       ** and there are already pages in the cache (from a previous
296086a88114Sdrh       ** read or write transaction).  Check to see if the database
296186a88114Sdrh       ** has been modified.  If the database has changed, flush the
296286a88114Sdrh       ** cache.
296386a88114Sdrh       **
296486a88114Sdrh       ** Database changes is detected by looking at 15 bytes beginning
296586a88114Sdrh       ** at offset 24 into the file.  The first 4 of these 16 bytes are
296686a88114Sdrh       ** a 32-bit counter that is incremented with each change.  The
296786a88114Sdrh       ** other bytes change randomly with each file change when
296886a88114Sdrh       ** a codec is in use.
296986a88114Sdrh       **
297086a88114Sdrh       ** There is a vanishingly small chance that a change will not be
29716fa51035Sdrh       ** detected.  The chance of an undetected change is so small that
297286a88114Sdrh       ** it can be neglected.
297324168728Sdanielk1977       */
297486a88114Sdrh       char dbFileVers[sizeof(pPager->dbFileVers)];
2975ad0132dfSdanielk1977       sqlite3PagerPagecount(pPager, 0);
297624168728Sdanielk1977 
2977e180dd93Sdanielk1977       if( pPager->errCode ){
297852b472aeSdanielk1977         rc = pPager->errCode;
297952b472aeSdanielk1977         goto failed;
2980e277be05Sdanielk1977       }
2981e277be05Sdanielk1977 
2982d92db531Sdanielk1977       assert( pPager->dbSizeValid );
2983e180dd93Sdanielk1977       if( pPager->dbSize>0 ){
2984ae5e445bSdrh         IOTRACE(("CKVERS %p %d\n", pPager, sizeof(dbFileVers)));
298562079060Sdanielk1977         rc = sqlite3OsRead(pPager->fd, &dbFileVers, sizeof(dbFileVers), 24);
2986e180dd93Sdanielk1977         if( rc!=SQLITE_OK ){
298752b472aeSdanielk1977           goto failed;
2988e180dd93Sdanielk1977         }
298986a88114Sdrh       }else{
299086a88114Sdrh         memset(dbFileVers, 0, sizeof(dbFileVers));
2991e180dd93Sdanielk1977       }
2992e180dd93Sdanielk1977 
299386a88114Sdrh       if( memcmp(pPager->dbFileVers, dbFileVers, sizeof(dbFileVers))!=0 ){
2994e277be05Sdanielk1977         pager_reset(pPager);
2995e277be05Sdanielk1977       }
2996e277be05Sdanielk1977     }
29970371f1b2Sdanielk1977     assert( pPager->exclusiveMode || pPager->state==PAGER_SHARED );
2998c5859718Sdanielk1977   }
2999e277be05Sdanielk1977 
300052b472aeSdanielk1977  failed:
300152b472aeSdanielk1977   if( rc!=SQLITE_OK ){
300252b472aeSdanielk1977     /* pager_unlock() is a no-op for exclusive mode and in-memory databases. */
300352b472aeSdanielk1977     pager_unlock(pPager);
300452b472aeSdanielk1977   }
3005e277be05Sdanielk1977   return rc;
3006d9b0257aSdrh }
3007e277be05Sdanielk1977 
3008e277be05Sdanielk1977 /*
3009d33d5a89Sdrh ** Make sure we have the content for a page.  If the page was
3010d33d5a89Sdrh ** previously acquired with noContent==1, then the content was
3011d33d5a89Sdrh ** just initialized to zeros instead of being read from disk.
3012d33d5a89Sdrh ** But now we need the real data off of disk.  So make sure we
3013d33d5a89Sdrh ** have it.  Read it in if we do not have it already.
3014d33d5a89Sdrh */
3015d33d5a89Sdrh static int pager_get_content(PgHdr *pPg){
30168c0a791aSdanielk1977   if( pPg->flags&PGHDR_NEED_READ ){
3017d33d5a89Sdrh     int rc = readDbPage(pPg->pPager, pPg, pPg->pgno);
3018d33d5a89Sdrh     if( rc==SQLITE_OK ){
30198c0a791aSdanielk1977       pPg->flags &= ~PGHDR_NEED_READ;
3020d33d5a89Sdrh     }else{
3021d33d5a89Sdrh       return rc;
3022d33d5a89Sdrh     }
3023d33d5a89Sdrh   }
3024d33d5a89Sdrh   return SQLITE_OK;
3025d33d5a89Sdrh }
3026d33d5a89Sdrh 
3027d33d5a89Sdrh /*
30288c0a791aSdanielk1977 ** If the reference count has reached zero, and the pager is not in the
30298c0a791aSdanielk1977 ** middle of a write transaction or opened in exclusive mode, unlock it.
30308c0a791aSdanielk1977 */
30318c0a791aSdanielk1977 static void pagerUnlockIfUnused(Pager *pPager){
30328c0a791aSdanielk1977   if( (sqlite3PcacheRefCount(pPager->pPCache)==0)
30338c0a791aSdanielk1977     && (!pPager->exclusiveMode || pPager->journalOff>0)
30348c0a791aSdanielk1977   ){
30358c0a791aSdanielk1977     pagerUnlockAndRollback(pPager);
30368c0a791aSdanielk1977   }
30378c0a791aSdanielk1977 }
30388c0a791aSdanielk1977 
30398c0a791aSdanielk1977 /*
30408c0a791aSdanielk1977 ** Drop a page from the cache using sqlite3PcacheDrop().
30418c0a791aSdanielk1977 **
30428c0a791aSdanielk1977 ** If this means there are now no pages with references to them, a rollback
30438c0a791aSdanielk1977 ** occurs and the lock on the database is removed.
30448c0a791aSdanielk1977 */
30458c0a791aSdanielk1977 static void pagerDropPage(DbPage *pPg){
30468c0a791aSdanielk1977   Pager *pPager = pPg->pPager;
30478c0a791aSdanielk1977   sqlite3PcacheDrop(pPg);
30488c0a791aSdanielk1977   pagerUnlockIfUnused(pPager);
30498c0a791aSdanielk1977 }
30508c0a791aSdanielk1977 
30518c0a791aSdanielk1977 /*
3052e277be05Sdanielk1977 ** Acquire a page.
3053e277be05Sdanielk1977 **
3054e277be05Sdanielk1977 ** A read lock on the disk file is obtained when the first page is acquired.
3055e277be05Sdanielk1977 ** This read lock is dropped when the last page is released.
3056e277be05Sdanielk1977 **
3057d33d5a89Sdrh ** This routine works for any page number greater than 0.  If the database
3058e277be05Sdanielk1977 ** file is smaller than the requested page, then no actual disk
3059e277be05Sdanielk1977 ** read occurs and the memory image of the page is initialized to
3060e277be05Sdanielk1977 ** all zeros.  The extra data appended to a page is always initialized
3061e277be05Sdanielk1977 ** to zeros the first time a page is loaded into memory.
3062e277be05Sdanielk1977 **
3063e277be05Sdanielk1977 ** The acquisition might fail for several reasons.  In all cases,
3064e277be05Sdanielk1977 ** an appropriate error code is returned and *ppPage is set to NULL.
3065e277be05Sdanielk1977 **
3066d33d5a89Sdrh ** See also sqlite3PagerLookup().  Both this routine and Lookup() attempt
3067e277be05Sdanielk1977 ** to find a page in the in-memory cache first.  If the page is not already
3068d33d5a89Sdrh ** in memory, this routine goes to disk to read it in whereas Lookup()
3069e277be05Sdanielk1977 ** just returns 0.  This routine acquires a read-lock the first time it
3070e277be05Sdanielk1977 ** has to go to disk, and could also playback an old journal if necessary.
3071d33d5a89Sdrh ** Since Lookup() never goes to disk, it never has to deal with locks
3072e277be05Sdanielk1977 ** or journal files.
3073e277be05Sdanielk1977 **
3074538f570cSdrh ** If noContent is false, the page contents are actually read from disk.
3075538f570cSdrh ** If noContent is true, it means that we do not care about the contents
3076538f570cSdrh ** of the page at this time, so do not do a disk read.  Just fill in the
3077538f570cSdrh ** page content with zeros.  But mark the fact that we have not read the
3078538f570cSdrh ** content by setting the PgHdr.needRead flag.  Later on, if
3079d33d5a89Sdrh ** sqlite3PagerWrite() is called on this page or if this routine is
3080d33d5a89Sdrh ** called again with noContent==0, that means that the content is needed
3081d33d5a89Sdrh ** and the disk read should occur at that point.
3082e277be05Sdanielk1977 */
308365e0ff32Sdanielk1977 int sqlite3PagerAcquire(
3084538f570cSdrh   Pager *pPager,      /* The pager open on the database file */
3085538f570cSdrh   Pgno pgno,          /* Page number to fetch */
3086538f570cSdrh   DbPage **ppPage,    /* Write a pointer to the page here */
3087538f570cSdrh   int noContent       /* Do not bother reading content from disk if true */
3088538f570cSdrh ){
30898c0a791aSdanielk1977   PgHdr *pPg = 0;
3090e277be05Sdanielk1977   int rc;
3091e277be05Sdanielk1977 
30928c0a791aSdanielk1977   assert( pPager->state==PAGER_UNLOCK
30938c0a791aSdanielk1977        || sqlite3PcacheRefCount(pPager->pPCache)>0
30948c0a791aSdanielk1977        || pgno==1
30958c0a791aSdanielk1977   );
3096e277be05Sdanielk1977 
3097e277be05Sdanielk1977   /* The maximum page number is 2^31. Return SQLITE_CORRUPT if a page
3098e277be05Sdanielk1977   ** number greater than this, or zero, is requested.
3099e277be05Sdanielk1977   */
3100e277be05Sdanielk1977   if( pgno>PAGER_MAX_PGNO || pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){
3101e277be05Sdanielk1977     return SQLITE_CORRUPT_BKPT;
3102e277be05Sdanielk1977   }
3103e277be05Sdanielk1977 
3104e277be05Sdanielk1977   /* Make sure we have not hit any critical errors.
3105e277be05Sdanielk1977   */
3106e277be05Sdanielk1977   assert( pPager!=0 );
3107e277be05Sdanielk1977   *ppPage = 0;
3108e277be05Sdanielk1977 
3109e277be05Sdanielk1977   /* If this is the first page accessed, then get a SHARED lock
3110334cdb63Sdanielk1977   ** on the database file. pagerSharedLock() is a no-op if
3111334cdb63Sdanielk1977   ** a database lock is already held.
3112e277be05Sdanielk1977   */
3113e277be05Sdanielk1977   rc = pagerSharedLock(pPager);
3114e277be05Sdanielk1977   if( rc!=SQLITE_OK ){
3115e277be05Sdanielk1977     return rc;
3116e277be05Sdanielk1977   }
3117e277be05Sdanielk1977   assert( pPager->state!=PAGER_UNLOCK );
3118e277be05Sdanielk1977 
31198c0a791aSdanielk1977   rc = sqlite3PcacheFetch(pPager->pPCache, pgno, 1, &pPg);
3120db48ee02Sdrh   if( rc!=SQLITE_OK ){
312175bab7d6Sdanielk1977     return rc;
3122db48ee02Sdrh   }
31238c0a791aSdanielk1977   if( pPg->pPager==0 ){
31248c0a791aSdanielk1977     /* The pager cache has created a new page. Its content needs to
31258c0a791aSdanielk1977     ** be initialized.
31268c0a791aSdanielk1977     */
31278c0a791aSdanielk1977     int nMax;
31288c0a791aSdanielk1977     PAGER_INCR(pPager->nMiss);
31298c0a791aSdanielk1977     pPg->pPager = pPager;
31308c0a791aSdanielk1977     memset(pPg->pExtra, 0, pPager->nExtra);
31318c0a791aSdanielk1977 
3132ad0132dfSdanielk1977     rc = sqlite3PagerPagecount(pPager, &nMax);
3133ad0132dfSdanielk1977     if( rc!=SQLITE_OK ){
3134ae72d982Sdanielk1977       sqlite3PagerUnref(pPg);
31352e6d11bcSdrh       return rc;
31362e6d11bcSdrh     }
313775bab7d6Sdanielk1977 
3138a1fa00d9Sdanielk1977     if( nMax<(int)pgno || MEMDB || noContent ){
3139f8e632b6Sdrh       if( pgno>pPager->mxPgno ){
3140de3bea7bSdanielk1977         sqlite3PagerUnref(pPg);
3141f8e632b6Sdrh         return SQLITE_FULL;
3142f8e632b6Sdrh       }
31438c0a791aSdanielk1977       memset(pPg->pData, 0, pPager->pageSize);
3144a1fa00d9Sdanielk1977       if( noContent ){
31458c0a791aSdanielk1977         pPg->flags |= PGHDR_NEED_READ;
31468c0a791aSdanielk1977       }
3147538f570cSdrh       IOTRACE(("ZERO %p %d\n", pPager, pgno));
3148306dc213Sdrh     }else{
3149e180dd93Sdanielk1977       rc = readDbPage(pPager, pPg, pgno);
3150551b7736Sdrh       if( rc!=SQLITE_OK && rc!=SQLITE_IOERR_SHORT_READ ){
31518c0a791aSdanielk1977         /* sqlite3PagerUnref(pPg); */
31528c0a791aSdanielk1977         pagerDropPage(pPg);
315397a227c9Sdanielk1977         return rc;
315481a20f21Sdrh       }
3155306dc213Sdrh     }
31563c407374Sdanielk1977 #ifdef SQLITE_CHECK_PAGES
31573c407374Sdanielk1977     pPg->pageHash = pager_pagehash(pPg);
31583c407374Sdanielk1977 #endif
3159ed7c855cSdrh   }else{
3160d9b0257aSdrh     /* The requested page is in the page cache. */
31618c0a791aSdanielk1977     assert(sqlite3PcacheRefCount(pPager->pPCache)>0 || pgno==1);
3162538f570cSdrh     PAGER_INCR(pPager->nHit);
3163d33d5a89Sdrh     if( !noContent ){
3164d33d5a89Sdrh       rc = pager_get_content(pPg);
3165d33d5a89Sdrh       if( rc ){
31668c0a791aSdanielk1977         sqlite3PagerUnref(pPg);
3167d33d5a89Sdrh         return rc;
3168d33d5a89Sdrh       }
3169d33d5a89Sdrh     }
3170ed7c855cSdrh   }
31718c0a791aSdanielk1977 
31723b8a05f6Sdanielk1977   *ppPage = pPg;
3173ed7c855cSdrh   return SQLITE_OK;
3174ed7c855cSdrh }
31758c0a791aSdanielk1977 
3176ed7c855cSdrh /*
31777e3b0a07Sdrh ** Acquire a page if it is already in the in-memory cache.  Do
31787e3b0a07Sdrh ** not read the page from disk.  Return a pointer to the page,
31797e3b0a07Sdrh ** or 0 if the page is not in cache.
31807e3b0a07Sdrh **
31813b8a05f6Sdanielk1977 ** See also sqlite3PagerGet().  The difference between this routine
31823b8a05f6Sdanielk1977 ** and sqlite3PagerGet() is that _get() will go to the disk and read
31837e3b0a07Sdrh ** in the page if the page is not already in cache.  This routine
31845e00f6c7Sdrh ** returns NULL if the page is not in cache or if a disk I/O error
31855e00f6c7Sdrh ** has ever happened.
31867e3b0a07Sdrh */
31873b8a05f6Sdanielk1977 DbPage *sqlite3PagerLookup(Pager *pPager, Pgno pgno){
318886f8c197Sdrh   PgHdr *pPg = 0;
3189836faa48Sdrh   assert( pPager!=0 );
3190836faa48Sdrh   assert( pgno!=0 );
3191e277be05Sdanielk1977 
31928c0a791aSdanielk1977   if( (pPager->state!=PAGER_UNLOCK)
31938c0a791aSdanielk1977    && (pPager->errCode==SQLITE_OK || pPager->errCode==SQLITE_FULL)
31948c0a791aSdanielk1977   ){
31958c0a791aSdanielk1977     sqlite3PcacheFetch(pPager->pPCache, pgno, 0, &pPg);
319686f8c197Sdrh   }
31978c0a791aSdanielk1977 
31983b8a05f6Sdanielk1977   return pPg;
31997e3b0a07Sdrh }
32007e3b0a07Sdrh 
32017e3b0a07Sdrh /*
3202ed7c855cSdrh ** Release a page.
3203ed7c855cSdrh **
3204ed7c855cSdrh ** If the number of references to the page drop to zero, then the
3205ed7c855cSdrh ** page is added to the LRU list.  When all references to all pages
3206d9b0257aSdrh ** are released, a rollback occurs and the lock on the database is
3207ed7c855cSdrh ** removed.
3208ed7c855cSdrh */
32093b8a05f6Sdanielk1977 int sqlite3PagerUnref(DbPage *pPg){
32108c0a791aSdanielk1977   if( pPg ){
32118c0a791aSdanielk1977     Pager *pPager = pPg->pPager;
32128c0a791aSdanielk1977     sqlite3PcacheRelease(pPg);
32138c0a791aSdanielk1977     pagerUnlockIfUnused(pPager);
32148c0a791aSdanielk1977   }
3215d9b0257aSdrh   return SQLITE_OK;
3216d9b0257aSdrh }
3217ed7c855cSdrh 
32189153d850Sdanielk1977 /*
32199153d850Sdanielk1977 ** If the main journal file has already been opened, ensure that the
32209153d850Sdanielk1977 ** sub-journal file is open too. If the main journal is not open,
32219153d850Sdanielk1977 ** this function is a no-op.
32229153d850Sdanielk1977 **
32239153d850Sdanielk1977 ** SQLITE_OK is returned if everything goes according to plan. An
32249153d850Sdanielk1977 ** SQLITE_IOERR_XXX error code is returned if the call to
32259153d850Sdanielk1977 ** sqlite3OsOpen() fails.
32269153d850Sdanielk1977 */
3227fd7f0452Sdanielk1977 static int openSubJournal(Pager *pPager){
3228fd7f0452Sdanielk1977   int rc = SQLITE_OK;
3229fd7f0452Sdanielk1977   if( pPager->journalOpen && !pPager->sjfd->pMethods ){
3230fd7f0452Sdanielk1977     if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY ){
3231fd7f0452Sdanielk1977       sqlite3MemJournalOpen(pPager->sjfd);
3232fd7f0452Sdanielk1977     }else{
3233fd7f0452Sdanielk1977       rc = sqlite3PagerOpentemp(pPager, pPager->sjfd, SQLITE_OPEN_SUBJOURNAL);
3234fd7f0452Sdanielk1977     }
3235fd7f0452Sdanielk1977   }
3236fd7f0452Sdanielk1977   return rc;
3237fd7f0452Sdanielk1977 }
3238fd7f0452Sdanielk1977 
3239ed7c855cSdrh /*
3240a6abd041Sdrh ** Create a journal file for pPager.  There should already be a RESERVED
3241a6abd041Sdrh ** or EXCLUSIVE lock on the database file when this routine is called.
3242da47d774Sdrh **
3243da47d774Sdrh ** Return SQLITE_OK if everything.  Return an error code and release the
3244da47d774Sdrh ** write lock if anything goes wrong.
3245da47d774Sdrh */
3246da47d774Sdrh static int pager_open_journal(Pager *pPager){
3247b4b47411Sdanielk1977   sqlite3_vfs *pVfs = pPager->pVfs;
3248b4b47411Sdanielk1977   int flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_EXCLUSIVE|SQLITE_OPEN_CREATE);
3249b4b47411Sdanielk1977 
3250da47d774Sdrh   int rc;
3251a6abd041Sdrh   assert( pPager->state>=PAGER_RESERVED );
3252da47d774Sdrh   assert( pPager->useJournal );
3253f5e7bb51Sdrh   assert( pPager->pInJournal==0 );
3254ad0132dfSdanielk1977   sqlite3PagerPagecount(pPager, 0);
3255f5e7bb51Sdrh   pPager->pInJournal = sqlite3BitvecCreate(pPager->dbSize);
3256f5e7bb51Sdrh   if( pPager->pInJournal==0 ){
32579c105bb9Sdrh     rc = SQLITE_NOMEM;
32589c105bb9Sdrh     goto failed_to_open_journal;
3259da47d774Sdrh   }
3260b4b47411Sdanielk1977 
3261fdc40e91Sdrh   if( pPager->journalOpen==0 ){
3262b4b47411Sdanielk1977     if( pPager->tempFile ){
3263fee2d25aSdanielk1977       flags |= (SQLITE_OPEN_DELETEONCLOSE|SQLITE_OPEN_TEMP_JOURNAL);
3264fee2d25aSdanielk1977     }else{
3265fee2d25aSdanielk1977       flags |= (SQLITE_OPEN_MAIN_JOURNAL);
3266b4b47411Sdanielk1977     }
3267b3175389Sdanielk1977     if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY ){
3268b3175389Sdanielk1977       sqlite3MemJournalOpen(pPager->jfd);
3269b3175389Sdanielk1977       rc = SQLITE_OK;
3270b3175389Sdanielk1977     }else{
3271c7b6017cSdanielk1977 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
3272c7b6017cSdanielk1977       rc = sqlite3JournalOpen(
3273c7b6017cSdanielk1977           pVfs, pPager->zJournal, pPager->jfd, flags, jrnlBufferSize(pPager)
3274c7b6017cSdanielk1977       );
3275c7b6017cSdanielk1977 #else
3276b4b47411Sdanielk1977       rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, flags, 0);
3277c7b6017cSdanielk1977 #endif
3278b3175389Sdanielk1977     }
3279b4b47411Sdanielk1977     assert( rc!=SQLITE_OK || pPager->jfd->pMethods );
32807657240aSdanielk1977     pPager->journalOff = 0;
32817657240aSdanielk1977     pPager->setMaster = 0;
32827657240aSdanielk1977     pPager->journalHdr = 0;
3283da47d774Sdrh     if( rc!=SQLITE_OK ){
3284600e46a0Sdrh       if( rc==SQLITE_NOMEM ){
3285fee2d25aSdanielk1977         sqlite3OsDelete(pVfs, pPager->zJournal, 0);
3286600e46a0Sdrh       }
32879c105bb9Sdrh       goto failed_to_open_journal;
3288da47d774Sdrh     }
3289fdc40e91Sdrh   }
3290da47d774Sdrh   pPager->journalOpen = 1;
3291db48ee02Sdrh   pPager->journalStarted = 0;
3292da47d774Sdrh   pPager->needSync = 0;
3293968af52aSdrh   pPager->nRec = 0;
3294efaaf579Sdanielk1977   if( pPager->errCode ){
3295efaaf579Sdanielk1977     rc = pPager->errCode;
3296dd5b2fa5Sdrh     goto failed_to_open_journal;
32972e6d11bcSdrh   }
32983460d19cSdanielk1977   pPager->dbOrigSize = pPager->dbSize;
3299ae2b40c4Sdrh 
33007657240aSdanielk1977   rc = writeJournalHdr(pPager);
33017657240aSdanielk1977 
3302fd7f0452Sdanielk1977   if( pPager->nSavepoint && rc==SQLITE_OK ){
3303fd7f0452Sdanielk1977     rc = openSubJournal(pPager);
3304da47d774Sdrh   }
3305ae72d982Sdanielk1977   if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM && rc!=SQLITE_IOERR_NOMEM ){
3306df2566a3Sdanielk1977     rc = pager_end_transaction(pPager, 0);
3307da47d774Sdrh     if( rc==SQLITE_OK ){
3308da47d774Sdrh       rc = SQLITE_FULL;
3309da47d774Sdrh     }
3310da47d774Sdrh   }
3311da47d774Sdrh   return rc;
33129c105bb9Sdrh 
33139c105bb9Sdrh failed_to_open_journal:
3314f5e7bb51Sdrh   sqlite3BitvecDestroy(pPager->pInJournal);
3315f5e7bb51Sdrh   pPager->pInJournal = 0;
33169c105bb9Sdrh   return rc;
3317da47d774Sdrh }
3318da47d774Sdrh 
3319da47d774Sdrh /*
33204b845d7eSdrh ** Acquire a write-lock on the database.  The lock is removed when
33214b845d7eSdrh ** the any of the following happen:
33224b845d7eSdrh **
332380e35f46Sdrh **   *  sqlite3PagerCommitPhaseTwo() is called.
33243b8a05f6Sdanielk1977 **   *  sqlite3PagerRollback() is called.
33253b8a05f6Sdanielk1977 **   *  sqlite3PagerClose() is called.
33263b8a05f6Sdanielk1977 **   *  sqlite3PagerUnref() is called to on every outstanding page.
33274b845d7eSdrh **
332813adf8a0Sdanielk1977 ** The first parameter to this routine is a pointer to any open page of the
332913adf8a0Sdanielk1977 ** database file.  Nothing changes about the page - it is used merely to
333013adf8a0Sdanielk1977 ** acquire a pointer to the Pager structure and as proof that there is
333113adf8a0Sdanielk1977 ** already a read-lock on the database.
33324b845d7eSdrh **
333313adf8a0Sdanielk1977 ** The second parameter indicates how much space in bytes to reserve for a
333413adf8a0Sdanielk1977 ** master journal file-name at the start of the journal when it is created.
333513adf8a0Sdanielk1977 **
333613adf8a0Sdanielk1977 ** A journal file is opened if this is not a temporary file.  For temporary
333713adf8a0Sdanielk1977 ** files, the opening of the journal file is deferred until there is an
333813adf8a0Sdanielk1977 ** actual need to write to the journal.
3339da47d774Sdrh **
3340a6abd041Sdrh ** If the database is already reserved for writing, this routine is a no-op.
3341684917c2Sdrh **
3342684917c2Sdrh ** If exFlag is true, go ahead and get an EXCLUSIVE lock on the file
3343684917c2Sdrh ** immediately instead of waiting until we try to flush the cache.  The
3344684917c2Sdrh ** exFlag is ignored if a transaction is already active.
33454b845d7eSdrh */
33463b8a05f6Sdanielk1977 int sqlite3PagerBegin(DbPage *pPg, int exFlag){
33474b845d7eSdrh   Pager *pPager = pPg->pPager;
33484b845d7eSdrh   int rc = SQLITE_OK;
33494b845d7eSdrh   assert( pPg->nRef>0 );
3350a6abd041Sdrh   assert( pPager->state!=PAGER_UNLOCK );
3351a6abd041Sdrh   if( pPager->state==PAGER_SHARED ){
3352f5e7bb51Sdrh     assert( pPager->pInJournal==0 );
3353b3175389Sdanielk1977     assert( !MEMDB );
3354054889ecSdrh     rc = sqlite3OsLock(pPager->fd, RESERVED_LOCK);
3355684917c2Sdrh     if( rc==SQLITE_OK ){
3356684917c2Sdrh       pPager->state = PAGER_RESERVED;
3357684917c2Sdrh       if( exFlag ){
3358684917c2Sdrh         rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
3359684917c2Sdrh       }
3360684917c2Sdrh     }
33614b845d7eSdrh     if( rc!=SQLITE_OK ){
33624b845d7eSdrh       return rc;
33634b845d7eSdrh     }
3364a6abd041Sdrh     pPager->dirtyCache = 0;
336530d53701Sdrh     PAGERTRACE(("TRANSACTION %d\n", PAGERID(pPager)));
3366fdc40e91Sdrh     if( pPager->useJournal && !pPager->tempFile
3367fdc40e91Sdrh            && pPager->journalMode!=PAGER_JOURNALMODE_OFF ){
3368da47d774Sdrh       rc = pager_open_journal(pPager);
33694b845d7eSdrh     }
3370334cdb63Sdanielk1977   }else if( pPager->journalOpen && pPager->journalOff==0 ){
3371d138c016Sdrh     /* This happens when the pager was in exclusive-access mode the last
3372334cdb63Sdanielk1977     ** time a (read or write) transaction was successfully concluded
3373334cdb63Sdanielk1977     ** by this connection. Instead of deleting the journal file it was
3374d138c016Sdrh     ** kept open and either was truncated to 0 bytes or its header was
3375d138c016Sdrh     ** overwritten with zeros.
3376334cdb63Sdanielk1977     */
3377334cdb63Sdanielk1977     assert( pPager->nRec==0 );
33783460d19cSdanielk1977     assert( pPager->dbOrigSize==0 );
3379f5e7bb51Sdrh     assert( pPager->pInJournal==0 );
3380ad0132dfSdanielk1977     sqlite3PagerPagecount(pPager, 0);
3381f5e7bb51Sdrh     pPager->pInJournal = sqlite3BitvecCreate( pPager->dbSize );
3382f5e7bb51Sdrh     if( !pPager->pInJournal ){
3383334cdb63Sdanielk1977       rc = SQLITE_NOMEM;
3384334cdb63Sdanielk1977     }else{
33853460d19cSdanielk1977       pPager->dbOrigSize = pPager->dbSize;
3386334cdb63Sdanielk1977       rc = writeJournalHdr(pPager);
3387ac69b05eSdrh     }
3388334cdb63Sdanielk1977   }
3389334cdb63Sdanielk1977   assert( !pPager->journalOpen || pPager->journalOff>0 || rc!=SQLITE_OK );
33904b845d7eSdrh   return rc;
33914b845d7eSdrh }
33924b845d7eSdrh 
33934b845d7eSdrh /*
3394ed7c855cSdrh ** Mark a data page as writeable.  The page is written into the journal
3395ed7c855cSdrh ** if it is not there already.  This routine must be called before making
3396ed7c855cSdrh ** changes to a page.
3397ed7c855cSdrh **
3398ed7c855cSdrh ** The first time this routine is called, the pager creates a new
3399a6abd041Sdrh ** journal and acquires a RESERVED lock on the database.  If the RESERVED
3400ed7c855cSdrh ** lock could not be acquired, this routine returns SQLITE_BUSY.  The
3401306dc213Sdrh ** calling routine must check for that return value and be careful not to
3402ed7c855cSdrh ** change any page data until this routine returns SQLITE_OK.
3403d9b0257aSdrh **
3404d9b0257aSdrh ** If the journal file could not be written because the disk is full,
3405d9b0257aSdrh ** then this routine returns SQLITE_FULL and does an immediate rollback.
3406d9b0257aSdrh ** All subsequent write attempts also return SQLITE_FULL until there
34073b8a05f6Sdanielk1977 ** is a call to sqlite3PagerCommit() or sqlite3PagerRollback() to
3408d9b0257aSdrh ** reset.
3409ed7c855cSdrh */
34103b8a05f6Sdanielk1977 static int pager_write(PgHdr *pPg){
34118c0a791aSdanielk1977   void *pData = pPg->pData;
341269688d5fSdrh   Pager *pPager = pPg->pPager;
3413d79caebaSdrh   int rc = SQLITE_OK;
341469688d5fSdrh 
34156446c4dcSdrh   /* Check for errors
34166446c4dcSdrh   */
3417efaaf579Sdanielk1977   if( pPager->errCode ){
3418efaaf579Sdanielk1977     return pPager->errCode;
3419d9b0257aSdrh   }
34205e00f6c7Sdrh   if( pPager->readOnly ){
34215e00f6c7Sdrh     return SQLITE_PERM;
34225e00f6c7Sdrh   }
34236446c4dcSdrh 
34247657240aSdanielk1977   assert( !pPager->setMaster );
34257657240aSdanielk1977 
34263c407374Sdanielk1977   CHECK_PAGE(pPg);
34273c407374Sdanielk1977 
3428538f570cSdrh   /* If this page was previously acquired with noContent==1, that means
3429538f570cSdrh   ** we didn't really read in the content of the page.  This can happen
3430538f570cSdrh   ** (for example) when the page is being moved to the freelist.  But
3431538f570cSdrh   ** now we are (perhaps) moving the page off of the freelist for
3432538f570cSdrh   ** reuse and we need to know its original content so that content
3433538f570cSdrh   ** can be stored in the rollback journal.  So do the read at this
3434538f570cSdrh   ** time.
3435538f570cSdrh   */
3436d33d5a89Sdrh   rc = pager_get_content(pPg);
3437d33d5a89Sdrh   if( rc ){
3438538f570cSdrh     return rc;
3439538f570cSdrh   }
3440538f570cSdrh 
34416446c4dcSdrh   /* Mark the page as dirty.  If the page has already been written
34426446c4dcSdrh   ** to the journal then we can return right away.
34436446c4dcSdrh   */
3444c047b9f7Sdrh   sqlite3PcacheMakeDirty(pPg);
34453460d19cSdanielk1977   if( pageInJournal(pPg) && !subjRequiresPage(pPg) ){
3446a6abd041Sdrh     pPager->dirtyCache = 1;
3447d138c016Sdrh     pPager->dbModified = 1;
3448a0bf2652Sdanielk1977   }else{
34496446c4dcSdrh 
34506446c4dcSdrh     /* If we get this far, it means that the page needs to be
3451fa86c412Sdrh     ** written to the transaction journal or the ckeckpoint journal
3452fa86c412Sdrh     ** or both.
3453fa86c412Sdrh     **
3454fa86c412Sdrh     ** First check to see that the transaction journal exists and
3455fa86c412Sdrh     ** create it if it does not.
34566446c4dcSdrh     */
3457a6abd041Sdrh     assert( pPager->state!=PAGER_UNLOCK );
34583b8a05f6Sdanielk1977     rc = sqlite3PagerBegin(pPg, 0);
3459da47d774Sdrh     if( rc!=SQLITE_OK ){
3460da47d774Sdrh       return rc;
3461da47d774Sdrh     }
3462a6abd041Sdrh     assert( pPager->state>=PAGER_RESERVED );
3463fdc40e91Sdrh     if( !pPager->journalOpen && pPager->useJournal
3464fdc40e91Sdrh           && pPager->journalMode!=PAGER_JOURNALMODE_OFF ){
3465da47d774Sdrh       rc = pager_open_journal(pPager);
3466da47d774Sdrh       if( rc!=SQLITE_OK ) return rc;
3467da47d774Sdrh     }
3468a6abd041Sdrh     pPager->dirtyCache = 1;
3469d138c016Sdrh     pPager->dbModified = 1;
34706446c4dcSdrh 
3471a6abd041Sdrh     /* The transaction journal now exists and we have a RESERVED or an
3472a6abd041Sdrh     ** EXCLUSIVE lock on the main database file.  Write the current page to
3473a6abd041Sdrh     ** the transaction journal if it is not there already.
34746446c4dcSdrh     */
3475bc2ca9ebSdanielk1977     if( !pageInJournal(pPg) && pPager->journalOpen ){
34763460d19cSdanielk1977       if( pPg->pgno<=pPager->dbOrigSize ){
3477bf4bca54Sdrh         u32 cksum;
3478bf4bca54Sdrh         char *pData2;
3479dd97a49cSdanielk1977 
3480267cb326Sdrh         /* We should never write to the journal file the page that
3481267cb326Sdrh         ** contains the database locks.  The following assert verifies
3482267cb326Sdrh         ** that we do not. */
3483267cb326Sdrh         assert( pPg->pgno!=PAGER_MJ_PGNO(pPager) );
3484c001c58aSdrh         pData2 = CODEC2(pPager, pData, pPg->pgno, 7);
34853752785fSdrh         cksum = pager_cksum(pPager, (u8*)pData2);
3486bf4bca54Sdrh         rc = write32bits(pPager->jfd, pPager->journalOff, pPg->pgno);
3487bf4bca54Sdrh         if( rc==SQLITE_OK ){
3488bf4bca54Sdrh           rc = sqlite3OsWrite(pPager->jfd, pData2, pPager->pageSize,
3489bf4bca54Sdrh                               pPager->journalOff + 4);
3490bf4bca54Sdrh           pPager->journalOff += pPager->pageSize+4;
3491bf4bca54Sdrh         }
3492bf4bca54Sdrh         if( rc==SQLITE_OK ){
3493bf4bca54Sdrh           rc = write32bits(pPager->jfd, pPager->journalOff, cksum);
3494bf4bca54Sdrh           pPager->journalOff += 4;
3495bf4bca54Sdrh         }
3496b0603416Sdrh         IOTRACE(("JOUT %p %d %lld %d\n", pPager, pPg->pgno,
3497667a6c98Sdanielk1977                  pPager->journalOff, pPager->pageSize));
3498538f570cSdrh         PAGER_INCR(sqlite3_pager_writej_count);
349930d53701Sdrh         PAGERTRACE(("JOURNAL %d page %d needSync=%d hash(%08x)\n",
35008c0a791aSdanielk1977              PAGERID(pPager), pPg->pgno,
350130d53701Sdrh              ((pPg->flags&PGHDR_NEED_SYNC)?1:0), pager_pagehash(pPg)));
350207cb560bSdanielk1977 
3503f3107512Sdanielk1977         /* Even if an IO or diskfull error occurred while journalling the
3504f3107512Sdanielk1977         ** page in the block above, set the need-sync flag for the page.
3505f3107512Sdanielk1977         ** Otherwise, when the transaction is rolled back, the logic in
3506f3107512Sdanielk1977         ** playback_one_page() will think that the page needs to be restored
3507f3107512Sdanielk1977         ** in the database file. And if an IO error occurs while doing so,
3508f3107512Sdanielk1977         ** then corruption may follow.
3509f3107512Sdanielk1977         */
3510f3107512Sdanielk1977         if( !pPager->noSync ){
3511f3107512Sdanielk1977           pPg->flags |= PGHDR_NEED_SYNC;
3512a4124bdfSdanielk1977           pPager->needSync = 1;
3513f3107512Sdanielk1977         }
3514f3107512Sdanielk1977 
351507cb560bSdanielk1977         /* An error has occured writing to the journal file. The
351607cb560bSdanielk1977         ** transaction will be rolled back by the layer above.
351707cb560bSdanielk1977         */
3518d9b0257aSdrh         if( rc!=SQLITE_OK ){
3519d9b0257aSdrh           return rc;
3520d9b0257aSdrh         }
352107cb560bSdanielk1977 
352299ee3600Sdrh         pPager->nRec++;
3523f5e7bb51Sdrh         assert( pPager->pInJournal!=0 );
35247539b6b8Sdrh         rc = sqlite3BitvecSet(pPager->pInJournal, pPg->pgno);
35257539b6b8Sdrh         testcase( rc==SQLITE_NOMEM );
35267539b6b8Sdrh         assert( rc==SQLITE_OK || rc==SQLITE_NOMEM );
35277539b6b8Sdrh         rc |= addToSavepointBitvecs(pPager, pPg->pgno);
35287539b6b8Sdrh         if( rc!=SQLITE_OK ){
35297539b6b8Sdrh           assert( rc==SQLITE_NOMEM );
35307539b6b8Sdrh           return rc;
35317539b6b8Sdrh         }
3532db48ee02Sdrh       }else{
35338c0a791aSdanielk1977         if( !pPager->journalStarted && !pPager->noSync ){
35348c0a791aSdanielk1977           pPg->flags |= PGHDR_NEED_SYNC;
3535a4124bdfSdanielk1977           pPager->needSync = 1;
3536db48ee02Sdrh         }
353730d53701Sdrh         PAGERTRACE(("APPEND %d page %d needSync=%d\n",
35388c0a791aSdanielk1977                 PAGERID(pPager), pPg->pgno,
353930d53701Sdrh                ((pPg->flags&PGHDR_NEED_SYNC)?1:0)));
35408c0a791aSdanielk1977       }
3541d9b0257aSdrh     }
35426446c4dcSdrh 
3543ac69b05eSdrh     /* If the statement journal is open and the page is not in it,
3544ac69b05eSdrh     ** then write the current page to the statement journal.  Note that
3545ae2b40c4Sdrh     ** the statement journal format differs from the standard journal format
3546ae2b40c4Sdrh     ** in that it omits the checksums and the header.
35476446c4dcSdrh     */
35483460d19cSdanielk1977     if( subjRequiresPage(pPg) ){
3549f2c31ad8Sdanielk1977       rc = subjournalPage(pPg);
3550ac69b05eSdrh     }
3551fa86c412Sdrh   }
3552fa86c412Sdrh 
3553fa86c412Sdrh   /* Update the database size and return.
3554fa86c412Sdrh   */
35551aa2d8b5Sdrh   assert( pPager->state>=PAGER_SHARED );
3556d92db531Sdanielk1977   if( pPager->dbSize<pPg->pgno ){
3557306dc213Sdrh     pPager->dbSize = pPg->pgno;
3558d92db531Sdanielk1977     if( pPager->dbSize==(PAGER_MJ_PGNO(pPager)-1) ){
35591f595716Sdrh       pPager->dbSize++;
35601f595716Sdrh     }
3561306dc213Sdrh   }
356269688d5fSdrh   return rc;
3563ed7c855cSdrh }
3564ed7c855cSdrh 
3565ed7c855cSdrh /*
35664099f6e1Sdanielk1977 ** This function is used to mark a data-page as writable. It uses
35674099f6e1Sdanielk1977 ** pager_write() to open a journal file (if it is not already open)
35684099f6e1Sdanielk1977 ** and write the page *pData to the journal.
35694099f6e1Sdanielk1977 **
35704099f6e1Sdanielk1977 ** The difference between this function and pager_write() is that this
35714099f6e1Sdanielk1977 ** function also deals with the special case where 2 or more pages
35724099f6e1Sdanielk1977 ** fit on a single disk sector. In this case all co-resident pages
35734099f6e1Sdanielk1977 ** must have been written to the journal file before returning.
35744099f6e1Sdanielk1977 */
35753b8a05f6Sdanielk1977 int sqlite3PagerWrite(DbPage *pDbPage){
35764099f6e1Sdanielk1977   int rc = SQLITE_OK;
35774099f6e1Sdanielk1977 
35783b8a05f6Sdanielk1977   PgHdr *pPg = pDbPage;
35794099f6e1Sdanielk1977   Pager *pPager = pPg->pPager;
35804099f6e1Sdanielk1977   Pgno nPagePerSector = (pPager->sectorSize/pPager->pageSize);
35814099f6e1Sdanielk1977 
3582b3175389Sdanielk1977   if( nPagePerSector>1 ){
35834099f6e1Sdanielk1977     Pgno nPageCount;          /* Total number of pages in database file */
35844099f6e1Sdanielk1977     Pgno pg1;                 /* First page of the sector pPg is located on. */
35854099f6e1Sdanielk1977     int nPage;                /* Number of pages starting at pg1 to journal */
35864099f6e1Sdanielk1977     int ii;
3587dd97a49cSdanielk1977     int needSync = 0;
35884099f6e1Sdanielk1977 
35894099f6e1Sdanielk1977     /* Set the doNotSync flag to 1. This is because we cannot allow a journal
35904099f6e1Sdanielk1977     ** header to be written between the pages journaled by this function.
35914099f6e1Sdanielk1977     */
3592b3175389Sdanielk1977     assert( !MEMDB );
35934099f6e1Sdanielk1977     assert( pPager->doNotSync==0 );
35944099f6e1Sdanielk1977     pPager->doNotSync = 1;
35954099f6e1Sdanielk1977 
35964099f6e1Sdanielk1977     /* This trick assumes that both the page-size and sector-size are
35974099f6e1Sdanielk1977     ** an integer power of 2. It sets variable pg1 to the identifier
35984099f6e1Sdanielk1977     ** of the first page of the sector pPg is located on.
35994099f6e1Sdanielk1977     */
36004099f6e1Sdanielk1977     pg1 = ((pPg->pgno-1) & ~(nPagePerSector-1)) + 1;
36014099f6e1Sdanielk1977 
3602ad0132dfSdanielk1977     sqlite3PagerPagecount(pPager, (int *)&nPageCount);
36034099f6e1Sdanielk1977     if( pPg->pgno>nPageCount ){
36044099f6e1Sdanielk1977       nPage = (pPg->pgno - pg1)+1;
36054099f6e1Sdanielk1977     }else if( (pg1+nPagePerSector-1)>nPageCount ){
36064099f6e1Sdanielk1977       nPage = nPageCount+1-pg1;
36074099f6e1Sdanielk1977     }else{
36084099f6e1Sdanielk1977       nPage = nPagePerSector;
36094099f6e1Sdanielk1977     }
36104099f6e1Sdanielk1977     assert(nPage>0);
36114099f6e1Sdanielk1977     assert(pg1<=pPg->pgno);
36124099f6e1Sdanielk1977     assert((pg1+nPage)>pPg->pgno);
36134099f6e1Sdanielk1977 
36144099f6e1Sdanielk1977     for(ii=0; ii<nPage && rc==SQLITE_OK; ii++){
36154099f6e1Sdanielk1977       Pgno pg = pg1+ii;
3616dd97a49cSdanielk1977       PgHdr *pPage;
3617f5e7bb51Sdrh       if( pg==pPg->pgno || !sqlite3BitvecTest(pPager->pInJournal, pg) ){
36184099f6e1Sdanielk1977         if( pg!=PAGER_MJ_PGNO(pPager) ){
36193b8a05f6Sdanielk1977           rc = sqlite3PagerGet(pPager, pg, &pPage);
36204099f6e1Sdanielk1977           if( rc==SQLITE_OK ){
36214099f6e1Sdanielk1977             rc = pager_write(pPage);
36228c0a791aSdanielk1977             if( pPage->flags&PGHDR_NEED_SYNC ){
3623dd97a49cSdanielk1977               needSync = 1;
3624a4124bdfSdanielk1977               assert(pPager->needSync);
3625dd97a49cSdanielk1977             }
36263b8a05f6Sdanielk1977             sqlite3PagerUnref(pPage);
36274099f6e1Sdanielk1977           }
36284099f6e1Sdanielk1977         }
3629c81945e4Sdrh       }else if( (pPage = pager_lookup(pPager, pg))!=0 ){
36308c0a791aSdanielk1977         if( pPage->flags&PGHDR_NEED_SYNC ){
3631dd97a49cSdanielk1977           needSync = 1;
36324099f6e1Sdanielk1977         }
36338c0a791aSdanielk1977         sqlite3PagerUnref(pPage);
36344099f6e1Sdanielk1977       }
3635dd97a49cSdanielk1977     }
3636dd97a49cSdanielk1977 
3637ee03d629Sdrh     /* If the PGHDR_NEED_SYNC flag is set for any of the nPage pages
3638dd97a49cSdanielk1977     ** starting at pg1, then it needs to be set for all of them. Because
3639dd97a49cSdanielk1977     ** writing to any of these nPage pages may damage the others, the
3640dd97a49cSdanielk1977     ** journal file must contain sync()ed copies of all of them
3641dd97a49cSdanielk1977     ** before any of them can be written out to the database file.
3642dd97a49cSdanielk1977     */
3643dd97a49cSdanielk1977     if( needSync ){
3644b3df2e1cSdrh       assert( !MEMDB && pPager->noSync==0 );
3645dd97a49cSdanielk1977       for(ii=0; ii<nPage && needSync; ii++){
3646dd97a49cSdanielk1977         PgHdr *pPage = pager_lookup(pPager, pg1+ii);
3647ee03d629Sdrh         if( pPage ){
3648ee03d629Sdrh           pPage->flags |= PGHDR_NEED_SYNC;
36498c0a791aSdanielk1977           sqlite3PagerUnref(pPage);
3650dd97a49cSdanielk1977         }
3651ee03d629Sdrh       }
3652dd97a49cSdanielk1977       assert(pPager->needSync);
3653dd97a49cSdanielk1977     }
36544099f6e1Sdanielk1977 
36554099f6e1Sdanielk1977     assert( pPager->doNotSync==1 );
36564099f6e1Sdanielk1977     pPager->doNotSync = 0;
36574099f6e1Sdanielk1977   }else{
36583b8a05f6Sdanielk1977     rc = pager_write(pDbPage);
36594099f6e1Sdanielk1977   }
36604099f6e1Sdanielk1977   return rc;
36614099f6e1Sdanielk1977 }
36624099f6e1Sdanielk1977 
36634099f6e1Sdanielk1977 /*
3664aacc543eSdrh ** Return TRUE if the page given in the argument was previously passed
36653b8a05f6Sdanielk1977 ** to sqlite3PagerWrite().  In other words, return TRUE if it is ok
36666019e168Sdrh ** to change the content of the page.
36676019e168Sdrh */
36687d3a666fSdanielk1977 #ifndef NDEBUG
36693b8a05f6Sdanielk1977 int sqlite3PagerIswriteable(DbPage *pPg){
36708c0a791aSdanielk1977   return pPg->flags&PGHDR_DIRTY;
36716019e168Sdrh }
36727d3a666fSdanielk1977 #endif
36736019e168Sdrh 
3674001bbcbbSdrh /*
367530e58750Sdrh ** A call to this routine tells the pager that it is not necessary to
3676538f570cSdrh ** write the information on page pPg back to the disk, even though
3677dfe88eceSdrh ** that page might be marked as dirty.  This happens, for example, when
3678dfe88eceSdrh ** the page has been added as a leaf of the freelist and so its
3679dfe88eceSdrh ** content no longer matters.
368030e58750Sdrh **
368130e58750Sdrh ** The overlying software layer calls this routine when all of the data
368230e58750Sdrh ** on the given page is unused.  The pager marks the page as clean so
368330e58750Sdrh ** that it does not get written to disk.
368430e58750Sdrh **
368530e58750Sdrh ** Tests show that this optimization, together with the
36863b8a05f6Sdanielk1977 ** sqlite3PagerDontRollback() below, more than double the speed
368730e58750Sdrh ** of large INSERT operations and quadruple the speed of large DELETEs.
36888e298f92Sdrh **
368902983931Sdanielk1977 ** When this routine is called, set the bit corresponding to pDbPage in
369002983931Sdanielk1977 ** the Pager.pAlwaysRollback bitvec.  Subsequent calls to
369102983931Sdanielk1977 ** sqlite3PagerDontRollback() for the same page will thereafter be ignored.
369202983931Sdanielk1977 ** This is necessary to avoid a problem where a page with data is added to
369302983931Sdanielk1977 ** the freelist during one part of a transaction then removed from the
369402983931Sdanielk1977 ** freelist during a later part of the same transaction and reused for some
369502983931Sdanielk1977 ** other purpose.  When it is first added to the freelist, this routine is
369602983931Sdanielk1977 ** called.  When reused, the sqlite3PagerDontRollback() routine is called.
369702983931Sdanielk1977 ** But because the page contains critical data, we still need to be sure it
369802983931Sdanielk1977 ** gets rolled back in spite of the sqlite3PagerDontRollback() call.
369930e58750Sdrh */
3700a1fa00d9Sdanielk1977 int sqlite3PagerDontWrite(DbPage *pDbPage){
3701538f570cSdrh   PgHdr *pPg = pDbPage;
3702538f570cSdrh   Pager *pPager = pPg->pPager;
3703a1fa00d9Sdanielk1977   int rc;
37048e298f92Sdrh 
37053460d19cSdanielk1977   if( pPg->pgno>pPager->dbOrigSize ){
3706a1fa00d9Sdanielk1977     return SQLITE_OK;
3707a1fa00d9Sdanielk1977   }
3708a1fa00d9Sdanielk1977   if( pPager->pAlwaysRollback==0 ){
3709a1fa00d9Sdanielk1977     assert( pPager->pInJournal );
37103460d19cSdanielk1977     pPager->pAlwaysRollback = sqlite3BitvecCreate(pPager->dbOrigSize);
3711a1fa00d9Sdanielk1977     if( !pPager->pAlwaysRollback ){
3712a1fa00d9Sdanielk1977       return SQLITE_NOMEM;
3713a1fa00d9Sdanielk1977     }
3714a1fa00d9Sdanielk1977   }
3715a1fa00d9Sdanielk1977   rc = sqlite3BitvecSet(pPager->pAlwaysRollback, pPg->pgno);
3716a1fa00d9Sdanielk1977 
3717fd7f0452Sdanielk1977   if( rc==SQLITE_OK && (pPg->flags&PGHDR_DIRTY) && pPager->nSavepoint==0 ){
37181aa2d8b5Sdrh     assert( pPager->state>=PAGER_SHARED );
37193460d19cSdanielk1977     if( pPager->dbSize==pPg->pgno && pPager->dbOrigSize<pPager->dbSize ){
37208124a30fSdrh       /* If this pages is the last page in the file and the file has grown
37218124a30fSdrh       ** during the current transaction, then do NOT mark the page as clean.
37228124a30fSdrh       ** When the database file grows, we must make sure that the last page
37238124a30fSdrh       ** gets written at least once so that the disk file will be the correct
37248124a30fSdrh       ** size. If you do not write this page and the size of the file
37258124a30fSdrh       ** on the disk ends up being too small, that can lead to database
37268124a30fSdrh       ** corruption during the next transaction.
37278124a30fSdrh       */
37288124a30fSdrh     }else{
372930d53701Sdrh       PAGERTRACE(("DONT_WRITE page %d of %d\n", pPg->pgno, PAGERID(pPager)));
3730538f570cSdrh       IOTRACE(("CLEAN %p %d\n", pPager, pPg->pgno))
373133e3216aSdanielk1977       pPg->flags |= PGHDR_DONT_WRITE;
37323c407374Sdanielk1977 #ifdef SQLITE_CHECK_PAGES
37333c407374Sdanielk1977       pPg->pageHash = pager_pagehash(pPg);
37343c407374Sdanielk1977 #endif
373530e58750Sdrh     }
373630e58750Sdrh   }
3737a1fa00d9Sdanielk1977   return rc;
37388124a30fSdrh }
373930e58750Sdrh 
374030e58750Sdrh /*
374130e58750Sdrh ** A call to this routine tells the pager that if a rollback occurs,
374230e58750Sdrh ** it is not necessary to restore the data on the given page.  This
374330e58750Sdrh ** means that the pager does not have to record the given page in the
374430e58750Sdrh ** rollback journal.
3745538f570cSdrh **
3746538f570cSdrh ** If we have not yet actually read the content of this page (if
3747538f570cSdrh ** the PgHdr.needRead flag is set) then this routine acts as a promise
3748538f570cSdrh ** that we will never need to read the page content in the future.
3749538f570cSdrh ** so the needRead flag can be cleared at this point.
375030e58750Sdrh */
37513b8a05f6Sdanielk1977 void sqlite3PagerDontRollback(DbPage *pPg){
375230e58750Sdrh   Pager *pPager = pPg->pPager;
37537539b6b8Sdrh   TESTONLY( int rc; )  /* Return value from sqlite3BitvecSet() */
375430e58750Sdrh 
3755d3627afcSdrh   assert( pPager->state>=PAGER_RESERVED );
3756a55e9355Sdanielk1977 
3757a55e9355Sdanielk1977   /* If the journal file is not open, or DontWrite() has been called on
375802983931Sdanielk1977   ** this page (DontWrite() sets the Pager.pAlwaysRollback bit), then this
3759a55e9355Sdanielk1977   ** function is a no-op.
3760a55e9355Sdanielk1977   */
3761a1fa00d9Sdanielk1977   if( pPager->journalOpen==0
3762a1fa00d9Sdanielk1977    || sqlite3BitvecTest(pPager->pAlwaysRollback, pPg->pgno)
37633460d19cSdanielk1977    || pPg->pgno>pPager->dbOrigSize
37648c0a791aSdanielk1977   ){
376587c29a94Sdanielk1977     return;
376687c29a94Sdanielk1977   }
3767a55e9355Sdanielk1977 
3768c5d0bd90Sdrh #ifdef SQLITE_SECURE_DELETE
37691feb7dd3Sdrh   if( sqlite3BitvecTest(pPager->pInJournal, pPg->pgno)!=0
37703460d19cSdanielk1977    || pPg->pgno>pPager->dbOrigSize ){
3771c5d0bd90Sdrh     return;
3772c5d0bd90Sdrh   }
3773c5d0bd90Sdrh #endif
3774c5d0bd90Sdrh 
3775c5d0bd90Sdrh   /* If SECURE_DELETE is disabled, then there is no way that this
3776c5d0bd90Sdrh   ** routine can be called on a page for which sqlite3PagerDontWrite()
3777c5d0bd90Sdrh   ** has not been previously called during the same transaction.
3778c5d0bd90Sdrh   ** And if DontWrite() has previously been called, the following
3779c5d0bd90Sdrh   ** conditions must be met.
37801013148bSdrh   **
37811013148bSdrh   ** (Later:)  Not true.  If the database is corrupted by having duplicate
37821013148bSdrh   ** pages on the freelist (ex: corrupt9.test) then the following is not
37831013148bSdrh   ** necessarily true:
3784a55e9355Sdanielk1977   */
37853460d19cSdanielk1977   /* assert( !pPg->inJournal && (int)pPg->pgno <= pPager->dbOrigSize ); */
3786a55e9355Sdanielk1977 
3787f5e7bb51Sdrh   assert( pPager->pInJournal!=0 );
37888c0a791aSdanielk1977   pPg->flags &= ~PGHDR_NEED_READ;
37897539b6b8Sdrh 
37907539b6b8Sdrh   /* Failure to set the bits in the InJournal bit-vectors is benign.
37917539b6b8Sdrh   ** It merely means that we might do some extra work to journal a page
3792a8a71bacSdrh   ** that does not need to be journaled.  Nevertheless, be sure to test the
37937539b6b8Sdrh   ** case where a malloc error occurs while trying to set a bit in a
37947539b6b8Sdrh   ** bit vector.
37957539b6b8Sdrh   */
37967539b6b8Sdrh   sqlite3BeginBenignMalloc();
37977539b6b8Sdrh   TESTONLY( rc = ) sqlite3BitvecSet(pPager->pInJournal, pPg->pgno);
37987539b6b8Sdrh   testcase( rc==SQLITE_NOMEM );
37997539b6b8Sdrh   TESTONLY( rc = ) addToSavepointBitvecs(pPager, pPg->pgno);
38007539b6b8Sdrh   testcase( rc==SQLITE_NOMEM );
38017539b6b8Sdrh   sqlite3EndBenignMalloc();
38027539b6b8Sdrh 
38037539b6b8Sdrh 
380430d53701Sdrh   PAGERTRACE(("DONT_ROLLBACK page %d of %d\n", pPg->pgno, PAGERID(pPager)));
3805b0603416Sdrh   IOTRACE(("GARBAGE %p %d\n", pPager, pPg->pgno))
380630e58750Sdrh }
380730e58750Sdrh 
3808ac69b05eSdrh 
380930e58750Sdrh /*
381080e35f46Sdrh ** This routine is called to increment the database file change-counter,
381180e35f46Sdrh ** stored at byte 24 of the pager file.
381280e35f46Sdrh */
3813c7b6017cSdanielk1977 static int pager_incr_changecounter(Pager *pPager, int isDirect){
381480e35f46Sdrh   PgHdr *pPgHdr;
381580e35f46Sdrh   u32 change_counter;
3816c7b6017cSdanielk1977   int rc = SQLITE_OK;
381780e35f46Sdrh 
3818701bb3b4Sdrh #ifndef SQLITE_ENABLE_ATOMIC_WRITE
3819701bb3b4Sdrh   assert( isDirect==0 );  /* isDirect is only true for atomic writes */
3820701bb3b4Sdrh #endif
382112dd5496Sdanielk1977   if( !pPager->changeCountDone && pPager->dbSize>0 ){
382280e35f46Sdrh     /* Open page 1 of the file for writing. */
382380e35f46Sdrh     rc = sqlite3PagerGet(pPager, 1, &pPgHdr);
382480e35f46Sdrh     if( rc!=SQLITE_OK ) return rc;
3825c7b6017cSdanielk1977 
3826c7b6017cSdanielk1977     if( !isDirect ){
382780e35f46Sdrh       rc = sqlite3PagerWrite(pPgHdr);
3828ae72d982Sdanielk1977       if( rc!=SQLITE_OK ){
3829ae72d982Sdanielk1977         sqlite3PagerUnref(pPgHdr);
3830ae72d982Sdanielk1977         return rc;
3831ae72d982Sdanielk1977       }
3832c7b6017cSdanielk1977     }
383380e35f46Sdrh 
383480e35f46Sdrh     /* Increment the value just read and write it back to byte 24. */
3835b1003913Sdrh     change_counter = sqlite3Get4byte((u8*)pPager->dbFileVers);
383680e35f46Sdrh     change_counter++;
38378c0a791aSdanielk1977     put32bits(((char*)pPgHdr->pData)+24, change_counter);
3838c7b6017cSdanielk1977 
3839701bb3b4Sdrh #ifdef SQLITE_ENABLE_ATOMIC_WRITE
3840c7b6017cSdanielk1977     if( isDirect && pPager->fd->pMethods ){
38418c0a791aSdanielk1977       const void *zBuf = pPgHdr->pData;
38423460d19cSdanielk1977       assert( pPager->dbFileSize>0 );
3843c7b6017cSdanielk1977       rc = sqlite3OsWrite(pPager->fd, zBuf, pPager->pageSize, 0);
3844c7b6017cSdanielk1977     }
3845701bb3b4Sdrh #endif
3846c7b6017cSdanielk1977 
384780e35f46Sdrh     /* Release the page reference. */
384880e35f46Sdrh     sqlite3PagerUnref(pPgHdr);
384980e35f46Sdrh     pPager->changeCountDone = 1;
385080e35f46Sdrh   }
3851c7b6017cSdanielk1977   return rc;
385280e35f46Sdrh }
385380e35f46Sdrh 
385480e35f46Sdrh /*
3855f653d782Sdanielk1977 ** Sync the pager file to disk.
3856f653d782Sdanielk1977 */
3857f653d782Sdanielk1977 int sqlite3PagerSync(Pager *pPager){
3858f653d782Sdanielk1977   int rc;
38597426f864Sdrh   if( MEMDB ){
38607426f864Sdrh     rc = SQLITE_OK;
38617426f864Sdrh   }else{
3862f653d782Sdanielk1977     rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
38637426f864Sdrh   }
3864f653d782Sdanielk1977   return rc;
3865f653d782Sdanielk1977 }
3866f653d782Sdanielk1977 
3867f653d782Sdanielk1977 /*
386880e35f46Sdrh ** Sync the database file for the pager pPager. zMaster points to the name
386980e35f46Sdrh ** of a master journal file that should be written into the individual
387080e35f46Sdrh ** journal file. zMaster may be NULL, which is interpreted as no master
387180e35f46Sdrh ** journal (a single database transaction).
387280e35f46Sdrh **
387380e35f46Sdrh ** This routine ensures that the journal is synced, all dirty pages written
387480e35f46Sdrh ** to the database file and the database file synced. The only thing that
387580e35f46Sdrh ** remains to commit the transaction is to delete the journal file (or
387680e35f46Sdrh ** master journal file if specified).
387780e35f46Sdrh **
387880e35f46Sdrh ** Note that if zMaster==NULL, this does not overwrite a previous value
387980e35f46Sdrh ** passed to an sqlite3PagerCommitPhaseOne() call.
388080e35f46Sdrh **
3881f653d782Sdanielk1977 ** If the final parameter - noSync - is true, then the database file itself
3882f653d782Sdanielk1977 ** is not synced. The caller must call sqlite3PagerSync() directly to
3883f653d782Sdanielk1977 ** sync the database file before calling CommitPhaseTwo() to delete the
3884f653d782Sdanielk1977 ** journal file in this case.
388580e35f46Sdrh */
3886f653d782Sdanielk1977 int sqlite3PagerCommitPhaseOne(
3887f653d782Sdanielk1977   Pager *pPager,
3888f653d782Sdanielk1977   const char *zMaster,
3889f653d782Sdanielk1977   int noSync
3890f653d782Sdanielk1977 ){
389180e35f46Sdrh   int rc = SQLITE_OK;
389280e35f46Sdrh 
3893dad31b5eSdanielk1977   if( pPager->errCode ){
3894dad31b5eSdanielk1977     return pPager->errCode;
3895dad31b5eSdanielk1977   }
3896dad31b5eSdanielk1977 
3897d138c016Sdrh   /* If no changes have been made, we can leave the transaction early.
3898d138c016Sdrh   */
3899d138c016Sdrh   if( pPager->dbModified==0 &&
3900d138c016Sdrh         (pPager->journalMode!=PAGER_JOURNALMODE_DELETE ||
3901d138c016Sdrh           pPager->exclusiveMode!=0) ){
3902d138c016Sdrh     assert( pPager->dirtyCache==0 || pPager->journalOpen==0 );
3903d138c016Sdrh     return SQLITE_OK;
3904d138c016Sdrh   }
3905d138c016Sdrh 
390630d53701Sdrh   PAGERTRACE(("DATABASE SYNC: File=%s zMaster=%s nSize=%d\n",
390730d53701Sdrh       pPager->zFilename, zMaster, pPager->dbSize));
390880e35f46Sdrh 
390980e35f46Sdrh   /* If this is an in-memory db, or no pages have been written to, or this
391080e35f46Sdrh   ** function has already been called, it is a no-op.
391180e35f46Sdrh   */
391280e35f46Sdrh   if( pPager->state!=PAGER_SYNCED && !MEMDB && pPager->dirtyCache ){
391380e35f46Sdrh     PgHdr *pPg;
391480e35f46Sdrh 
3915c7b6017cSdanielk1977 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
3916c7b6017cSdanielk1977     /* The atomic-write optimization can be used if all of the
3917c7b6017cSdanielk1977     ** following are true:
3918c7b6017cSdanielk1977     **
3919c7b6017cSdanielk1977     **    + The file-system supports the atomic-write property for
3920c7b6017cSdanielk1977     **      blocks of size page-size, and
3921c7b6017cSdanielk1977     **    + This commit is not part of a multi-file transaction, and
    **    + Exactly one page has been modified and stored in the journal file.
3923c7b6017cSdanielk1977     **
3924c7b6017cSdanielk1977     ** If the optimization can be used, then the journal file will never
3925c7b6017cSdanielk1977     ** be created for this transaction.
3926c7b6017cSdanielk1977     */
39274d60af9bSdanielk1977     int useAtomicWrite;
39288c0a791aSdanielk1977     pPg = sqlite3PcacheDirtyList(pPager->pPCache);
39294d60af9bSdanielk1977     useAtomicWrite = (
3930f55b8998Sdanielk1977         !zMaster &&
3931700b9c5aSdanielk1977         pPager->journalOpen &&
3932f55b8998Sdanielk1977         pPager->journalOff==jrnlBufferSize(pPager) &&
39333460d19cSdanielk1977         pPager->dbSize>=pPager->dbFileSize &&
39348c0a791aSdanielk1977         (pPg==0 || pPg->pDirty==0)
3935f55b8998Sdanielk1977     );
3936700b9c5aSdanielk1977     assert( pPager->journalOpen || pPager->journalMode==PAGER_JOURNALMODE_OFF );
3937f55b8998Sdanielk1977     if( useAtomicWrite ){
3938c7b6017cSdanielk1977       /* Update the nRec field in the journal file. */
3939c7b6017cSdanielk1977       int offset = pPager->journalHdr + sizeof(aJournalMagic);
3940c7b6017cSdanielk1977       assert(pPager->nRec==1);
3941c7b6017cSdanielk1977       rc = write32bits(pPager->jfd, offset, pPager->nRec);
3942c7b6017cSdanielk1977 
3943c7b6017cSdanielk1977       /* Update the db file change counter. The following call will modify
3944c7b6017cSdanielk1977       ** the in-memory representation of page 1 to include the updated
3945c7b6017cSdanielk1977       ** change counter and then write page 1 directly to the database
3946c7b6017cSdanielk1977       ** file. Because of the atomic-write property of the host file-system,
3947c7b6017cSdanielk1977       ** this is safe.
3948c7b6017cSdanielk1977       */
3949ae72d982Sdanielk1977       if( rc==SQLITE_OK ){
3950c7b6017cSdanielk1977         rc = pager_incr_changecounter(pPager, 1);
3951ae72d982Sdanielk1977       }
3952f55b8998Sdanielk1977     }else{
3953f55b8998Sdanielk1977       rc = sqlite3JournalCreate(pPager->jfd);
3954f55b8998Sdanielk1977     }
3955f55b8998Sdanielk1977 
3956ae72d982Sdanielk1977     if( !useAtomicWrite && rc==SQLITE_OK )
3957c7b6017cSdanielk1977 #endif
3958c7b6017cSdanielk1977 
395980e35f46Sdrh     /* If a master journal file name has already been written to the
396080e35f46Sdrh     ** journal file, then no sync is required. This happens when it is
396180e35f46Sdrh     ** written, then the process fails to upgrade from a RESERVED to an
396280e35f46Sdrh     ** EXCLUSIVE lock. The next time the process tries to commit the
396380e35f46Sdrh     ** transaction the m-j name will have already been written.
396480e35f46Sdrh     */
396580e35f46Sdrh     if( !pPager->setMaster ){
3966c7b6017cSdanielk1977       rc = pager_incr_changecounter(pPager, 0);
396780e35f46Sdrh       if( rc!=SQLITE_OK ) goto sync_exit;
396871aa7fffSdanielk1977       if( pPager->journalMode!=PAGER_JOURNALMODE_OFF ){
396980e35f46Sdrh #ifndef SQLITE_OMIT_AUTOVACUUM
39703460d19cSdanielk1977         if( pPager->dbSize<pPager->dbOrigSize ){
397180e35f46Sdrh           /* If this transaction has made the database smaller, then all pages
397280e35f46Sdrh           ** being discarded by the truncation must be written to the journal
397380e35f46Sdrh           ** file.
397480e35f46Sdrh           */
397580e35f46Sdrh           Pgno i;
3976d92db531Sdanielk1977           Pgno iSkip = PAGER_MJ_PGNO(pPager);
39773460d19cSdanielk1977           Pgno dbSize = pPager->dbSize;
3978f9bce3c5Sdanielk1977           pPager->dbSize = pPager->dbOrigSize;
3979f70c1feeSdanielk1977           for( i=dbSize+1; i<=pPager->dbOrigSize; i++ ){
3980f5e7bb51Sdrh             if( !sqlite3BitvecTest(pPager->pInJournal, i) && i!=iSkip ){
398180e35f46Sdrh               rc = sqlite3PagerGet(pPager, i, &pPg);
398280e35f46Sdrh               if( rc!=SQLITE_OK ) goto sync_exit;
398380e35f46Sdrh               rc = sqlite3PagerWrite(pPg);
398480e35f46Sdrh               sqlite3PagerUnref(pPg);
398580e35f46Sdrh               if( rc!=SQLITE_OK ) goto sync_exit;
398680e35f46Sdrh             }
398780e35f46Sdrh           }
39883460d19cSdanielk1977           pPager->dbSize = dbSize;
398980e35f46Sdrh         }
399080e35f46Sdrh #endif
399180e35f46Sdrh         rc = writeMasterJournal(pPager, zMaster);
399280e35f46Sdrh         if( rc!=SQLITE_OK ) goto sync_exit;
399380e35f46Sdrh         rc = syncJournal(pPager);
399480e35f46Sdrh       }
399571aa7fffSdanielk1977     }
3996c7b6017cSdanielk1977     if( rc!=SQLITE_OK ) goto sync_exit;
399780e35f46Sdrh 
399880e35f46Sdrh     /* Write all dirty pages to the database file */
39998c0a791aSdanielk1977     pPg = sqlite3PcacheDirtyList(pPager->pPCache);
400080e35f46Sdrh     rc = pager_write_pagelist(pPg);
4001153c62c4Sdrh     if( rc!=SQLITE_OK ){
400204c3a46eSdrh       assert( rc!=SQLITE_IOERR_BLOCKED );
400304c3a46eSdrh       /* The error might have left the dirty list all fouled up here,
400404c3a46eSdrh       ** but that does not matter because if the dirty list did
400504c3a46eSdrh       ** get corrupted, then the transaction will roll back and
400604c3a46eSdrh       ** discard the dirty list.  There is an assert in
400704c3a46eSdrh       ** pager_get_all_dirty_pages() that verifies that no attempt
400804c3a46eSdrh       ** is made to use an invalid dirty list.
400904c3a46eSdrh       */
4010153c62c4Sdrh       goto sync_exit;
4011153c62c4Sdrh     }
40128c0a791aSdanielk1977     sqlite3PcacheCleanAll(pPager->pPCache);
401380e35f46Sdrh 
4014f90b7260Sdanielk1977     if( pPager->dbSize<pPager->dbFileSize ){
4015f90b7260Sdanielk1977       assert( pPager->state>=PAGER_EXCLUSIVE );
4016f90b7260Sdanielk1977       rc = pager_truncate(pPager, pPager->dbSize);
4017f90b7260Sdanielk1977       if( rc!=SQLITE_OK ) goto sync_exit;
4018f90b7260Sdanielk1977     }
4019f90b7260Sdanielk1977 
402080e35f46Sdrh     /* Sync the database file. */
4021f653d782Sdanielk1977     if( !pPager->noSync && !noSync ){
4022f036aef0Sdanielk1977       rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
402380e35f46Sdrh     }
402480e35f46Sdrh     IOTRACE(("DBSYNC %p\n", pPager))
402580e35f46Sdrh 
402680e35f46Sdrh     pPager->state = PAGER_SYNCED;
402780e35f46Sdrh   }
402880e35f46Sdrh 
402980e35f46Sdrh sync_exit:
4030e965ac77Sdanielk1977   if( rc==SQLITE_IOERR_BLOCKED ){
4031e965ac77Sdanielk1977     /* pager_incr_changecounter() may attempt to obtain an exclusive
4032e965ac77Sdanielk1977      * lock to spill the cache and return IOERR_BLOCKED. But since
403385b623f2Sdrh      * there is no chance the cache is inconsistent, it is
4034e965ac77Sdanielk1977      * better to return SQLITE_BUSY.
4035e965ac77Sdanielk1977      */
4036e965ac77Sdanielk1977     rc = SQLITE_BUSY;
4037e965ac77Sdanielk1977   }
403880e35f46Sdrh   return rc;
403980e35f46Sdrh }
404080e35f46Sdrh 
404180e35f46Sdrh 
404280e35f46Sdrh /*
4043ed7c855cSdrh ** Commit all changes to the database and release the write lock.
4044d9b0257aSdrh **
4045d9b0257aSdrh ** If the commit fails for any reason, a rollback attempt is made
4046d9b0257aSdrh ** and an error code is returned.  If the commit worked, SQLITE_OK
4047d9b0257aSdrh ** is returned.
4048ed7c855cSdrh */
404980e35f46Sdrh int sqlite3PagerCommitPhaseTwo(Pager *pPager){
40508c0a791aSdanielk1977   int rc = SQLITE_OK;
4051d9b0257aSdrh 
4052efaaf579Sdanielk1977   if( pPager->errCode ){
40537f7bc66eSdanielk1977     return pPager->errCode;
4054d9b0257aSdrh   }
4055a6abd041Sdrh   if( pPager->state<PAGER_RESERVED ){
4056d9b0257aSdrh     return SQLITE_ERROR;
4057d9b0257aSdrh   }
4058d138c016Sdrh   if( pPager->dbModified==0 &&
4059d138c016Sdrh         (pPager->journalMode!=PAGER_JOURNALMODE_DELETE ||
4060d138c016Sdrh           pPager->exclusiveMode!=0) ){
4061d138c016Sdrh     assert( pPager->dirtyCache==0 || pPager->journalOpen==0 );
4062d138c016Sdrh     return SQLITE_OK;
4063d138c016Sdrh   }
406430d53701Sdrh   PAGERTRACE(("COMMIT %d\n", PAGERID(pPager)));
4065b3175389Sdanielk1977   assert( pPager->state==PAGER_SYNCED || MEMDB || !pPager->dirtyCache );
4066df2566a3Sdanielk1977   rc = pager_end_transaction(pPager, pPager->setMaster);
406786f8c197Sdrh   rc = pager_error(pPager, rc);
406886f8c197Sdrh   return rc;
4069ed7c855cSdrh }
4070ed7c855cSdrh 
4071ed7c855cSdrh /*
4072a6abd041Sdrh ** Rollback all changes.  The database falls back to PAGER_SHARED mode.
4073ed7c855cSdrh ** All in-memory cache pages revert to their original data contents.
4074ed7c855cSdrh ** The journal is deleted.
4075d9b0257aSdrh **
4076d9b0257aSdrh ** This routine cannot fail unless some other process is not following
40774f0ee686Sdrh ** the correct locking protocol or unless some other
4078d9b0257aSdrh ** process is writing trash into the journal file (SQLITE_CORRUPT) or
4079d9b0257aSdrh ** unless a prior malloc() failed (SQLITE_NOMEM).  Appropriate error
4080d9b0257aSdrh ** codes are returned for all these occasions.  Otherwise,
4081d9b0257aSdrh ** SQLITE_OK is returned.
4082ed7c855cSdrh */
40833b8a05f6Sdanielk1977 int sqlite3PagerRollback(Pager *pPager){
40848c0a791aSdanielk1977   int rc = SQLITE_OK;
408530d53701Sdrh   PAGERTRACE(("ROLLBACK %d\n", PAGERID(pPager)));
4086b3175389Sdanielk1977   if( !pPager->dirtyCache || !pPager->journalOpen ){
4087df2566a3Sdanielk1977     rc = pager_end_transaction(pPager, pPager->setMaster);
40888c0a791aSdanielk1977   }else if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
4089a6abd041Sdrh     if( pPager->state>=PAGER_EXCLUSIVE ){
4090e277be05Sdanielk1977       pager_playback(pPager, 0);
40914b845d7eSdrh     }
40928c0a791aSdanielk1977     rc = pPager->errCode;
40938c0a791aSdanielk1977   }else{
4094a6abd041Sdrh     if( pPager->state==PAGER_RESERVED ){
409517221813Sdanielk1977       int rc2;
4096e277be05Sdanielk1977       rc = pager_playback(pPager, 0);
4097df2566a3Sdanielk1977       rc2 = pager_end_transaction(pPager, pPager->setMaster);
4098a6abd041Sdrh       if( rc==SQLITE_OK ){
4099a6abd041Sdrh         rc = rc2;
4100d9b0257aSdrh       }
4101a6abd041Sdrh     }else{
4102e277be05Sdanielk1977       rc = pager_playback(pPager, 0);
4103a6abd041Sdrh     }
41048c0a791aSdanielk1977 
4105b3175389Sdanielk1977     if( !MEMDB ){
4106d92db531Sdanielk1977       pPager->dbSizeValid = 0;
4107b3175389Sdanielk1977     }
410807cb560bSdanielk1977 
410907cb560bSdanielk1977     /* If an error occurs during a ROLLBACK, we can no longer trust the pager
411007cb560bSdanielk1977     ** cache. So call pager_error() on the way out to make any error
411107cb560bSdanielk1977     ** persistent.
411207cb560bSdanielk1977     */
411386f8c197Sdrh     rc = pager_error(pPager, rc);
41148c0a791aSdanielk1977   }
411586f8c197Sdrh   return rc;
411698808babSdrh }
4117d9b0257aSdrh 
4118d9b0257aSdrh /*
41195e00f6c7Sdrh ** Return TRUE if the database file is opened read-only.  Return FALSE
41205e00f6c7Sdrh ** if the database is (in theory) writable.
41215e00f6c7Sdrh */
4122f49661a4Sdrh u8 sqlite3PagerIsreadonly(Pager *pPager){
4123be0072d2Sdrh   return pPager->readOnly;
41245e00f6c7Sdrh }
41255e00f6c7Sdrh 
41265e00f6c7Sdrh /*
41270f7eb611Sdrh ** Return the number of references to the pager.
41280f7eb611Sdrh */
41293b8a05f6Sdanielk1977 int sqlite3PagerRefcount(Pager *pPager){
41308c0a791aSdanielk1977   return sqlite3PcacheRefCount(pPager->pPCache);
41310f7eb611Sdrh }
41320f7eb611Sdrh 
413371d5d2cdSdanielk1977 /*
413471d5d2cdSdanielk1977 ** Return the number of references to the specified page.
413571d5d2cdSdanielk1977 */
413671d5d2cdSdanielk1977 int sqlite3PagerPageRefcount(DbPage *pPage){
413771d5d2cdSdanielk1977   return sqlite3PcachePageRefcount(pPage);
413871d5d2cdSdanielk1977 }
413971d5d2cdSdanielk1977 
41400f7eb611Sdrh #ifdef SQLITE_TEST
41410f7eb611Sdrh /*
4142d9b0257aSdrh ** This routine is used for testing and analysis only.
4143d9b0257aSdrh */
41443b8a05f6Sdanielk1977 int *sqlite3PagerStats(Pager *pPager){
414542741be9Sdanielk1977   static int a[11];
41468c0a791aSdanielk1977   a[0] = sqlite3PcacheRefCount(pPager->pPCache);
41478c0a791aSdanielk1977   a[1] = sqlite3PcachePagecount(pPager->pPCache);
41488c0a791aSdanielk1977   a[2] = sqlite3PcacheGetCachesize(pPager->pPCache);
4149d92db531Sdanielk1977   a[3] = pPager->dbSizeValid ? (int) pPager->dbSize : -1;
4150d9b0257aSdrh   a[4] = pPager->state;
4151efaaf579Sdanielk1977   a[5] = pPager->errCode;
4152d9b0257aSdrh   a[6] = pPager->nHit;
4153d9b0257aSdrh   a[7] = pPager->nMiss;
41547c4ac0c5Sdrh   a[8] = 0;  /* Used to be pPager->nOvfl */
415542741be9Sdanielk1977   a[9] = pPager->nRead;
415642741be9Sdanielk1977   a[10] = pPager->nWrite;
4157d9b0257aSdrh   return a;
4158d9b0257aSdrh }
415917b90b53Sdanielk1977 int sqlite3PagerIsMemdb(Pager *pPager){
416017b90b53Sdanielk1977   return MEMDB;
416117b90b53Sdanielk1977 }
41620f7eb611Sdrh #endif
4163dd79342eSdrh 
4164fa86c412Sdrh /*
4165fd7f0452Sdanielk1977 ** Ensure that there are at least nSavepoint savepoints open.
4166fa86c412Sdrh */
4167fd7f0452Sdanielk1977 int sqlite3PagerOpenSavepoint(Pager *pPager, int nSavepoint){
4168fd7f0452Sdanielk1977   int rc = SQLITE_OK;
4169fd7f0452Sdanielk1977 
417012dd5496Sdanielk1977   if( nSavepoint>pPager->nSavepoint && pPager->useJournal ){
4171fd7f0452Sdanielk1977     int ii;
417249b9d338Sdrh     PagerSavepoint *aNew;
4173fd7f0452Sdanielk1977 
4174c0731c9dSdrh     /* Either there is no active journal or the sub-journal is open or
4175c0731c9dSdrh     ** the journal is always stored in memory */
4176c0731c9dSdrh     assert( pPager->nSavepoint==0 || pPager->sjfd->pMethods ||
4177c0731c9dSdrh             pPager->journalMode==PAGER_JOURNALMODE_MEMORY );
4178fd7f0452Sdanielk1977 
4179fd7f0452Sdanielk1977     /* Grow the Pager.aSavepoint array using realloc(). Return SQLITE_NOMEM
4180fd7f0452Sdanielk1977     ** if the allocation fails. Otherwise, zero the new portion in case a
4181fd7f0452Sdanielk1977     ** malloc failure occurs while populating it in the for(...) loop below.
4182fd7f0452Sdanielk1977     */
418349b9d338Sdrh     aNew = (PagerSavepoint *)sqlite3Realloc(
4184fd7f0452Sdanielk1977         pPager->aSavepoint, sizeof(PagerSavepoint)*nSavepoint
4185fd7f0452Sdanielk1977     );
4186fd7f0452Sdanielk1977     if( !aNew ){
4187fa86c412Sdrh       return SQLITE_NOMEM;
4188fa86c412Sdrh     }
4189fd7f0452Sdanielk1977     memset(&aNew[pPager->nSavepoint], 0,
4190fd7f0452Sdanielk1977         (nSavepoint - pPager->nSavepoint) * sizeof(PagerSavepoint)
4191fd7f0452Sdanielk1977     );
4192fd7f0452Sdanielk1977     pPager->aSavepoint = aNew;
4193fd7f0452Sdanielk1977     ii = pPager->nSavepoint;
4194fd7f0452Sdanielk1977     pPager->nSavepoint = nSavepoint;
4195fa86c412Sdrh 
4196fd7f0452Sdanielk1977     /* Populate the PagerSavepoint structures just allocated. */
4197fd7f0452Sdanielk1977     for(/* no-op */; ii<nSavepoint; ii++){
419812dd5496Sdanielk1977       assert( pPager->dbSizeValid );
4199fd7f0452Sdanielk1977       aNew[ii].nOrig = pPager->dbSize;
420067ddef69Sdanielk1977       if( pPager->journalOpen && pPager->journalOff>0 ){
420167ddef69Sdanielk1977         aNew[ii].iOffset = pPager->journalOff;
420267ddef69Sdanielk1977       }else{
420367ddef69Sdanielk1977         aNew[ii].iOffset = JOURNAL_HDR_SZ(pPager);
420467ddef69Sdanielk1977       }
4205fd7f0452Sdanielk1977       aNew[ii].iSubRec = pPager->stmtNRec;
4206fd7f0452Sdanielk1977       aNew[ii].pInSavepoint = sqlite3BitvecCreate(pPager->dbSize);
4207fd7f0452Sdanielk1977       if( !aNew[ii].pInSavepoint ){
4208fd7f0452Sdanielk1977         return SQLITE_NOMEM;
4209fa86c412Sdrh       }
4210fa86c412Sdrh     }
4211fd7f0452Sdanielk1977 
4212fd7f0452Sdanielk1977     /* Open the sub-journal, if it is not already opened. */
4213fd7f0452Sdanielk1977     rc = openSubJournal(pPager);
4214fd7f0452Sdanielk1977   }
4215fd7f0452Sdanielk1977 
421686f8c197Sdrh   return rc;
421786f8c197Sdrh }
4218fa86c412Sdrh 
4219fa86c412Sdrh /*
4220fd7f0452Sdanielk1977 ** Parameter op is always either SAVEPOINT_ROLLBACK or SAVEPOINT_RELEASE.
4221fd7f0452Sdanielk1977 ** If it is SAVEPOINT_RELEASE, then release and destroy the savepoint with
4222fd7f0452Sdanielk1977 ** index iSavepoint. If it is SAVEPOINT_ROLLBACK, then rollback all changes
4223fd7f0452Sdanielk1977 ** that have occured since savepoint iSavepoint was created.
4224fd7f0452Sdanielk1977 **
4225fd7f0452Sdanielk1977 ** In either case, all savepoints with an index greater than iSavepoint
4226fd7f0452Sdanielk1977 ** are destroyed.
4227fd7f0452Sdanielk1977 **
4228fd7f0452Sdanielk1977 ** If there are less than (iSavepoint+1) active savepoints when this
4229fd7f0452Sdanielk1977 ** function is called it is a no-op.
4230fa86c412Sdrh */
4231fd7f0452Sdanielk1977 int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint){
4232fd7f0452Sdanielk1977   int rc = SQLITE_OK;
4233fd7f0452Sdanielk1977 
4234fd7f0452Sdanielk1977   assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
4235fd7f0452Sdanielk1977 
4236fd7f0452Sdanielk1977   if( iSavepoint<pPager->nSavepoint ){
4237fd7f0452Sdanielk1977     int ii;
4238fd7f0452Sdanielk1977     int nNew = iSavepoint + (op==SAVEPOINT_ROLLBACK);
4239fd7f0452Sdanielk1977     for(ii=nNew; ii<pPager->nSavepoint; ii++){
4240fd7f0452Sdanielk1977       sqlite3BitvecDestroy(pPager->aSavepoint[ii].pInSavepoint);
4241b3175389Sdanielk1977     }
4242fd7f0452Sdanielk1977     pPager->nSavepoint = nNew;
4243fd7f0452Sdanielk1977 
42448a7adb0dSdanielk1977     if( op==SAVEPOINT_ROLLBACK && pPager->jfd->pMethods ){
4245fd7f0452Sdanielk1977       PagerSavepoint *pSavepoint = (nNew==0)?0:&pPager->aSavepoint[nNew-1];
4246fd7f0452Sdanielk1977       rc = pagerPlaybackSavepoint(pPager, pSavepoint);
4247fd7f0452Sdanielk1977       assert(rc!=SQLITE_DONE);
4248fa86c412Sdrh     }
4249fa86c412Sdrh 
4250fd7f0452Sdanielk1977     /* If this is a release of the outermost savepoint, truncate
4251fd7f0452Sdanielk1977     ** the sub-journal. */
4252fd7f0452Sdanielk1977     if( nNew==0 && op==SAVEPOINT_RELEASE && pPager->sjfd->pMethods ){
4253fd7f0452Sdanielk1977       assert( rc==SQLITE_OK );
4254fd7f0452Sdanielk1977       rc = sqlite3OsTruncate(pPager->sjfd, 0);
4255fd7f0452Sdanielk1977       pPager->stmtNRec = 0;
4256663fc63aSdrh     }
4257fd7f0452Sdanielk1977   }
4258fa86c412Sdrh   return rc;
4259fa86c412Sdrh }
4260fa86c412Sdrh 
426173509eeeSdrh /*
426273509eeeSdrh ** Return the full pathname of the database file.
426373509eeeSdrh */
42643b8a05f6Sdanielk1977 const char *sqlite3PagerFilename(Pager *pPager){
426573509eeeSdrh   return pPager->zFilename;
426673509eeeSdrh }
426773509eeeSdrh 
4268b20ea9d2Sdrh /*
4269d0679edcSdrh ** Return the VFS structure for the pager.
4270d0679edcSdrh */
4271d0679edcSdrh const sqlite3_vfs *sqlite3PagerVfs(Pager *pPager){
4272d0679edcSdrh   return pPager->pVfs;
4273d0679edcSdrh }
4274d0679edcSdrh 
4275d0679edcSdrh /*
4276cc6bb3eaSdrh ** Return the file handle for the database file associated
4277cc6bb3eaSdrh ** with the pager.  This might return NULL if the file has
4278cc6bb3eaSdrh ** not yet been opened.
4279cc6bb3eaSdrh */
4280cc6bb3eaSdrh sqlite3_file *sqlite3PagerFile(Pager *pPager){
4281cc6bb3eaSdrh   return pPager->fd;
4282cc6bb3eaSdrh }
4283cc6bb3eaSdrh 
4284cc6bb3eaSdrh /*
42855865e3d5Sdanielk1977 ** Return the directory of the database file.
42865865e3d5Sdanielk1977 */
42873b8a05f6Sdanielk1977 const char *sqlite3PagerDirname(Pager *pPager){
42885865e3d5Sdanielk1977   return pPager->zDirectory;
42895865e3d5Sdanielk1977 }
42905865e3d5Sdanielk1977 
42915865e3d5Sdanielk1977 /*
42925865e3d5Sdanielk1977 ** Return the full pathname of the journal file.
42935865e3d5Sdanielk1977 */
42943b8a05f6Sdanielk1977 const char *sqlite3PagerJournalname(Pager *pPager){
42955865e3d5Sdanielk1977   return pPager->zJournal;
42965865e3d5Sdanielk1977 }
42975865e3d5Sdanielk1977 
42985865e3d5Sdanielk1977 /*
42992c8997b9Sdrh ** Return true if fsync() calls are disabled for this pager.  Return FALSE
43002c8997b9Sdrh ** if fsync()s are executed normally.
43012c8997b9Sdrh */
43023b8a05f6Sdanielk1977 int sqlite3PagerNosync(Pager *pPager){
43032c8997b9Sdrh   return pPager->noSync;
43042c8997b9Sdrh }
43052c8997b9Sdrh 
43067c4ac0c5Sdrh #ifdef SQLITE_HAS_CODEC
43072c8997b9Sdrh /*
4308b20ea9d2Sdrh ** Set the codec for this pager
4309b20ea9d2Sdrh */
43103b8a05f6Sdanielk1977 void sqlite3PagerSetCodec(
4311b20ea9d2Sdrh   Pager *pPager,
4312c001c58aSdrh   void *(*xCodec)(void*,void*,Pgno,int),
4313b20ea9d2Sdrh   void *pCodecArg
4314b20ea9d2Sdrh ){
4315b20ea9d2Sdrh   pPager->xCodec = xCodec;
4316b20ea9d2Sdrh   pPager->pCodecArg = pCodecArg;
4317b20ea9d2Sdrh }
43187c4ac0c5Sdrh #endif
4319b20ea9d2Sdrh 
4320687566d7Sdanielk1977 #ifndef SQLITE_OMIT_AUTOVACUUM
4321687566d7Sdanielk1977 /*
43225e385311Sdrh ** Move the page pPg to location pgno in the file.
4323687566d7Sdanielk1977 **
43245e385311Sdrh ** There must be no references to the page previously located at
43255e385311Sdrh ** pgno (which we call pPgOld) though that page is allowed to be
4326b3df2e1cSdrh ** in cache.  If the page previously located at pgno is not already
43275e385311Sdrh ** in the rollback journal, it is not put there by by this routine.
4328687566d7Sdanielk1977 **
43295e385311Sdrh ** References to the page pPg remain valid. Updating any
43305e385311Sdrh ** meta-data associated with pPg (i.e. data stored in the nExtra bytes
4331687566d7Sdanielk1977 ** allocated along with the page) is the responsibility of the caller.
4332687566d7Sdanielk1977 **
43335fd057afSdanielk1977 ** A transaction must be active when this routine is called. It used to be
43345fd057afSdanielk1977 ** required that a statement transaction was not active, but this restriction
43355fd057afSdanielk1977 ** has been removed (CREATE INDEX needs to move a page when a statement
43365fd057afSdanielk1977 ** transaction is active).
43374c999999Sdanielk1977 **
43384c999999Sdanielk1977 ** If the fourth argument, isCommit, is non-zero, then this page is being
43394c999999Sdanielk1977 ** moved as part of a database reorganization just before the transaction
43404c999999Sdanielk1977 ** is being committed. In this case, it is guaranteed that the database page
43414c999999Sdanielk1977 ** pPg refers to will not be written to again within this transaction.
4342687566d7Sdanielk1977 */
43434c999999Sdanielk1977 int sqlite3PagerMovepage(Pager *pPager, DbPage *pPg, Pgno pgno, int isCommit){
43445e385311Sdrh   PgHdr *pPgOld;  /* The page being overwritten. */
434594daf7fdSdanielk1977   Pgno needSyncPgno = 0;
43461fab7b66Sdanielk1977   int rc;
4347687566d7Sdanielk1977 
4348687566d7Sdanielk1977   assert( pPg->nRef>0 );
4349687566d7Sdanielk1977 
43501fab7b66Sdanielk1977   /* If the page being moved is dirty and has not been saved by the latest
43511fab7b66Sdanielk1977   ** savepoint, then save the current contents of the page into the
43521fab7b66Sdanielk1977   ** sub-journal now. This is required to handle the following scenario:
43531fab7b66Sdanielk1977   **
43541fab7b66Sdanielk1977   **   BEGIN;
43551fab7b66Sdanielk1977   **     <journal page X, then modify it in memory>
43561fab7b66Sdanielk1977   **     SAVEPOINT one;
43571fab7b66Sdanielk1977   **       <Move page X to location Y>
43581fab7b66Sdanielk1977   **     ROLLBACK TO one;
43591fab7b66Sdanielk1977   **
43601fab7b66Sdanielk1977   ** If page X were not written to the sub-journal here, it would not
43611fab7b66Sdanielk1977   ** be possible to restore its contents when the "ROLLBACK TO one"
43621fab7b66Sdanielk1977   ** statement were processed.
43631fab7b66Sdanielk1977   */
43641fab7b66Sdanielk1977   if( pPg->flags&PGHDR_DIRTY
43651fab7b66Sdanielk1977    && subjRequiresPage(pPg)
43661fab7b66Sdanielk1977    && SQLITE_OK!=(rc = subjournalPage(pPg))
43671fab7b66Sdanielk1977   ){
43681fab7b66Sdanielk1977     return rc;
43691fab7b66Sdanielk1977   }
43701fab7b66Sdanielk1977 
437130d53701Sdrh   PAGERTRACE(("MOVE %d page %d (needSync=%d) moves to %d\n",
437230d53701Sdrh       PAGERID(pPager), pPg->pgno, (pPg->flags&PGHDR_NEED_SYNC)?1:0, pgno));
4373b0603416Sdrh   IOTRACE(("MOVE %p %d %d\n", pPager, pPg->pgno, pgno))
4374ef73ee9aSdanielk1977 
4375b4626a3eSdanielk1977   pager_get_content(pPg);
43764c999999Sdanielk1977 
43774c999999Sdanielk1977   /* If the journal needs to be sync()ed before page pPg->pgno can
43784c999999Sdanielk1977   ** be written to, store pPg->pgno in local variable needSyncPgno.
43794c999999Sdanielk1977   **
43804c999999Sdanielk1977   ** If the isCommit flag is set, there is no need to remember that
43814c999999Sdanielk1977   ** the journal needs to be sync()ed before database page pPg->pgno
43824c999999Sdanielk1977   ** can be written to. The caller has already promised not to write to it.
43834c999999Sdanielk1977   */
43848c0a791aSdanielk1977   if( (pPg->flags&PGHDR_NEED_SYNC) && !isCommit ){
438594daf7fdSdanielk1977     needSyncPgno = pPg->pgno;
43863460d19cSdanielk1977     assert( pageInJournal(pPg) || pPg->pgno>pPager->dbOrigSize );
43878c0a791aSdanielk1977     assert( pPg->flags&PGHDR_DIRTY );
4388ae82558bSdanielk1977     assert( pPager->needSync );
438994daf7fdSdanielk1977   }
439094daf7fdSdanielk1977 
4391ef73ee9aSdanielk1977   /* If the cache contains a page with page-number pgno, remove it
439285b623f2Sdrh   ** from its hash chain. Also, if the PgHdr.needSync was set for
4393599fcbaeSdanielk1977   ** page pgno before the 'move' operation, it needs to be retained
4394599fcbaeSdanielk1977   ** for the page moved there.
4395f5fdda82Sdanielk1977   */
4396bc2ca9ebSdanielk1977   pPg->flags &= ~PGHDR_NEED_SYNC;
4397687566d7Sdanielk1977   pPgOld = pager_lookup(pPager, pgno);
43988c0a791aSdanielk1977   assert( !pPgOld || pPgOld->nRef==1 );
4399687566d7Sdanielk1977   if( pPgOld ){
44008c0a791aSdanielk1977     pPg->flags |= (pPgOld->flags&PGHDR_NEED_SYNC);
4401ef73ee9aSdanielk1977   }
4402687566d7Sdanielk1977 
44038c0a791aSdanielk1977   sqlite3PcacheMove(pPg, pgno);
44048c0a791aSdanielk1977   if( pPgOld ){
4405bc2ca9ebSdanielk1977     sqlite3PcacheDrop(pPgOld);
4406f5fdda82Sdanielk1977   }
4407f5fdda82Sdanielk1977 
4408c047b9f7Sdrh   sqlite3PcacheMakeDirty(pPg);
4409687566d7Sdanielk1977   pPager->dirtyCache = 1;
4410d138c016Sdrh   pPager->dbModified = 1;
4411687566d7Sdanielk1977 
441294daf7fdSdanielk1977   if( needSyncPgno ){
441394daf7fdSdanielk1977     /* If needSyncPgno is non-zero, then the journal file needs to be
441494daf7fdSdanielk1977     ** sync()ed before any data is written to database file page needSyncPgno.
441594daf7fdSdanielk1977     ** Currently, no such page exists in the page-cache and the
44164c999999Sdanielk1977     ** "is journaled" bitvec flag has been set. This needs to be remedied by
44174c999999Sdanielk1977     ** loading the page into the pager-cache and setting the PgHdr.needSync
44184c999999Sdanielk1977     ** flag.
4419ae82558bSdanielk1977     **
4420a98d7b47Sdanielk1977     ** If the attempt to load the page into the page-cache fails, (due
4421f5e7bb51Sdrh     ** to a malloc() or IO failure), clear the bit in the pInJournal[]
4422a98d7b47Sdanielk1977     ** array. Otherwise, if the page is loaded and written again in
4423a98d7b47Sdanielk1977     ** this transaction, it may be written to the database file before
4424a98d7b47Sdanielk1977     ** it is synced into the journal file. This way, it may end up in
4425a98d7b47Sdanielk1977     ** the journal file twice, but that is not a problem.
4426a98d7b47Sdanielk1977     **
44273b8a05f6Sdanielk1977     ** The sqlite3PagerGet() call may cause the journal to sync. So make
4428ae82558bSdanielk1977     ** sure the Pager.needSync flag is set too.
442994daf7fdSdanielk1977     */
44303b8a05f6Sdanielk1977     PgHdr *pPgHdr;
4431ae82558bSdanielk1977     assert( pPager->needSync );
44323b8a05f6Sdanielk1977     rc = sqlite3PagerGet(pPager, needSyncPgno, &pPgHdr);
443387c29a94Sdanielk1977     if( rc!=SQLITE_OK ){
44343460d19cSdanielk1977       if( pPager->pInJournal && needSyncPgno<=pPager->dbOrigSize ){
4435f5e7bb51Sdrh         sqlite3BitvecClear(pPager->pInJournal, needSyncPgno);
4436a98d7b47Sdanielk1977       }
443787c29a94Sdanielk1977       return rc;
443887c29a94Sdanielk1977     }
4439ae82558bSdanielk1977     pPager->needSync = 1;
4440b3df2e1cSdrh     assert( pPager->noSync==0 && !MEMDB );
44418c0a791aSdanielk1977     pPgHdr->flags |= PGHDR_NEED_SYNC;
4442c047b9f7Sdrh     sqlite3PcacheMakeDirty(pPgHdr);
44433b8a05f6Sdanielk1977     sqlite3PagerUnref(pPgHdr);
444494daf7fdSdanielk1977   }
444594daf7fdSdanielk1977 
4446687566d7Sdanielk1977   return SQLITE_OK;
4447687566d7Sdanielk1977 }
4448687566d7Sdanielk1977 #endif
4449687566d7Sdanielk1977 
44503b8a05f6Sdanielk1977 /*
44513b8a05f6Sdanielk1977 ** Return a pointer to the data for the specified page.
44523b8a05f6Sdanielk1977 */
44533b8a05f6Sdanielk1977 void *sqlite3PagerGetData(DbPage *pPg){
445471d5d2cdSdanielk1977   assert( pPg->nRef>0 || pPg->pPager->memDb );
44558c0a791aSdanielk1977   return pPg->pData;
44563b8a05f6Sdanielk1977 }
44573b8a05f6Sdanielk1977 
44583b8a05f6Sdanielk1977 /*
44593b8a05f6Sdanielk1977 ** Return a pointer to the Pager.nExtra bytes of "extra" space
44603b8a05f6Sdanielk1977 ** allocated along with the specified page.
44613b8a05f6Sdanielk1977 */
44623b8a05f6Sdanielk1977 void *sqlite3PagerGetExtra(DbPage *pPg){
44633b8a05f6Sdanielk1977   Pager *pPager = pPg->pPager;
44648c0a791aSdanielk1977   return (pPager?pPg->pExtra:0);
44653b8a05f6Sdanielk1977 }
44663b8a05f6Sdanielk1977 
446741483468Sdanielk1977 /*
446841483468Sdanielk1977 ** Get/set the locking-mode for this pager. Parameter eMode must be one
446941483468Sdanielk1977 ** of PAGER_LOCKINGMODE_QUERY, PAGER_LOCKINGMODE_NORMAL or
447041483468Sdanielk1977 ** PAGER_LOCKINGMODE_EXCLUSIVE. If the parameter is not _QUERY, then
447141483468Sdanielk1977 ** the locking-mode is set to the value specified.
447241483468Sdanielk1977 **
447341483468Sdanielk1977 ** The returned value is either PAGER_LOCKINGMODE_NORMAL or
447441483468Sdanielk1977 ** PAGER_LOCKINGMODE_EXCLUSIVE, indicating the current (possibly updated)
447541483468Sdanielk1977 ** locking-mode.
447641483468Sdanielk1977 */
447741483468Sdanielk1977 int sqlite3PagerLockingMode(Pager *pPager, int eMode){
4478369339dbSdrh   assert( eMode==PAGER_LOCKINGMODE_QUERY
4479369339dbSdrh             || eMode==PAGER_LOCKINGMODE_NORMAL
4480369339dbSdrh             || eMode==PAGER_LOCKINGMODE_EXCLUSIVE );
4481369339dbSdrh   assert( PAGER_LOCKINGMODE_QUERY<0 );
4482369339dbSdrh   assert( PAGER_LOCKINGMODE_NORMAL>=0 && PAGER_LOCKINGMODE_EXCLUSIVE>=0 );
4483369339dbSdrh   if( eMode>=0 && !pPager->tempFile ){
44841bd10f8aSdrh     pPager->exclusiveMode = (u8)eMode;
448541483468Sdanielk1977   }
448641483468Sdanielk1977   return (int)pPager->exclusiveMode;
448741483468Sdanielk1977 }
448841483468Sdanielk1977 
44893b02013eSdrh /*
449004335886Sdrh ** Get/set the journal-mode for this pager. Parameter eMode must be one of:
44913b02013eSdrh **
449204335886Sdrh **    PAGER_JOURNALMODE_QUERY
449304335886Sdrh **    PAGER_JOURNALMODE_DELETE
449404335886Sdrh **    PAGER_JOURNALMODE_TRUNCATE
449504335886Sdrh **    PAGER_JOURNALMODE_PERSIST
449604335886Sdrh **    PAGER_JOURNALMODE_OFF
449704335886Sdrh **
449804335886Sdrh ** If the parameter is not _QUERY, then the journal-mode is set to the
449904335886Sdrh ** value specified.
450004335886Sdrh **
450104335886Sdrh ** The returned indicate the current (possibly updated)
45023b02013eSdrh ** journal-mode.
45033b02013eSdrh */
45043b02013eSdrh int sqlite3PagerJournalMode(Pager *pPager, int eMode){
4505b3175389Sdanielk1977   if( !MEMDB ){
45063b02013eSdrh     assert( eMode==PAGER_JOURNALMODE_QUERY
45073b02013eSdrh               || eMode==PAGER_JOURNALMODE_DELETE
450804335886Sdrh               || eMode==PAGER_JOURNALMODE_TRUNCATE
4509fdc40e91Sdrh               || eMode==PAGER_JOURNALMODE_PERSIST
4510b3175389Sdanielk1977               || eMode==PAGER_JOURNALMODE_OFF
4511b3175389Sdanielk1977               || eMode==PAGER_JOURNALMODE_MEMORY );
45123b02013eSdrh     assert( PAGER_JOURNALMODE_QUERY<0 );
4513fdc40e91Sdrh     if( eMode>=0 ){
45141bd10f8aSdrh       pPager->journalMode = (u8)eMode;
451504335886Sdrh     }else{
451604335886Sdrh       assert( eMode==PAGER_JOURNALMODE_QUERY );
45173b02013eSdrh     }
4518b3175389Sdanielk1977   }
4519fdc40e91Sdrh   return (int)pPager->journalMode;
45203b02013eSdrh }
45213b02013eSdrh 
4522b53e4960Sdanielk1977 /*
4523b53e4960Sdanielk1977 ** Get/set the size-limit used for persistent journal files.
4524b53e4960Sdanielk1977 */
4525b53e4960Sdanielk1977 i64 sqlite3PagerJournalSizeLimit(Pager *pPager, i64 iLimit){
4526b53e4960Sdanielk1977   if( iLimit>=-1 ){
4527b53e4960Sdanielk1977     pPager->journalSizeLimit = iLimit;
4528b53e4960Sdanielk1977   }
4529b53e4960Sdanielk1977   return pPager->journalSizeLimit;
4530b53e4960Sdanielk1977 }
4531b53e4960Sdanielk1977 
45322e66f0b9Sdrh #endif /* SQLITE_OMIT_DISKIO */
4533