xref: /sqlite-3.40.0/src/pager.c (revision 8a29dfde)
1 /*
2 ** 2001 September 15
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This is the implementation of the page cache subsystem or "pager".
13 **
14 ** The pager is used to access a database disk file.  It implements
15 ** atomic commit and rollback through the use of a journal file that
16 ** is separate from the database file.  The pager also implements file
17 ** locking to prevent two processes from writing the same database
18 ** file simultaneously, or one process from reading the database while
19 ** another is writing.
20 **
21 ** @(#) $Id: pager.c,v 1.423 2008/04/03 10:13:01 danielk1977 Exp $
22 */
23 #ifndef SQLITE_OMIT_DISKIO
24 #include "sqliteInt.h"
25 #include <assert.h>
26 #include <string.h>
27 
28 /*
29 ** Macros for troubleshooting.  Normally turned off (change 0 to 1 to enable).
30 */
31 #if 0   /* enabled: each PAGERTRACEn() forwards its n arguments to printf() */
32 #define sqlite3DebugPrintf printf
33 #define PAGERTRACE1(X)       sqlite3DebugPrintf(X)
34 #define PAGERTRACE2(X,Y)     sqlite3DebugPrintf(X,Y)
35 #define PAGERTRACE3(X,Y,Z)   sqlite3DebugPrintf(X,Y,Z)
36 #define PAGERTRACE4(X,Y,Z,W) sqlite3DebugPrintf(X,Y,Z,W)
37 #define PAGERTRACE5(X,Y,Z,W,V) sqlite3DebugPrintf(X,Y,Z,W,V)
38 #else   /* disabled: every PAGERTRACEn() expands to nothing */
39 #define PAGERTRACE1(X)
40 #define PAGERTRACE2(X,Y)
41 #define PAGERTRACE3(X,Y,Z)
42 #define PAGERTRACE4(X,Y,Z,W)
43 #define PAGERTRACE5(X,Y,Z,W,V)
44 #endif
45 
/*
** The following two macros are used within the PAGERTRACEX() macros above
** to print out file-descriptors.
**
** PAGERID(p) takes a pointer to a Pager struct and yields its database
** file handle (p->fd) converted to int.  FILEHANDLEID(fd) converts an
** sqlite3_file handle to int.  The results are identifiers for trace
** output only: converting a pointer to int may discard high-order bits.
**
** Every macro argument is parenthesized so that an arbitrary expression
** can be passed, e.g. PAGERID(&pager) or FILEHANDLEID(a+b).
*/
#define PAGERID(p) ((int)((p)->fd))
#define FILEHANDLEID(fd) ((int)(fd))
56 
57 /*
58 ** The page cache as a whole is always in one of the following
59 ** states:
60 **
61 **   PAGER_UNLOCK        The page cache is not currently reading or
62 **                       writing the database file.  There is no
63 **                       data held in memory.  This is the initial
64 **                       state.
65 **
66 **   PAGER_SHARED        The page cache is reading the database.
67 **                       Writing is not permitted.  There can be
68 **                       multiple readers accessing the same database
69 **                       file at the same time.
70 **
71 **   PAGER_RESERVED      This process has reserved the database for writing
72 **                       but has not yet made any changes.  Only one process
73 **                       at a time can reserve the database.  The original
74 **                       database file has not been modified so other
75 **                       processes may still be reading the on-disk
76 **                       database file.
77 **
78 **   PAGER_EXCLUSIVE     The page cache is writing the database.
79 **                       Access is exclusive.  No other processes or
80 **                       threads can be reading or writing while one
81 **                       process is writing.
82 **
83 **   PAGER_SYNCED        The pager moves to this state from PAGER_EXCLUSIVE
84 **                       after all dirty pages have been written to the
85 **                       database file and the file has been synced to
86 **                       disk. All that remains to do is to remove or
87 **                       truncate the journal file and the transaction
88 **                       will be committed.
89 **
90 ** The page cache comes up in PAGER_UNLOCK.  The first time a
91 ** sqlite3PagerGet() occurs, the state transitions to PAGER_SHARED.
92 ** After all pages have been released using sqlite3PagerUnref(),
93 ** the state transitions back to PAGER_UNLOCK.  The first time
94 ** that sqlite3PagerWrite() is called, the state transitions to
95 ** PAGER_RESERVED.  (Note that sqlite3PagerWrite() can only be
96 ** called on an outstanding page which means that the pager must
97 ** be in PAGER_SHARED before it transitions to PAGER_RESERVED.)
98 ** PAGER_RESERVED means that there is an open rollback journal.
99 ** The transition to PAGER_EXCLUSIVE occurs before any changes
100 ** are made to the database file, though writes to the rollback
101 ** journal occur with just PAGER_RESERVED.  After an sqlite3PagerRollback()
102 ** or sqlite3PagerCommitPhaseTwo(), the state can go back to PAGER_SHARED,
103 ** or it can stay at PAGER_EXCLUSIVE if we are in exclusive access mode.
104 */
105 #define PAGER_UNLOCK      0   /* no reading or writing; the initial state */
106 #define PAGER_SHARED      1   /* same as SHARED_LOCK */
107 #define PAGER_RESERVED    2   /* same as RESERVED_LOCK */
108 #define PAGER_EXCLUSIVE   4   /* same as EXCLUSIVE_LOCK */
109 #define PAGER_SYNCED      5   /* db file written and synced; journal not yet finalized */
110 
111 /*
112 ** If the SQLITE_BUSY_RESERVED_LOCK macro is set to true at compile-time,
113 ** then failed attempts to get a reserved lock will invoke the busy callback.
114 ** This is off by default.  To see why, consider the following scenario:
115 **
116 ** Suppose thread A already has a shared lock and wants a reserved lock.
117 ** Thread B already has a reserved lock and wants an exclusive lock.  If
118 ** both threads are using their busy callbacks, it might be a long time
119 ** before one of the threads gives up and allows the other to proceed.
120 ** But if the thread trying to get the reserved lock gives up quickly
121 ** (if it never invokes its busy callback) then the contention will be
122 ** resolved quickly.
123 */
124 #ifndef SQLITE_BUSY_RESERVED_LOCK
125 # define SQLITE_BUSY_RESERVED_LOCK 0   /* default: off unless overridden at compile-time */
126 #endif
127 
128 /*
129 ** This macro rounds values up so that if the value is an address it
130 ** is guaranteed to be an address that is aligned to an 8-byte boundary.
131 */
132 #define FORCE_ALIGNMENT(X)   (((X)+7)&~7)   /* add 7, then clear the low three bits */
133 
134 typedef struct PgHdr PgHdr;
135 
136 /*
137 ** Each pager stores all currently unreferenced pages in a list sorted
138 ** in least-recently-used (LRU) order (i.e. the first item on the list has
139 ** not been referenced in a long time, the last item has been recently
140 ** used). An instance of this structure is included as part of each
141 ** pager structure for this purpose (variable Pager.lru).
142 **
143 ** Additionally, if memory-management is enabled, all unreferenced pages
144 ** are stored in a global LRU list (global variable sqlite3LruPageList).
145 **
146 ** In both cases, the PagerLruList.pFirstSynced variable points to
147 ** the first page in the corresponding list that does not require an
148 ** fsync() operation before its memory can be reclaimed. If no such
149 ** page exists, PagerLruList.pFirstSynced is set to NULL.
150 */
151 typedef struct PagerLruList PagerLruList;
152 struct PagerLruList {
153   PgHdr *pFirst;         /* First page in LRU list (the least recently used) */
154   PgHdr *pLast;          /* Last page in LRU list (the most recently used) */
155   PgHdr *pFirstSynced;   /* First page in list with PgHdr.needSync==0, or NULL */
156 };
157 
158 /*
159 ** The following structure contains the next and previous pointers used
160 ** to link a PgHdr structure into a PagerLruList linked list.
161 */
162 typedef struct PagerLruLink PagerLruLink;
163 struct PagerLruLink {
164   PgHdr *pNext;   /* Following page in the list; 0 for the last entry */
165   PgHdr *pPrev;   /* Preceding page in the list; 0 for the first entry */
166 };
167 
168 /*
169 ** Each in-memory image of a page begins with the following header.
170 ** This header is only visible to this pager module.  The client
171 ** code that calls pager sees only the data that follows the header.
172 **
173 ** Client code should call sqlite3PagerWrite() on a page prior to making
174 ** any modifications to that page.  The first time sqlite3PagerWrite()
175 ** is called, the original page contents are written into the rollback
176 ** journal and PgHdr.inJournal and PgHdr.needSync are set.  Later, once
177 ** the journal page has made it onto the disk surface, PgHdr.needSync
178 ** is cleared.  The modified page cannot be written back into the original
179 ** database file until the journal page has been synced to disk and the
180 ** PgHdr.needSync has been cleared.
181 **
182 ** The PgHdr.dirty flag is set when sqlite3PagerWrite() is called and
183 ** is cleared again when the page content is written back to the original
184 ** database file.
185 **
186 ** Details of important structure elements:
187 **
188 ** needSync
189 **
190 **     If this is true, this means that it is not safe to write the page
191 **     content to the database because the original content needed
192 **     for rollback has not been synced to the main rollback journal.
193 **     The original content may have been written to the rollback journal
194 **     but it has not yet been synced.  So we cannot write to the database
195 **     file because power failure might cause the page in the journal file
196 **     to never reach the disk.  It is as if the write to the journal file
197 **     does not occur until the journal file is synced.
198 **
199 **     This flag is false if the page content exactly matches what
200 **     currently exists in the database file.  The needSync flag is also
201 **     false if the original content has been written to the main rollback
202 **     journal and synced.  If the page represents a new page that has
203 **     been added onto the end of the database during the current
204 **     transaction, the needSync flag is true until the original database
205 **     size in the journal header has been synced to disk.
206 **
207 ** inJournal
208 **
209 **     This is true if the original page has been written into the main
210 **     rollback journal.  This is always false for new pages added to
211 **     the end of the database file during the current transaction.
212 **     And this flag says nothing about whether or not the journal
213 **     has been synced to disk.  For pages that are in the original
214 **     database file, the following expression should always be true:
215 **
216 **       inJournal = sqlite3BitvecTest(pPager->pInJournal, pgno)
217 **
218 **     The pPager->pInJournal object is only valid for the original
219 **     pages of the database, not new pages that are added to the end
220 **     of the database, so obviously the above expression cannot be
221 **     valid for new pages.  For new pages inJournal is always 0.
222 **
223 ** dirty
224 **
225 **     When true, this means that the content of the page has been
226 **     modified and needs to be written back to the database file.
227 **     If false, it means that either the content of the page is
228 **     unchanged or else the content is unimportant and we do not
229 **     care whether or not it is preserved.
230 **
231 ** alwaysRollback
232 **
233 **     This means that the sqlite3PagerDontRollback() API should be
234 **     ignored for this page.  The DontRollback() API attempts to say
235 **     that the content of the page on disk is unimportant (it is an
236 **     unused page on the freelist) so that it is unnecessary to
237 **     rollback changes to this page because the content of the page
238 **     can change without changing the meaning of the database.  This
239 **     flag overrides any DontRollback() attempt.  This flag is set
240 **     when a page that originally contained valid data is added to
241 **     the freelist.  Later in the same transaction, this page might
242 **     be pulled from the freelist and reused for something different
243 **     and at that point the DontRollback() API will be called because
244 **     pages taken from the freelist do not need to be protected by
245 **     the rollback journal.  But this flag says that the page was
246 **     not originally part of the freelist so that it still needs to
247 **     be rolled back in spite of any subsequent DontRollback() calls.
248 **
249 ** needRead
250 **
251 **     This flag means (when true) that the content of the page has
252 **     not yet been loaded from disk.  The in-memory content is just
253 **     garbage.  (Actually, we zero the content, but you should not
254 **     make any assumptions about the content nevertheless.)  If the
255 **     content is needed in the future, it should be read from the
256 **     original database file.
257 */
258 struct PgHdr {
259   Pager *pPager;                 /* The pager to which this page belongs */
260   Pgno pgno;                     /* The page number for this page */
261   PgHdr *pNextHash, *pPrevHash;  /* Hash collision chain for PgHdr.pgno */
262   PagerLruLink free;             /* Next and previous free pages */
263   PgHdr *pNextAll;               /* A list of all pages */
264   u8 inJournal;                  /* TRUE if has been written to journal */
265   u8 dirty;                      /* TRUE if we need to write back changes */
266   u8 needSync;                   /* Sync journal before writing this page */
267   u8 alwaysRollback;             /* Disable DontRollback() for this page */
268   u8 needRead;                   /* Read content if PagerWrite() is called */
269   short int nRef;                /* Number of users of this page */
270   PgHdr *pDirty, *pPrevDirty;    /* Dirty pages */
271 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
272   PagerLruLink gfree;            /* Global list of nRef==0 pages */
273 #endif
274 #ifdef SQLITE_CHECK_PAGES
275   u32 pageHash;
276 #endif
277   void *pData;                   /* Page data */
278   /* Pager.nExtra bytes of local data appended to this header */
279 };
280 
281 /*
282 ** For an in-memory only database, some extra information is recorded about
283 ** each page so that changes can be rolled back.  (Journal files are not
284 ** used for in-memory databases.)  The following information is added to
285 ** the end of every EXTRA block for in-memory databases.
286 **
287 ** This information could have been added directly to the PgHdr structure.
288 ** But then it would take up an extra 8 bytes of storage on every PgHdr
289 ** even for disk-based databases.  Splitting it out saves 8 bytes.  This
290 ** is only a savings of 0.8% but those percentages add up.
291 */
292 typedef struct PgHistory PgHistory;
293 struct PgHistory {
294   u8 *pOrig;     /* Original page text.  Restore to this on a full rollback */
295   u8 *pStmt;     /* Text as it was at the beginning of the current statement */
296   PgHdr *pNextStmt, *pPrevStmt;  /* List of pages in the statement journal */
297   u8 inStmt;                     /* TRUE if in the statement subjournal */
298 };
299 
/*
** Macros used for invoking the codec if there is one.
**
** CODEC1(P,D,N,X) is a statement: when pager P has a codec installed
** (P->xCodec!=0) it calls the codec with arguments (P->pCodecArg,D,N,X)
** and discards the result.  CODEC2(P,D,N,X) is an expression: it yields
** the codec's return value when a codec is installed and D otherwise,
** converted to (char*) in both cases.
**
** Without SQLITE_HAS_CODEC, CODEC1 expands to nothing and CODEC2 is
** simply (char*)(D).
**
** Every argument is parenthesized so that expressions such as "buf+1"
** or "&pager" behave as a single operand.  CODEC1 expands to a bare
** if-statement, so do not use it as the unbraced body of an if that
** has an else clause.
*/
#ifdef SQLITE_HAS_CODEC
# define CODEC1(P,D,N,X) \
    if( (P)->xCodec!=0 ){ (P)->xCodec((P)->pCodecArg,(D),(N),(X)); }
# define CODEC2(P,D,N,X) \
    ((char*)((P)->xCodec!=0?(P)->xCodec((P)->pCodecArg,(D),(N),(X)):(D)))
#else
# define CODEC1(P,D,N,X) /* NO-OP */
# define CODEC2(P,D,N,X) ((char*)(D))
#endif
310 
/*
** Navigate from a PgHdr pointer to the other pieces of a page:
**
**   PGHDR_TO_DATA(P)       the page content, P->pData
**   PGHDR_TO_EXTRA(G,P)    the first byte past the header G (P is unused)
**   PGHDR_TO_HIST(P,PGR)   the PgHistory located PGR->nExtra bytes past
**                          the end of header P
*/
#define PGHDR_TO_DATA(P)    ((P)->pData)
#define PGHDR_TO_EXTRA(G,P) ((void*)((G)+1))
#define PGHDR_TO_HIST(P,PGR)  \
            ((PgHistory*)((char*)((P)+1) + (PGR)->nExtra))
319 
320 /*
321 ** An open page cache is an instance of the following structure.
322 **
323 ** Pager.errCode may be set to SQLITE_IOERR, SQLITE_CORRUPT,
324 ** or SQLITE_FULL. Once one of the first two errors occurs, it persists
325 ** and is returned as the result of every major pager API call.  The
326 ** SQLITE_FULL return code is slightly different. It persists only until the
327 ** next successful rollback is performed on the pager cache. Also,
328 ** SQLITE_FULL does not affect the sqlite3PagerGet() and sqlite3PagerLookup()
329 ** APIs, they may still be used successfully.
330 */
331 struct Pager {
332   sqlite3_vfs *pVfs;          /* OS functions to use for IO */
333   u8 journalOpen;             /* True if journal file descriptors is valid */
334   u8 journalStarted;          /* True if header of journal is synced */
335   u8 useJournal;              /* Use a rollback journal on this file */
336   u8 noReadlock;              /* Do not bother to obtain readlocks */
337   u8 stmtOpen;                /* True if the statement subjournal is open */
338   u8 stmtInUse;               /* True we are in a statement subtransaction */
339   u8 stmtAutoopen;            /* Open stmt journal when main journal is opened*/
340   u8 noSync;                  /* Do not sync the journal if true */
341   u8 fullSync;                /* Do extra syncs of the journal for robustness */
342   u8 sync_flags;              /* One of SYNC_NORMAL or SYNC_FULL */
343   u8 state;                   /* PAGER_UNLOCK, _SHARED, _RESERVED, etc. */
344   u8 tempFile;                /* zFilename is a temporary file */
345   u8 readOnly;                /* True for a read-only database */
346   u8 needSync;                /* True if an fsync() is needed on the journal */
347   u8 dirtyCache;              /* True if cached pages have changed */
348   u8 alwaysRollback;          /* Disable DontRollback() for all pages */
349   u8 memDb;                   /* True to inhibit all file I/O */
350   u8 setMaster;               /* True if a m-j name has been written to jrnl */
351   u8 doNotSync;               /* Boolean. While true, do not spill the cache */
352   u8 exclusiveMode;           /* Boolean. True if locking_mode==EXCLUSIVE */
353   u8 changeCountDone;         /* Set after incrementing the change-counter */
354   u32 vfsFlags;               /* Flags for sqlite3_vfs.xOpen() */
355   int errCode;                /* One of several kinds of errors */
356   int dbSize;                 /* Number of pages in the file */
357   int origDbSize;             /* dbSize before the current change */
358   int stmtSize;               /* Size of database (in pages) at stmt_begin() */
359   int nRec;                   /* Number of pages written to the journal */
360   u32 cksumInit;              /* Quasi-random value added to every checksum */
361   int stmtNRec;               /* Number of records in stmt subjournal */
362   int nExtra;                 /* Add this many bytes to each in-memory page */
363   int pageSize;               /* Number of bytes in a page */
364   int nPage;                  /* Total number of in-memory pages */
365   int nRef;                   /* Number of in-memory pages with PgHdr.nRef>0 */
366   int mxPage;                 /* Maximum number of pages to hold in cache */
367   Pgno mxPgno;                /* Maximum allowed size of the database */
368   Bitvec *pInJournal;         /* One bit for each page in the database file */
369   Bitvec *pInStmt;            /* One bit for each page in the database */
370   char *zFilename;            /* Name of the database file */
371   char *zJournal;             /* Name of the journal file */
372   char *zDirectory;           /* Directory hold database and journal files */
373   char *zStmtJrnl;            /* Name of the statement journal file */
374   sqlite3_file *fd, *jfd;     /* File descriptors for database and journal */
375   sqlite3_file *stfd;         /* File descriptor for the statement subjournal*/
376   BusyHandler *pBusyHandler;  /* Pointer to sqlite.busyHandler */
377   PagerLruList lru;           /* LRU list of free pages */
378   PgHdr *pAll;                /* List of all pages */
379   PgHdr *pStmt;               /* List of pages in the statement subjournal */
380   PgHdr *pDirty;              /* List of all dirty pages */
381   i64 journalOff;             /* Current byte offset in the journal file */
382   i64 journalHdr;             /* Byte offset to previous journal header */
383   i64 stmtHdrOff;             /* First journal header written this statement */
384   i64 stmtCksum;              /* cksumInit when statement was started */
385   i64 stmtJSize;              /* Size of journal at stmt_begin() */
386   int sectorSize;             /* Assumed sector size during rollback */
387 #ifdef SQLITE_TEST
388   int nHit, nMiss;            /* Cache hits and missing */
389   int nRead, nWrite;          /* Database pages read/written */
390 #endif
391   void (*xDestructor)(DbPage*,int); /* Call this routine when freeing pages */
392   void (*xReiniter)(DbPage*,int);   /* Call this routine when reloading pages */
393 #ifdef SQLITE_HAS_CODEC
394   void *(*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
395   void *pCodecArg;            /* First argument to xCodec() */
396 #endif
397   int nHash;                  /* Size of the pager hash table */
398   PgHdr **aHash;              /* Hash table to map page number to PgHdr */
399 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
400   Pager *pNext;               /* Doubly linked list of pagers on which */
401   Pager *pPrev;               /* sqlite3_release_memory() will work */
402   int iInUseMM;               /* Non-zero if unavailable to MM */
403   int iInUseDB;               /* Non-zero if in sqlite3_release_memory() */
404 #endif
405   char *pTmpSpace;            /* Pager.pageSize bytes of space for tmp use */
406   char dbFileVers[16];        /* Changes whenever database file changes */
407 };
408 
/*
** The following global variables hold counters used for
** testing purposes only.  These variables do not exist in
** a non-testing build.  These variables are not thread-safe.
**
** PAGER_INCR(v) increments v in a testing build (SQLITE_TEST defined)
** and expands to nothing otherwise.  The argument is parenthesized so
** that an expression such as *pCounter is incremented as a whole
** rather than having the ++ bind to only part of it.
*/
#ifdef SQLITE_TEST
int sqlite3_pager_readdb_count = 0;    /* Number of full pages read from DB */
int sqlite3_pager_writedb_count = 0;   /* Number of full pages written to DB */
int sqlite3_pager_writej_count = 0;    /* Number of pages written to journal */
int sqlite3_pager_pgfree_count = 0;    /* Number of cache pages freed */
# define PAGER_INCR(v)  ((v)++)
#else
# define PAGER_INCR(v)
#endif
423 
424 /*
425 ** The following variable points to the head of a double-linked list
426 ** of all pagers that are eligible for page stealing by the
427 ** sqlite3_release_memory() interface.  Access to this list is
428 ** protected by the SQLITE_MUTEX_STATIC_MEM2 mutex.
429 */
430 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
431 static Pager *sqlite3PagerList = 0;   /* head of the pager list; starts out empty */
432 static PagerLruList sqlite3LruPageList = {0, 0, 0};   /* global LRU; lruListAdd()/lruListRemove() update it under SQLITE_MUTEX_STATIC_LRU */
433 #endif
434 
435 
436 /*
437 ** Journal files begin with the following magic string.  The data
438 ** was obtained from /dev/random.  It is used only as a sanity check.
439 **
440 ** Since version 2.8.0, the journal format contains additional sanity
441 ** checking information.  If the power fails while the journal is being
442 ** written, semi-random garbage data might appear in the journal
443 ** file after power is restored.  If an attempt is then made
444 ** to roll the journal back, the database could be corrupted.  The additional
445 ** sanity checking data is an attempt to discover the garbage in the
446 ** journal and ignore it.
447 **
448 ** The sanity checking information for the new journal format consists
449 ** of a 32-bit checksum on each page of data.  The checksum covers both
450 ** the page number and the pPager->pageSize bytes of data for the page.
451 ** This cksum is initialized to a 32-bit random value that appears in the
452 ** journal file right after the header.  The random initializer is important,
453 ** because garbage data that appears at the end of a journal is likely
454 ** data that was once in other files that have now been deleted.  If the
455 ** garbage data came from an obsolete journal file, the checksums might
456 ** be correct.  But by initializing the checksum to random value which
457 ** is different for every journal, we minimize that risk.
458 */
static const unsigned char aJournalMagic[8] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd7
};
462 
/*
** The size of the header and of each page in the journal is determined
** by the following macros.
**
** JOURNAL_PG_SZ(pPager) is the number of bytes one page record occupies
** in the journal: the page content (pPager->pageSize bytes) plus 8 bytes
** of per-record overhead (presumably the 4-byte page number and the
** 4-byte checksum -- confirm against the journal writer).
**
** The argument is parenthesized so that any pointer expression, such
** as &pager, may be passed.
*/
#define JOURNAL_PG_SZ(pPager)  (((pPager)->pageSize) + 8)
468 
/*
** The journal header size for this pager. In the future, this could be
** set to some value read from the disk controller. The important
** characteristic is that it is the same size as a disk sector.
**
** The argument is parenthesized so that any pointer expression, such
** as &pager, may be passed.
*/
#define JOURNAL_HDR_SZ(pPager) ((pPager)->sectorSize)
475 
476 /*
477 ** The macro MEMDB is true if we are dealing with an in-memory database.
478 ** We do this as a macro so that if the SQLITE_OMIT_MEMORYDB macro is set,
479 ** the value of MEMDB will be a constant and the compiler will optimize
480 ** out code that would never execute.
481 */
482 #ifdef SQLITE_OMIT_MEMORYDB
483 # define MEMDB 0   /* compile-time constant when in-memory databases are omitted */
484 #else
485 # define MEMDB pPager->memDb   /* needs a variable named pPager in scope at the point of use */
486 #endif
487 
488 /*
489 ** Page number PAGER_MJ_PGNO is never used in an SQLite database (it is
490 ** reserved for working around a windows/posix incompatibility). It is
491 ** used in the journal to signify that the remainder of the journal file
492 ** is devoted to storing a master journal name - there are no more pages to
493 ** roll back. See comments for function writeMasterJournal() for details.
494 */
495 /* #define PAGER_MJ_PGNO(x) (PENDING_BYTE/((x)->pageSize)) */
496 #define PAGER_MJ_PGNO(x) ((PENDING_BYTE/((x)->pageSize))+1)   /* x: pointer to an object with a pageSize member */
497 
498 /*
499 ** The maximum legal page number is (2^31 - 1).
500 */
501 #define PAGER_MAX_PGNO 2147483647   /* 0x7fffffff */
502 
503 /*
504 ** The pagerEnter() and pagerLeave() routines acquire and release
505 ** a mutex on each pager.  The mutex is recursive.
506 **
507 ** This is a special-purpose mutex.  It only provides mutual exclusion
508 ** between the Btree and the Memory Management sqlite3_release_memory()
509 ** function.  It does not prevent, for example, two Btrees from accessing
510 ** the same pager at the same time.  Other general-purpose mutexes in
511 ** the btree layer handle that chore.
512 */
513 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
514   static void pagerEnter(Pager *p){
515     p->iInUseDB++;   /* count this (possibly nested) entry */
516     if( p->iInUseMM && p->iInUseDB==1 ){   /* outermost entry while iInUseMM is set */
517       sqlite3_mutex *mutex;
518       mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MEM2);
519       p->iInUseDB = 0;   /* withdraw the claim while waiting for the mutex */
520       sqlite3_mutex_enter(mutex);   /* presumably held by the memory manager while iInUseMM is set -- TODO confirm */
521       p->iInUseDB = 1;   /* re-assert the claim before the mutex is released */
522       sqlite3_mutex_leave(mutex);
523     }
524     assert( p->iInUseMM==0 );
525   }
526   static void pagerLeave(Pager *p){
527     p->iInUseDB--;   /* undo one pagerEnter() */
528     assert( p->iInUseDB>=0 );
529   }
530 #else
531 # define pagerEnter(X)   /* no-op when memory management is not compiled in */
532 # define pagerLeave(X)   /* no-op when memory management is not compiled in */
533 #endif
534 
535 /*
536 ** Add page pPg to the end of the linked list managed by structure
537 ** pList (pPg becomes the last entry in the list - the most recently
538 ** used). Argument pLink should point to either pPg->free or pPg->gfree,
539 ** depending on whether pPg is being added to the pager-specific or
540 ** global LRU list.
541 */
542 static void listAdd(PagerLruList *pList, PagerLruLink *pLink, PgHdr *pPg){
543   pLink->pNext = 0;
544   pLink->pPrev = pList->pLast;
545 
546 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
547   assert(pLink==&pPg->free || pLink==&pPg->gfree);
548   assert(pLink==&pPg->gfree || pList!=&sqlite3LruPageList);
549 #endif
550 
551   if( pList->pLast ){
552     int iOff = (char *)pLink - (char *)pPg;
553     PagerLruLink *pLastLink = (PagerLruLink *)(&((u8 *)pList->pLast)[iOff]);
554     pLastLink->pNext = pPg;
555   }else{
556     assert(!pList->pFirst);
557     pList->pFirst = pPg;
558   }
559 
560   pList->pLast = pPg;
561   if( !pList->pFirstSynced && pPg->needSync==0 ){
562     pList->pFirstSynced = pPg;
563   }
564 }
565 
566 /*
567 ** Remove pPg from the list managed by the structure pointed to by pList.
568 **
569 ** Argument pLink should point to either pPg->free or pPg->gfree, depending
570 ** on whether pPg is being added to the pager-specific or global LRU list.
571 */
572 static void listRemove(PagerLruList *pList, PagerLruLink *pLink, PgHdr *pPg){
573   int iOff = (char *)pLink - (char *)pPg;
574 
575 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
576   assert(pLink==&pPg->free || pLink==&pPg->gfree);
577   assert(pLink==&pPg->gfree || pList!=&sqlite3LruPageList);
578 #endif
579 
580   if( pPg==pList->pFirst ){
581     pList->pFirst = pLink->pNext;
582   }
583   if( pPg==pList->pLast ){
584     pList->pLast = pLink->pPrev;
585   }
586   if( pLink->pPrev ){
587     PagerLruLink *pPrevLink = (PagerLruLink *)(&((u8 *)pLink->pPrev)[iOff]);
588     pPrevLink->pNext = pLink->pNext;
589   }
590   if( pLink->pNext ){
591     PagerLruLink *pNextLink = (PagerLruLink *)(&((u8 *)pLink->pNext)[iOff]);
592     pNextLink->pPrev = pLink->pPrev;
593   }
594   if( pPg==pList->pFirstSynced ){
595     PgHdr *p = pLink->pNext;
596     while( p && p->needSync ){
597       PagerLruLink *pL = (PagerLruLink *)(&((u8 *)p)[iOff]);
598       p = pL->pNext;
599     }
600     pList->pFirstSynced = p;
601   }
602 
603   pLink->pNext = pLink->pPrev = 0;
604 }
605 
606 /*
607 ** Add page pPg to the list of free pages for the pager. If
608 ** memory-management is enabled, also add the page to the global
609 ** list of free pages.
610 */
611 static void lruListAdd(PgHdr *pPg){
612   listAdd(&pPg->pPager->lru, &pPg->free, pPg);
613 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
614   if( !pPg->pPager->memDb ){
615     sqlite3_mutex_enter(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
616     listAdd(&sqlite3LruPageList, &pPg->gfree, pPg);
617     sqlite3_mutex_leave(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
618   }
619 #endif
620 }
621 
622 /*
623 ** Remove page pPg from the list of free pages for the associated pager.
624 ** If memory-management is enabled, also remove pPg from the global list
625 ** of free pages.
626 */
627 static void lruListRemove(PgHdr *pPg){
628   listRemove(&pPg->pPager->lru, &pPg->free, pPg);
629 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
630   if( !pPg->pPager->memDb ){
631     sqlite3_mutex_enter(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
632     listRemove(&sqlite3LruPageList, &pPg->gfree, pPg);
633     sqlite3_mutex_leave(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
634   }
635 #endif
636 }
637 
638 /*
639 ** This function is called just after the needSync flag has been cleared
640 ** from all pages managed by pPager (usually because the journal file
641 ** has just been synced). It updates the pPager->lru.pFirstSynced variable
642 ** and, if memory-management is enabled, the sqlite3LruPageList.pFirstSynced
643 ** variable also.
644 */
645 static void lruListSetFirstSynced(Pager *pPager){
646   pPager->lru.pFirstSynced = pPager->lru.pFirst;
647 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
648   if( !pPager->memDb ){
649     PgHdr *p;
650     sqlite3_mutex_enter(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
651     for(p=sqlite3LruPageList.pFirst; p && p->needSync; p=p->gfree.pNext);
652     assert(p==pPager->lru.pFirstSynced || p==sqlite3LruPageList.pFirstSynced);
653     sqlite3LruPageList.pFirstSynced = p;
654     sqlite3_mutex_leave(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
655   }
656 #endif
657 }
658 
659 /*
660 ** Return true if page *pPg has already been written to the statement
661 ** journal (or statement snapshot has been created, if *pPg is part
662 ** of an in-memory database).
663 */
664 static int pageInStatement(PgHdr *pPg){
665   Pager *pPager = pPg->pPager;
666   if( MEMDB ){
667     return PGHDR_TO_HIST(pPg, pPager)->inStmt;
668   }else{
669     return sqlite3BitvecTest(pPager->pInStmt, pPg->pgno);
670   }
671 }
672 
673 /*
674 ** Change the size of the pager hash table to N.  N must be a power
675 ** of two.
676 */
677 static void pager_resize_hash_table(Pager *pPager, int N){
678   PgHdr **aHash, *pPg;
679   assert( N>0 && (N&(N-1))==0 );
680 #ifdef SQLITE_MALLOC_SOFT_LIMIT
681   if( N*sizeof(aHash[0])>SQLITE_MALLOC_SOFT_LIMIT ){
682     N = SQLITE_MALLOC_SOFT_LIMIT/sizeof(aHash[0]);
683   }
684   if( N==pPager->nHash ) return;
685 #endif
686   pagerLeave(pPager);
687   sqlite3FaultBenign(SQLITE_FAULTINJECTOR_MALLOC, pPager->aHash!=0);
688   aHash = sqlite3MallocZero( sizeof(aHash[0])*N );
689   sqlite3FaultBenign(SQLITE_FAULTINJECTOR_MALLOC, 0);
690   pagerEnter(pPager);
691   if( aHash==0 ){
692     /* Failure to rehash is not an error.  It is only a performance hit. */
693     return;
694   }
695   sqlite3_free(pPager->aHash);
696   pPager->nHash = N;
697   pPager->aHash = aHash;
698   for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
699     int h;
700     if( pPg->pgno==0 ){
701       assert( pPg->pNextHash==0 && pPg->pPrevHash==0 );
702       continue;
703     }
704     h = pPg->pgno & (N-1);
705     pPg->pNextHash = aHash[h];
706     if( aHash[h] ){
707       aHash[h]->pPrevHash = pPg;
708     }
709     aHash[h] = pPg;
710     pPg->pPrevHash = 0;
711   }
712 }
713 
714 /*
715 ** Read a 32-bit integer from the given file descriptor.  Store the integer
716 ** that is read in *pRes.  Return SQLITE_OK if everything worked, or an
717 ** error code is something goes wrong.
718 **
719 ** All values are stored on disk as big-endian.
720 */
721 static int read32bits(sqlite3_file *fd, i64 offset, u32 *pRes){
722   unsigned char ac[4];
723   int rc = sqlite3OsRead(fd, ac, sizeof(ac), offset);
724   if( rc==SQLITE_OK ){
725     *pRes = sqlite3Get4byte(ac);
726   }
727   return rc;
728 }
729 
/*
** Write the 32-bit integer B into the buffer at address A in big-endian
** byte order.
**
** Both arguments are parenthesized so that an expression argument such as
** "p+1" is evaluated in full before the cast to u8* is applied.
*/
#define put32bits(A,B)  sqlite3Put4byte((u8*)(A),(B))
734 
735 /*
736 ** Write a 32-bit integer into the given file descriptor.  Return SQLITE_OK
737 ** on success or an error code is something goes wrong.
738 */
739 static int write32bits(sqlite3_file *fd, i64 offset, u32 val){
740   char ac[4];
741   put32bits(ac, val);
742   return sqlite3OsWrite(fd, ac, 4, offset);
743 }
744 
745 /*
746 ** If file pFd is open, call sqlite3OsUnlock() on it.
747 */
748 static int osUnlock(sqlite3_file *pFd, int eLock){
749   if( !pFd->pMethods ){
750     return SQLITE_OK;
751   }
752   return sqlite3OsUnlock(pFd, eLock);
753 }
754 
/*
** Decide whether the atomic-write optimization can be used with this
** pager.  When the database file is open, the optimization is usable if:
**
**  (a) the value returned by OsDeviceCharacteristics() has either the
**      SQLITE_IOCAP_ATOMIC bit or the bit matching the page size set
**      (a database page may be written atomically), and
**  (b) the value returned by OsSectorSize() is less than or equal to
**      the page size.
**
** When the database file is not open, the device is not consulted and the
** optimization is reported as usable.
**
** Returns 0 if the optimization cannot be used.  Otherwise returns the
** size of a journal file that contains rollback data for exactly one page.
*/
#ifdef SQLITE_ENABLE_ATOMIC_WRITE
static int jrnlBufferSize(Pager *pPager){
  sqlite3_file *fd = pPager->fd;

  /* The (nPage>>8) trick below relies on the IOCAP_ATOMICnnn flags being
  ** exactly the corresponding size shifted right by eight bits. */
  assert(SQLITE_IOCAP_ATOMIC512==(512>>8));
  assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8));

  if( fd->pMethods ){
    int dc = sqlite3OsDeviceCharacteristics(fd);   /* Device characteristics */
    int nSector = sqlite3OsSectorSize(fd);         /* Sector size */
    int nPage = pPager->pageSize;                  /* Page size */
    int isAtomic = (dc & (SQLITE_IOCAP_ATOMIC|(nPage>>8)))!=0;
    if( !isAtomic || nSector>nPage ){
      return 0;
    }
  }
  return JOURNAL_HDR_SZ(pPager) + JOURNAL_PG_SZ(pPager);
}
#endif
790 
791 /*
792 ** This function should be called when an error occurs within the pager
793 ** code. The first argument is a pointer to the pager structure, the
794 ** second the error-code about to be returned by a pager API function.
795 ** The value returned is a copy of the second argument to this function.
796 **
797 ** If the second argument is SQLITE_IOERR, SQLITE_CORRUPT, or SQLITE_FULL
798 ** the error becomes persistent. Until the persisten error is cleared,
799 ** subsequent API calls on this Pager will immediately return the same
800 ** error code.
801 **
802 ** A persistent error indicates that the contents of the pager-cache
803 ** cannot be trusted. This state can be cleared by completely discarding
804 ** the contents of the pager-cache. If a transaction was active when
805 ** the persistent error occured, then the rollback journal may need
806 ** to be replayed.
807 */
808 static void pager_unlock(Pager *pPager);
809 static int pager_error(Pager *pPager, int rc){
810   int rc2 = rc & 0xff;
811   assert(
812        pPager->errCode==SQLITE_FULL ||
813        pPager->errCode==SQLITE_OK ||
814        (pPager->errCode & 0xff)==SQLITE_IOERR
815   );
816   if(
817     rc2==SQLITE_FULL ||
818     rc2==SQLITE_IOERR ||
819     rc2==SQLITE_CORRUPT
820   ){
821     pPager->errCode = rc;
822     if( pPager->state==PAGER_UNLOCK && pPager->nRef==0 ){
823       /* If the pager is already unlocked, call pager_unlock() now to
824       ** clear the error state and ensure that the pager-cache is
825       ** completely empty.
826       */
827       pager_unlock(pPager);
828     }
829   }
830   return rc;
831 }
832 
833 /*
834 ** If SQLITE_CHECK_PAGES is defined then we do some sanity checking
835 ** on the cache using a hash function.  This is used for testing
836 ** and debugging only.
837 */
838 #ifdef SQLITE_CHECK_PAGES
839 /*
840 ** Return a 32-bit hash of the page data for pPage.
841 */
842 static u32 pager_datahash(int nByte, unsigned char *pData){
843   u32 hash = 0;
844   int i;
845   for(i=0; i<nByte; i++){
846     hash = (hash*1039) + pData[i];
847   }
848   return hash;
849 }
850 static u32 pager_pagehash(PgHdr *pPage){
851   return pager_datahash(pPage->pPager->pageSize,
852                         (unsigned char *)PGHDR_TO_DATA(pPage));
853 }
854 
/*
** The CHECK_PAGE macro takes a PgHdr* as an argument. If SQLITE_CHECK_PAGES
** is defined, and NDEBUG is not defined, an assert() statement checks
** that the page is either dirty or still matches the calculated page-hash.
**
** The check is skipped (the assert passes) when no hash has been recorded
** for the page (pageHash==0), when the pager has an error code set, or
** when the pager is an in-memory database.
*/
#define CHECK_PAGE(x) checkPage(x)
static void checkPage(PgHdr *pPg){
  Pager *pPager = pPg->pPager;
  assert( !pPg->pageHash || pPager->errCode || MEMDB || pPg->dirty ||
      pPg->pageHash==pager_pagehash(pPg) );
}
866 
867 #else
/* SQLITE_CHECK_PAGES is not defined: the hash helpers evaluate to the
** constant 0 and CHECK_PAGE() expands to nothing. */
#define pager_datahash(X,Y)  0
#define pager_pagehash(X)  0
#define CHECK_PAGE(x)
871 #endif
872 
873 /*
874 ** When this is called the journal file for pager pPager must be open.
875 ** The master journal file name is read from the end of the file and
876 ** written into memory supplied by the caller.
877 **
878 ** zMaster must point to a buffer of at least nMaster bytes allocated by
879 ** the caller. This should be sqlite3_vfs.mxPathname+1 (to ensure there is
880 ** enough space to write the master journal name). If the master journal
881 ** name in the journal is longer than nMaster bytes (including a
882 ** nul-terminator), then this is handled as if no master journal name
883 ** were present in the journal.
884 **
885 ** If no master journal file name is present zMaster[0] is set to 0 and
886 ** SQLITE_OK returned.
887 */
888 static int readMasterJournal(sqlite3_file *pJrnl, char *zMaster, int nMaster){
889   int rc;
890   u32 len;
891   i64 szJ;
892   u32 cksum;
893   int i;
894   unsigned char aMagic[8]; /* A buffer to hold the magic header */
895 
896   zMaster[0] = '\0';
897 
898   rc = sqlite3OsFileSize(pJrnl, &szJ);
899   if( rc!=SQLITE_OK || szJ<16 ) return rc;
900 
901   rc = read32bits(pJrnl, szJ-16, &len);
902   if( rc!=SQLITE_OK ) return rc;
903 
904   if( len>=nMaster ){
905     return SQLITE_OK;
906   }
907 
908   rc = read32bits(pJrnl, szJ-12, &cksum);
909   if( rc!=SQLITE_OK ) return rc;
910 
911   rc = sqlite3OsRead(pJrnl, aMagic, 8, szJ-8);
912   if( rc!=SQLITE_OK || memcmp(aMagic, aJournalMagic, 8) ) return rc;
913 
914   rc = sqlite3OsRead(pJrnl, zMaster, len, szJ-16-len);
915   if( rc!=SQLITE_OK ){
916     return rc;
917   }
918   zMaster[len] = '\0';
919 
920   /* See if the checksum matches the master journal name */
921   for(i=0; i<len; i++){
922     cksum -= zMaster[i];
923    }
924   if( cksum ){
925     /* If the checksum doesn't add up, then one or more of the disk sectors
926     ** containing the master journal filename is corrupted. This means
927     ** definitely roll back, so just return SQLITE_OK and report a (nul)
928     ** master-journal filename.
929     */
930     zMaster[0] = '\0';
931   }
932 
933   return SQLITE_OK;
934 }
935 
936 /*
937 ** Seek the journal file descriptor to the next sector boundary where a
938 ** journal header may be read or written. Pager.journalOff is updated with
939 ** the new seek offset.
940 **
941 ** i.e for a sector size of 512:
942 **
943 ** Input Offset              Output Offset
944 ** ---------------------------------------
945 ** 0                         0
946 ** 512                       512
947 ** 100                       512
948 ** 2000                      2048
949 **
950 */
951 static void seekJournalHdr(Pager *pPager){
952   i64 offset = 0;
953   i64 c = pPager->journalOff;
954   if( c ){
955     offset = ((c-1)/JOURNAL_HDR_SZ(pPager) + 1) * JOURNAL_HDR_SZ(pPager);
956   }
957   assert( offset%JOURNAL_HDR_SZ(pPager)==0 );
958   assert( offset>=c );
959   assert( (offset-c)<JOURNAL_HDR_SZ(pPager) );
960   pPager->journalOff = offset;
961 }
962 
963 /*
964 ** The journal file must be open when this routine is called. A journal
965 ** header (JOURNAL_HDR_SZ bytes) is written into the journal file at the
966 ** current location.
967 **
968 ** The format for the journal header is as follows:
969 ** - 8 bytes: Magic identifying journal format.
970 ** - 4 bytes: Number of records in journal, or -1 no-sync mode is on.
971 ** - 4 bytes: Random number used for page hash.
972 ** - 4 bytes: Initial database page count.
973 ** - 4 bytes: Sector size used by the process that wrote this journal.
974 ** - 4 bytes: Database page size.
975 **
976 ** Followed by (JOURNAL_HDR_SZ - 28) bytes of unused space.
977 */
978 static int writeJournalHdr(Pager *pPager){
979   char zHeader[sizeof(aJournalMagic)+20];
980   int rc;
981 
982   if( pPager->stmtHdrOff==0 ){
983     pPager->stmtHdrOff = pPager->journalOff;
984   }
985 
986   seekJournalHdr(pPager);
987   pPager->journalHdr = pPager->journalOff;
988 
989   memcpy(zHeader, aJournalMagic, sizeof(aJournalMagic));
990 
991   /*
992   ** Write the nRec Field - the number of page records that follow this
993   ** journal header. Normally, zero is written to this value at this time.
994   ** After the records are added to the journal (and the journal synced,
995   ** if in full-sync mode), the zero is overwritten with the true number
996   ** of records (see syncJournal()).
997   **
998   ** A faster alternative is to write 0xFFFFFFFF to the nRec field. When
999   ** reading the journal this value tells SQLite to assume that the
1000   ** rest of the journal file contains valid page records. This assumption
1001   ** is dangerous, as if a failure occured whilst writing to the journal
1002   ** file it may contain some garbage data. There are two scenarios
1003   ** where this risk can be ignored:
1004   **
1005   **   * When the pager is in no-sync mode. Corruption can follow a
1006   **     power failure in this case anyway.
1007   **
1008   **   * When the SQLITE_IOCAP_SAFE_APPEND flag is set. This guarantees
1009   **     that garbage data is never appended to the journal file.
1010   */
1011   assert(pPager->fd->pMethods||pPager->noSync);
1012   if( (pPager->noSync)
1013    || (sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND)
1014   ){
1015     put32bits(&zHeader[sizeof(aJournalMagic)], 0xffffffff);
1016   }else{
1017     put32bits(&zHeader[sizeof(aJournalMagic)], 0);
1018   }
1019 
1020   /* The random check-hash initialiser */
1021   sqlite3_randomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
1022   put32bits(&zHeader[sizeof(aJournalMagic)+4], pPager->cksumInit);
1023   /* The initial database size */
1024   put32bits(&zHeader[sizeof(aJournalMagic)+8], pPager->dbSize);
1025   /* The assumed sector size for this process */
1026   put32bits(&zHeader[sizeof(aJournalMagic)+12], pPager->sectorSize);
1027   if( pPager->journalHdr==0 ){
1028     /* The page size */
1029     put32bits(&zHeader[sizeof(aJournalMagic)+16], pPager->pageSize);
1030   }
1031   IOTRACE(("JHDR %p %lld %d\n", pPager, pPager->journalHdr, sizeof(zHeader)))
1032   rc = sqlite3OsWrite(pPager->jfd, zHeader, sizeof(zHeader),pPager->journalOff);
1033   pPager->journalOff += JOURNAL_HDR_SZ(pPager);
1034 
1035   /* The journal header has been written successfully. Seek the journal
1036   ** file descriptor to the end of the journal header sector.
1037   */
1038   if( rc==SQLITE_OK ){
1039     IOTRACE(("JTAIL %p %lld\n", pPager, pPager->journalOff-1))
1040     rc = sqlite3OsWrite(pPager->jfd, "\000", 1, pPager->journalOff-1);
1041   }
1042   return rc;
1043 }
1044 
1045 /*
1046 ** The journal file must be open when this is called. A journal header file
1047 ** (JOURNAL_HDR_SZ bytes) is read from the current location in the journal
1048 ** file. See comments above function writeJournalHdr() for a description of
1049 ** the journal header format.
1050 **
1051 ** If the header is read successfully, *nRec is set to the number of
1052 ** page records following this header and *dbSize is set to the size of the
1053 ** database before the transaction began, in pages. Also, pPager->cksumInit
1054 ** is set to the value read from the journal header. SQLITE_OK is returned
1055 ** in this case.
1056 **
1057 ** If the journal header file appears to be corrupted, SQLITE_DONE is
1058 ** returned and *nRec and *dbSize are not set.  If JOURNAL_HDR_SZ bytes
1059 ** cannot be read from the journal file an error code is returned.
1060 */
1061 static int readJournalHdr(
1062   Pager *pPager,
1063   i64 journalSize,
1064   u32 *pNRec,
1065   u32 *pDbSize
1066 ){
1067   int rc;
1068   unsigned char aMagic[8]; /* A buffer to hold the magic header */
1069   i64 jrnlOff;
1070   int iPageSize;
1071 
1072   seekJournalHdr(pPager);
1073   if( pPager->journalOff+JOURNAL_HDR_SZ(pPager) > journalSize ){
1074     return SQLITE_DONE;
1075   }
1076   jrnlOff = pPager->journalOff;
1077 
1078   rc = sqlite3OsRead(pPager->jfd, aMagic, sizeof(aMagic), jrnlOff);
1079   if( rc ) return rc;
1080   jrnlOff += sizeof(aMagic);
1081 
1082   if( memcmp(aMagic, aJournalMagic, sizeof(aMagic))!=0 ){
1083     return SQLITE_DONE;
1084   }
1085 
1086   rc = read32bits(pPager->jfd, jrnlOff, pNRec);
1087   if( rc ) return rc;
1088 
1089   rc = read32bits(pPager->jfd, jrnlOff+4, &pPager->cksumInit);
1090   if( rc ) return rc;
1091 
1092   rc = read32bits(pPager->jfd, jrnlOff+8, pDbSize);
1093   if( rc ) return rc;
1094 
1095   rc = read32bits(pPager->jfd, jrnlOff+16, (u32 *)&iPageSize);
1096   if( rc==SQLITE_OK
1097    && iPageSize>=512
1098    && iPageSize<=SQLITE_MAX_PAGE_SIZE
1099    && ((iPageSize-1)&iPageSize)==0
1100   ){
1101     u16 pagesize = iPageSize;
1102     rc = sqlite3PagerSetPagesize(pPager, &pagesize);
1103   }
1104   if( rc ) return rc;
1105 
1106   /* Update the assumed sector-size to match the value used by
1107   ** the process that created this journal. If this journal was
1108   ** created by a process other than this one, then this routine
1109   ** is being called from within pager_playback(). The local value
1110   ** of Pager.sectorSize is restored at the end of that routine.
1111   */
1112   rc = read32bits(pPager->jfd, jrnlOff+12, (u32 *)&pPager->sectorSize);
1113   if( rc ) return rc;
1114 
1115   pPager->journalOff += JOURNAL_HDR_SZ(pPager);
1116   return SQLITE_OK;
1117 }
1118 
1119 
1120 /*
1121 ** Write the supplied master journal name into the journal file for pager
1122 ** pPager at the current location. The master journal name must be the last
1123 ** thing written to a journal file. If the pager is in full-sync mode, the
1124 ** journal file descriptor is advanced to the next sector boundary before
1125 ** anything is written. The format is:
1126 **
1127 ** + 4 bytes: PAGER_MJ_PGNO.
1128 ** + N bytes: length of master journal name.
1129 ** + 4 bytes: N
1130 ** + 4 bytes: Master journal name checksum.
1131 ** + 8 bytes: aJournalMagic[].
1132 **
1133 ** The master journal page checksum is the sum of the bytes in the master
1134 ** journal name.
1135 **
1136 ** If zMaster is a NULL pointer (occurs for a single database transaction),
1137 ** this call is a no-op.
1138 */
1139 static int writeMasterJournal(Pager *pPager, const char *zMaster){
1140   int rc;
1141   int len;
1142   int i;
1143   i64 jrnlOff;
1144   u32 cksum = 0;
1145   char zBuf[sizeof(aJournalMagic)+2*4];
1146 
1147   if( !zMaster || pPager->setMaster) return SQLITE_OK;
1148   pPager->setMaster = 1;
1149 
1150   len = strlen(zMaster);
1151   for(i=0; i<len; i++){
1152     cksum += zMaster[i];
1153   }
1154 
1155   /* If in full-sync mode, advance to the next disk sector before writing
1156   ** the master journal name. This is in case the previous page written to
1157   ** the journal has already been synced.
1158   */
1159   if( pPager->fullSync ){
1160     seekJournalHdr(pPager);
1161   }
1162   jrnlOff = pPager->journalOff;
1163   pPager->journalOff += (len+20);
1164 
1165   rc = write32bits(pPager->jfd, jrnlOff, PAGER_MJ_PGNO(pPager));
1166   if( rc!=SQLITE_OK ) return rc;
1167   jrnlOff += 4;
1168 
1169   rc = sqlite3OsWrite(pPager->jfd, zMaster, len, jrnlOff);
1170   if( rc!=SQLITE_OK ) return rc;
1171   jrnlOff += len;
1172 
1173   put32bits(zBuf, len);
1174   put32bits(&zBuf[4], cksum);
1175   memcpy(&zBuf[8], aJournalMagic, sizeof(aJournalMagic));
1176   rc = sqlite3OsWrite(pPager->jfd, zBuf, 8+sizeof(aJournalMagic), jrnlOff);
1177   pPager->needSync = !pPager->noSync;
1178   return rc;
1179 }
1180 
1181 /*
1182 ** Add or remove a page from the list of all pages that are in the
1183 ** statement journal.
1184 **
1185 ** The Pager keeps a separate list of pages that are currently in
1186 ** the statement journal.  This helps the sqlite3PagerStmtCommit()
1187 ** routine run MUCH faster for the common case where there are many
1188 ** pages in memory but only a few are in the statement journal.
1189 */
1190 static void page_add_to_stmt_list(PgHdr *pPg){
1191   Pager *pPager = pPg->pPager;
1192   PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
1193   assert( MEMDB );
1194   if( !pHist->inStmt ){
1195     assert( pHist->pPrevStmt==0 && pHist->pNextStmt==0 );
1196     if( pPager->pStmt ){
1197       PGHDR_TO_HIST(pPager->pStmt, pPager)->pPrevStmt = pPg;
1198     }
1199     pHist->pNextStmt = pPager->pStmt;
1200     pPager->pStmt = pPg;
1201     pHist->inStmt = 1;
1202   }
1203 }
1204 
1205 /*
1206 ** Find a page in the hash table given its page number.  Return
1207 ** a pointer to the page or NULL if not found.
1208 */
1209 static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
1210   PgHdr *p;
1211   if( pPager->aHash==0 ) return 0;
1212   p = pPager->aHash[pgno & (pPager->nHash-1)];
1213   while( p && p->pgno!=pgno ){
1214     p = p->pNextHash;
1215   }
1216   return p;
1217 }
1218 
1219 /*
1220 ** Clear the in-memory cache.  This routine
1221 ** sets the state of the pager back to what it was when it was first
1222 ** opened.  Any outstanding pages are invalidated and subsequent attempts
1223 ** to access those pages will likely result in a coredump.
1224 */
1225 static void pager_reset(Pager *pPager){
1226   PgHdr *pPg, *pNext;
1227   if( pPager->errCode ) return;
1228   for(pPg=pPager->pAll; pPg; pPg=pNext){
1229     IOTRACE(("PGFREE %p %d\n", pPager, pPg->pgno));
1230     PAGER_INCR(sqlite3_pager_pgfree_count);
1231     pNext = pPg->pNextAll;
1232     lruListRemove(pPg);
1233     sqlite3_free(pPg->pData);
1234     sqlite3_free(pPg);
1235   }
1236   assert(pPager->lru.pFirst==0);
1237   assert(pPager->lru.pFirstSynced==0);
1238   assert(pPager->lru.pLast==0);
1239   pPager->pStmt = 0;
1240   pPager->pAll = 0;
1241   pPager->pDirty = 0;
1242   pPager->nHash = 0;
1243   sqlite3_free(pPager->aHash);
1244   pPager->nPage = 0;
1245   pPager->aHash = 0;
1246   pPager->nRef = 0;
1247 }
1248 
1249 /*
1250 ** Unlock the database file.
1251 **
1252 ** If the pager is currently in error state, discard the contents of
1253 ** the cache and reset the Pager structure internal state. If there is
1254 ** an open journal-file, then the next time a shared-lock is obtained
1255 ** on the pager file (by this or any other process), it will be
1256 ** treated as a hot-journal and rolled back.
1257 */
1258 static void pager_unlock(Pager *pPager){
1259   if( !pPager->exclusiveMode ){
1260     if( !MEMDB ){
1261       int rc = osUnlock(pPager->fd, NO_LOCK);
1262       if( rc ) pPager->errCode = rc;
1263       pPager->dbSize = -1;
1264       IOTRACE(("UNLOCK %p\n", pPager))
1265 
1266       /* If Pager.errCode is set, the contents of the pager cache cannot be
1267       ** trusted. Now that the pager file is unlocked, the contents of the
1268       ** cache can be discarded and the error code safely cleared.
1269       */
1270       if( pPager->errCode ){
1271         if( rc==SQLITE_OK ) pPager->errCode = SQLITE_OK;
1272         pager_reset(pPager);
1273         if( pPager->stmtOpen ){
1274           sqlite3OsClose(pPager->stfd);
1275           sqlite3BitvecDestroy(pPager->pInStmt);
1276           pPager->pInStmt = 0;
1277         }
1278         if( pPager->journalOpen ){
1279           sqlite3OsClose(pPager->jfd);
1280           pPager->journalOpen = 0;
1281           sqlite3BitvecDestroy(pPager->pInJournal);
1282           pPager->pInJournal = 0;
1283         }
1284         pPager->stmtOpen = 0;
1285         pPager->stmtInUse = 0;
1286         pPager->journalOff = 0;
1287         pPager->journalStarted = 0;
1288         pPager->stmtAutoopen = 0;
1289         pPager->origDbSize = 0;
1290       }
1291     }
1292 
1293     if( !MEMDB || pPager->errCode==SQLITE_OK ){
1294       pPager->state = PAGER_UNLOCK;
1295       pPager->changeCountDone = 0;
1296     }
1297   }
1298 }
1299 
1300 /*
1301 ** Execute a rollback if a transaction is active and unlock the
1302 ** database file. If the pager has already entered the error state,
1303 ** do not attempt the rollback.
1304 */
1305 static void pagerUnlockAndRollback(Pager *p){
1306   assert( p->state>=PAGER_RESERVED || p->journalOpen==0 );
1307   if( p->errCode==SQLITE_OK && p->state>=PAGER_RESERVED ){
1308     sqlite3PagerRollback(p);
1309   }
1310   pager_unlock(p);
1311   assert( p->errCode || !p->journalOpen || (p->exclusiveMode&&!p->journalOff) );
1312   assert( p->errCode || !p->stmtOpen || p->exclusiveMode );
1313 }
1314 
1315 /*
1316 ** This routine ends a transaction.  A transaction is ended by either
1317 ** a COMMIT or a ROLLBACK.
1318 **
1319 ** When this routine is called, the pager has the journal file open and
1320 ** a RESERVED or EXCLUSIVE lock on the database.  This routine will release
1321 ** the database lock and acquires a SHARED lock in its place if that is
1322 ** the appropriate thing to do.  Release locks usually is appropriate,
1323 ** unless we are in exclusive access mode or unless this is a
1324 ** COMMIT AND BEGIN or ROLLBACK AND BEGIN operation.
1325 **
1326 ** The journal file is either deleted or truncated.
1327 **
1328 ** TODO: Consider keeping the journal file open for temporary databases.
1329 ** This might give a performance improvement on windows where opening
1330 ** a file is an expensive operation.
1331 */
1332 static int pager_end_transaction(Pager *pPager){
1333   PgHdr *pPg;
1334   int rc = SQLITE_OK;
1335   int rc2 = SQLITE_OK;
1336   assert( !MEMDB );
1337   if( pPager->state<PAGER_RESERVED ){
1338     return SQLITE_OK;
1339   }
1340   sqlite3PagerStmtCommit(pPager);
1341   if( pPager->stmtOpen && !pPager->exclusiveMode ){
1342     sqlite3OsClose(pPager->stfd);
1343     pPager->stmtOpen = 0;
1344   }
1345   if( pPager->journalOpen ){
1346     if( pPager->exclusiveMode
1347           && (rc = sqlite3OsTruncate(pPager->jfd, 0))==SQLITE_OK ){;
1348       pPager->journalOff = 0;
1349       pPager->journalStarted = 0;
1350     }else{
1351       sqlite3OsClose(pPager->jfd);
1352       pPager->journalOpen = 0;
1353       if( rc==SQLITE_OK ){
1354         rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0);
1355       }
1356     }
1357     sqlite3BitvecDestroy(pPager->pInJournal);
1358     pPager->pInJournal = 0;
1359     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
1360       pPg->inJournal = 0;
1361       pPg->dirty = 0;
1362       pPg->needSync = 0;
1363       pPg->alwaysRollback = 0;
1364 #ifdef SQLITE_CHECK_PAGES
1365       pPg->pageHash = pager_pagehash(pPg);
1366 #endif
1367     }
1368     pPager->pDirty = 0;
1369     pPager->dirtyCache = 0;
1370     pPager->nRec = 0;
1371   }else{
1372     assert( pPager->pInJournal==0 );
1373     assert( pPager->dirtyCache==0 || pPager->useJournal==0 );
1374   }
1375 
1376   if( !pPager->exclusiveMode ){
1377     rc2 = osUnlock(pPager->fd, SHARED_LOCK);
1378     pPager->state = PAGER_SHARED;
1379   }else if( pPager->state==PAGER_SYNCED ){
1380     pPager->state = PAGER_EXCLUSIVE;
1381   }
1382   pPager->origDbSize = 0;
1383   pPager->setMaster = 0;
1384   pPager->needSync = 0;
1385   lruListSetFirstSynced(pPager);
1386   pPager->dbSize = -1;
1387 
1388   return (rc==SQLITE_OK?rc2:rc);
1389 }
1390 
1391 /*
1392 ** Compute and return a checksum for the page of data.
1393 **
1394 ** This is not a real checksum.  It is really just the sum of the
1395 ** random initial value and the page number.  We experimented with
1396 ** a checksum of the entire data, but that was found to be too slow.
1397 **
1398 ** Note that the page number is stored at the beginning of data and
1399 ** the checksum is stored at the end.  This is important.  If journal
1400 ** corruption occurs due to a power failure, the most likely scenario
1401 ** is that one end or the other of the record will be changed.  It is
1402 ** much less likely that the two ends of the journal record will be
1403 ** correct and the middle be corrupt.  Thus, this "checksum" scheme,
1404 ** though fast and simple, catches the mostly likely kind of corruption.
1405 **
1406 ** FIX ME:  Consider adding every 200th (or so) byte of the data to the
1407 ** checksum.  That way if a single page spans 3 or more disk sectors and
1408 ** only the middle sector is corrupt, we will still have a reasonable
1409 ** chance of failing the checksum and thus detecting the problem.
1410 */
1411 static u32 pager_cksum(Pager *pPager, const u8 *aData){
1412   u32 cksum = pPager->cksumInit;
1413   int i = pPager->pageSize-200;
1414   while( i>0 ){
1415     cksum += aData[i];
1416     i -= 200;
1417   }
1418   return cksum;
1419 }
1420 
1421 /* Forward declaration */
1422 static void makeClean(PgHdr*);
1423 
1424 /*
1425 ** Read a single page from the journal file opened on file descriptor
1426 ** jfd.  Playback this one page.
1427 **
1428 ** If useCksum==0 it means this journal does not use checksums.  Checksums
1429 ** are not used in statement journals because statement journals do not
1430 ** need to survive power failures.
1431 */
1432 static int pager_playback_one_page(
1433   Pager *pPager,
1434   sqlite3_file *jfd,
1435   i64 offset,
1436   int useCksum
1437 ){
1438   int rc;
1439   PgHdr *pPg;                   /* An existing page in the cache */
1440   Pgno pgno;                    /* The page number of a page in journal */
1441   u32 cksum;                    /* Checksum used for sanity checking */
1442   u8 *aData = (u8 *)pPager->pTmpSpace;   /* Temp storage for a page */
1443 
1444   /* useCksum should be true for the main journal and false for
1445   ** statement journals.  Verify that this is always the case
1446   */
1447   assert( jfd == (useCksum ? pPager->jfd : pPager->stfd) );
1448   assert( aData );
1449 
1450   rc = read32bits(jfd, offset, &pgno);
1451   if( rc!=SQLITE_OK ) return rc;
1452   rc = sqlite3OsRead(jfd, aData, pPager->pageSize, offset+4);
1453   if( rc!=SQLITE_OK ) return rc;
1454   pPager->journalOff += pPager->pageSize + 4;
1455 
1456   /* Sanity checking on the page.  This is more important that I originally
1457   ** thought.  If a power failure occurs while the journal is being written,
1458   ** it could cause invalid data to be written into the journal.  We need to
1459   ** detect this invalid data (with high probability) and ignore it.
1460   */
1461   if( pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){
1462     return SQLITE_DONE;
1463   }
1464   if( pgno>(unsigned)pPager->dbSize ){
1465     return SQLITE_OK;
1466   }
1467   if( useCksum ){
1468     rc = read32bits(jfd, offset+pPager->pageSize+4, &cksum);
1469     if( rc ) return rc;
1470     pPager->journalOff += 4;
1471     if( pager_cksum(pPager, aData)!=cksum ){
1472       return SQLITE_DONE;
1473     }
1474   }
1475 
1476   assert( pPager->state==PAGER_RESERVED || pPager->state>=PAGER_EXCLUSIVE );
1477 
1478   /* If the pager is in RESERVED state, then there must be a copy of this
1479   ** page in the pager cache. In this case just update the pager cache,
1480   ** not the database file. The page is left marked dirty in this case.
1481   **
1482   ** An exception to the above rule: If the database is in no-sync mode
1483   ** and a page is moved during an incremental vacuum then the page may
1484   ** not be in the pager cache. Later: if a malloc() or IO error occurs
1485   ** during a Movepage() call, then the page may not be in the cache
1486   ** either. So the condition described in the above paragraph is not
1487   ** assert()able.
1488   **
1489   ** If in EXCLUSIVE state, then we update the pager cache if it exists
1490   ** and the main file. The page is then marked not dirty.
1491   **
1492   ** Ticket #1171:  The statement journal might contain page content that is
1493   ** different from the page content at the start of the transaction.
1494   ** This occurs when a page is changed prior to the start of a statement
1495   ** then changed again within the statement.  When rolling back such a
1496   ** statement we must not write to the original database unless we know
1497   ** for certain that original page contents are synced into the main rollback
1498   ** journal.  Otherwise, a power loss might leave modified data in the
1499   ** database file without an entry in the rollback journal that can
1500   ** restore the database to its original form.  Two conditions must be
1501   ** met before writing to the database files. (1) the database must be
1502   ** locked.  (2) we know that the original page content is fully synced
1503   ** in the main journal either because the page is not in cache or else
1504   ** the page is marked as needSync==0.
1505   */
1506   pPg = pager_lookup(pPager, pgno);
1507   PAGERTRACE4("PLAYBACK %d page %d hash(%08x)\n",
1508                PAGERID(pPager), pgno, pager_datahash(pPager->pageSize, aData));
1509   if( pPager->state>=PAGER_EXCLUSIVE && (pPg==0 || pPg->needSync==0) ){
1510     i64 offset = (pgno-1)*(i64)pPager->pageSize;
1511     rc = sqlite3OsWrite(pPager->fd, aData, pPager->pageSize, offset);
1512     if( pPg ){
1513       makeClean(pPg);
1514     }
1515   }
1516   if( pPg ){
1517     /* No page should ever be explicitly rolled back that is in use, except
1518     ** for page 1 which is held in use in order to keep the lock on the
1519     ** database active. However such a page may be rolled back as a result
1520     ** of an internal error resulting in an automatic call to
1521     ** sqlite3PagerRollback().
1522     */
1523     void *pData;
1524     /* assert( pPg->nRef==0 || pPg->pgno==1 ); */
1525     pData = PGHDR_TO_DATA(pPg);
1526     memcpy(pData, aData, pPager->pageSize);
1527     if( pPager->xReiniter ){
1528       pPager->xReiniter(pPg, pPager->pageSize);
1529     }
1530 #ifdef SQLITE_CHECK_PAGES
1531     pPg->pageHash = pager_pagehash(pPg);
1532 #endif
1533     /* If this was page 1, then restore the value of Pager.dbFileVers.
1534     ** Do this before any decoding. */
1535     if( pgno==1 ){
1536       memcpy(&pPager->dbFileVers, &((u8*)pData)[24],sizeof(pPager->dbFileVers));
1537     }
1538 
1539     /* Decode the page just read from disk */
1540     CODEC1(pPager, pData, pPg->pgno, 3);
1541   }
1542   return rc;
1543 }
1544 
1545 /*
1546 ** Parameter zMaster is the name of a master journal file. A single journal
1547 ** file that referred to the master journal file has just been rolled back.
1548 ** This routine checks if it is possible to delete the master journal file,
1549 ** and does so if it is.
1550 **
1551 ** Argument zMaster may point to Pager.pTmpSpace. So that buffer is not
1552 ** available for use within this function.
1553 **
1554 **
1555 ** The master journal file contains the names of all child journals.
1556 ** To tell if a master journal can be deleted, check to each of the
1557 ** children.  If all children are either missing or do not refer to
1558 ** a different master journal, then this master journal can be deleted.
1559 */
1560 static int pager_delmaster(Pager *pPager, const char *zMaster){
1561   sqlite3_vfs *pVfs = pPager->pVfs;
1562   int rc;
1563   int master_open = 0;
1564   sqlite3_file *pMaster;
1565   sqlite3_file *pJournal;
1566   char *zMasterJournal = 0; /* Contents of master journal file */
1567   i64 nMasterJournal;       /* Size of master journal file */
1568 
1569   /* Open the master journal file exclusively in case some other process
1570   ** is running this routine also. Not that it makes too much difference.
1571   */
1572   pMaster = (sqlite3_file *)sqlite3_malloc(pVfs->szOsFile * 2);
1573   pJournal = (sqlite3_file *)(((u8 *)pMaster) + pVfs->szOsFile);
1574   if( !pMaster ){
1575     rc = SQLITE_NOMEM;
1576   }else{
1577     int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MASTER_JOURNAL);
1578     rc = sqlite3OsOpen(pVfs, zMaster, pMaster, flags, 0);
1579   }
1580   if( rc!=SQLITE_OK ) goto delmaster_out;
1581   master_open = 1;
1582 
1583   rc = sqlite3OsFileSize(pMaster, &nMasterJournal);
1584   if( rc!=SQLITE_OK ) goto delmaster_out;
1585 
1586   if( nMasterJournal>0 ){
1587     char *zJournal;
1588     char *zMasterPtr = 0;
1589     int nMasterPtr = pPager->pVfs->mxPathname+1;
1590 
1591     /* Load the entire master journal file into space obtained from
1592     ** sqlite3_malloc() and pointed to by zMasterJournal.
1593     */
1594     zMasterJournal = (char *)sqlite3_malloc(nMasterJournal + nMasterPtr);
1595     if( !zMasterJournal ){
1596       rc = SQLITE_NOMEM;
1597       goto delmaster_out;
1598     }
1599     zMasterPtr = &zMasterJournal[nMasterJournal];
1600     rc = sqlite3OsRead(pMaster, zMasterJournal, nMasterJournal, 0);
1601     if( rc!=SQLITE_OK ) goto delmaster_out;
1602 
1603     zJournal = zMasterJournal;
1604     while( (zJournal-zMasterJournal)<nMasterJournal ){
1605       rc = sqlite3OsAccess(pVfs, zJournal, SQLITE_ACCESS_EXISTS);
1606       if( rc!=0 && rc!=1 ){
1607         rc = SQLITE_IOERR_NOMEM;
1608         goto delmaster_out;
1609       }
1610       if( rc==1 ){
1611         /* One of the journals pointed to by the master journal exists.
1612         ** Open it and check if it points at the master journal. If
1613         ** so, return without deleting the master journal file.
1614         */
1615         int c;
1616         int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MAIN_JOURNAL);
1617         rc = sqlite3OsOpen(pVfs, zJournal, pJournal, flags, 0);
1618         if( rc!=SQLITE_OK ){
1619           goto delmaster_out;
1620         }
1621 
1622         rc = readMasterJournal(pJournal, zMasterPtr, nMasterPtr);
1623         sqlite3OsClose(pJournal);
1624         if( rc!=SQLITE_OK ){
1625           goto delmaster_out;
1626         }
1627 
1628         c = zMasterPtr[0]!=0 && strcmp(zMasterPtr, zMaster)==0;
1629         if( c ){
1630           /* We have a match. Do not delete the master journal file. */
1631           goto delmaster_out;
1632         }
1633       }
1634       zJournal += (strlen(zJournal)+1);
1635     }
1636   }
1637 
1638   rc = sqlite3OsDelete(pVfs, zMaster, 0);
1639 
1640 delmaster_out:
1641   if( zMasterJournal ){
1642     sqlite3_free(zMasterJournal);
1643   }
1644   if( master_open ){
1645     sqlite3OsClose(pMaster);
1646   }
1647   sqlite3_free(pMaster);
1648   return rc;
1649 }
1650 
1651 
1652 static void pager_truncate_cache(Pager *pPager);
1653 
1654 /*
1655 ** Truncate the main file of the given pager to the number of pages
1656 ** indicated. Also truncate the cached representation of the file.
1657 **
1658 ** Might might be the case that the file on disk is smaller than nPage.
1659 ** This can happen, for example, if we are in the middle of a transaction
1660 ** which has extended the file size and the new pages are still all held
1661 ** in cache, then an INSERT or UPDATE does a statement rollback.  Some
1662 ** operating system implementations can get confused if you try to
1663 ** truncate a file to some size that is larger than it currently is,
1664 ** so detect this case and do not do the truncation.
1665 */
1666 static int pager_truncate(Pager *pPager, int nPage){
1667   int rc = SQLITE_OK;
1668   if( pPager->state>=PAGER_EXCLUSIVE && pPager->fd->pMethods ){
1669     i64 currentSize, newSize;
1670     rc = sqlite3OsFileSize(pPager->fd, &currentSize);
1671     newSize = pPager->pageSize*(i64)nPage;
1672     if( rc==SQLITE_OK && currentSize>newSize ){
1673       rc = sqlite3OsTruncate(pPager->fd, newSize);
1674     }
1675   }
1676   if( rc==SQLITE_OK ){
1677     pPager->dbSize = nPage;
1678     pager_truncate_cache(pPager);
1679   }
1680   return rc;
1681 }
1682 
1683 /*
1684 ** Set the sectorSize for the given pager.
1685 **
1686 ** The sector size is at least as big as the sector size reported
1687 ** by sqlite3OsSectorSize().  The minimum sector size is 512.
1688 */
1689 static void setSectorSize(Pager *pPager){
1690   assert(pPager->fd->pMethods||pPager->tempFile);
1691   if( !pPager->tempFile ){
1692     /* Sector size doesn't matter for temporary files. Also, the file
1693     ** may not have been opened yet, in whcih case the OsSectorSize()
1694     ** call will segfault.
1695     */
1696     pPager->sectorSize = sqlite3OsSectorSize(pPager->fd);
1697   }
1698   if( pPager->sectorSize<512 ){
1699     pPager->sectorSize = 512;
1700   }
1701 }
1702 
1703 /*
1704 ** Playback the journal and thus restore the database file to
1705 ** the state it was in before we started making changes.
1706 **
1707 ** The journal file format is as follows:
1708 **
1709 **  (1)  8 byte prefix.  A copy of aJournalMagic[].
1710 **  (2)  4 byte big-endian integer which is the number of valid page records
1711 **       in the journal.  If this value is 0xffffffff, then compute the
1712 **       number of page records from the journal size.
1713 **  (3)  4 byte big-endian integer which is the initial value for the
1714 **       sanity checksum.
1715 **  (4)  4 byte integer which is the number of pages to truncate the
1716 **       database to during a rollback.
1717 **  (5)  4 byte big-endian integer which is the sector size.  The header
1718 **       is this many bytes in size.
1719 **  (6)  4 byte big-endian integer which is the page case.
1720 **  (7)  4 byte integer which is the number of bytes in the master journal
1721 **       name.  The value may be zero (indicate that there is no master
1722 **       journal.)
1723 **  (8)  N bytes of the master journal name.  The name will be nul-terminated
1724 **       and might be shorter than the value read from (5).  If the first byte
1725 **       of the name is \000 then there is no master journal.  The master
1726 **       journal name is stored in UTF-8.
1727 **  (9)  Zero or more pages instances, each as follows:
1728 **        +  4 byte page number.
1729 **        +  pPager->pageSize bytes of data.
1730 **        +  4 byte checksum
1731 **
1732 ** When we speak of the journal header, we mean the first 8 items above.
1733 ** Each entry in the journal is an instance of the 9th item.
1734 **
1735 ** Call the value from the second bullet "nRec".  nRec is the number of
1736 ** valid page entries in the journal.  In most cases, you can compute the
1737 ** value of nRec from the size of the journal file.  But if a power
1738 ** failure occurred while the journal was being written, it could be the
1739 ** case that the size of the journal file had already been increased but
1740 ** the extra entries had not yet made it safely to disk.  In such a case,
1741 ** the value of nRec computed from the file size would be too large.  For
1742 ** that reason, we always use the nRec value in the header.
1743 **
1744 ** If the nRec value is 0xffffffff it means that nRec should be computed
1745 ** from the file size.  This value is used when the user selects the
1746 ** no-sync option for the journal.  A power failure could lead to corruption
1747 ** in this case.  But for things like temporary table (which will be
1748 ** deleted when the power is restored) we don't care.
1749 **
1750 ** If the file opened as the journal file is not a well-formed
1751 ** journal file then all pages up to the first corrupted page are rolled
1752 ** back (or no pages if the journal header is corrupted). The journal file
1753 ** is then deleted and SQLITE_OK returned, just as if no corruption had
1754 ** been encountered.
1755 **
1756 ** If an I/O or malloc() error occurs, the journal-file is not deleted
1757 ** and an error code is returned.
1758 */
1759 static int pager_playback(Pager *pPager, int isHot){
1760   sqlite3_vfs *pVfs = pPager->pVfs;
1761   i64 szJ;                 /* Size of the journal file in bytes */
1762   u32 nRec;                /* Number of Records in the journal */
1763   int i;                   /* Loop counter */
1764   Pgno mxPg = 0;           /* Size of the original file in pages */
1765   int rc;                  /* Result code of a subroutine */
1766   int res = 0;             /* Value returned by sqlite3OsAccess() */
1767   char *zMaster = 0;       /* Name of master journal file if any */
1768 
1769   /* Figure out how many records are in the journal.  Abort early if
1770   ** the journal is empty.
1771   */
1772   assert( pPager->journalOpen );
1773   rc = sqlite3OsFileSize(pPager->jfd, &szJ);
1774   if( rc!=SQLITE_OK || szJ==0 ){
1775     goto end_playback;
1776   }
1777 
1778   /* Read the master journal name from the journal, if it is present.
1779   ** If a master journal file name is specified, but the file is not
1780   ** present on disk, then the journal is not hot and does not need to be
1781   ** played back.
1782   */
1783   zMaster = pPager->pTmpSpace;
1784   rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);
1785   if( rc!=SQLITE_OK || (zMaster[0]
1786    && (res=sqlite3OsAccess(pVfs, zMaster, SQLITE_ACCESS_EXISTS))==0 )
1787   ){
1788     zMaster = 0;
1789     goto end_playback;
1790   }
1791   zMaster = 0;
1792   if( res<0 ){
1793     rc = SQLITE_IOERR_NOMEM;
1794     goto end_playback;
1795   }
1796   pPager->journalOff = 0;
1797 
1798   /* This loop terminates either when the readJournalHdr() call returns
1799   ** SQLITE_DONE or an IO error occurs. */
1800   while( 1 ){
1801 
1802     /* Read the next journal header from the journal file.  If there are
1803     ** not enough bytes left in the journal file for a complete header, or
1804     ** it is corrupted, then a process must of failed while writing it.
1805     ** This indicates nothing more needs to be rolled back.
1806     */
1807     rc = readJournalHdr(pPager, szJ, &nRec, &mxPg);
1808     if( rc!=SQLITE_OK ){
1809       if( rc==SQLITE_DONE ){
1810         rc = SQLITE_OK;
1811       }
1812       goto end_playback;
1813     }
1814 
1815     /* If nRec is 0xffffffff, then this journal was created by a process
1816     ** working in no-sync mode. This means that the rest of the journal
1817     ** file consists of pages, there are no more journal headers. Compute
1818     ** the value of nRec based on this assumption.
1819     */
1820     if( nRec==0xffffffff ){
1821       assert( pPager->journalOff==JOURNAL_HDR_SZ(pPager) );
1822       nRec = (szJ - JOURNAL_HDR_SZ(pPager))/JOURNAL_PG_SZ(pPager);
1823     }
1824 
1825     /* If nRec is 0 and this rollback is of a transaction created by this
1826     ** process and if this is the final header in the journal, then it means
1827     ** that this part of the journal was being filled but has not yet been
1828     ** synced to disk.  Compute the number of pages based on the remaining
1829     ** size of the file.
1830     **
1831     ** The third term of the test was added to fix ticket #2565.
1832     */
1833     if( nRec==0 && !isHot &&
1834         pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff ){
1835       nRec = (szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager);
1836     }
1837 
1838     /* If this is the first header read from the journal, truncate the
1839     ** database file back to its original size.
1840     */
1841     if( pPager->journalOff==JOURNAL_HDR_SZ(pPager) ){
1842       rc = pager_truncate(pPager, mxPg);
1843       if( rc!=SQLITE_OK ){
1844         goto end_playback;
1845       }
1846     }
1847 
1848     /* Copy original pages out of the journal and back into the database file.
1849     */
1850     for(i=0; i<nRec; i++){
1851       rc = pager_playback_one_page(pPager, pPager->jfd, pPager->journalOff, 1);
1852       if( rc!=SQLITE_OK ){
1853         if( rc==SQLITE_DONE ){
1854           rc = SQLITE_OK;
1855           pPager->journalOff = szJ;
1856           break;
1857         }else{
1858           goto end_playback;
1859         }
1860       }
1861     }
1862   }
1863   /*NOTREACHED*/
1864   assert( 0 );
1865 
1866 end_playback:
1867   if( rc==SQLITE_OK ){
1868     zMaster = pPager->pTmpSpace;
1869     rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);
1870   }
1871   if( rc==SQLITE_OK ){
1872     rc = pager_end_transaction(pPager);
1873   }
1874   if( rc==SQLITE_OK && zMaster[0] ){
1875     /* If there was a master journal and this routine will return success,
1876     ** see if it is possible to delete the master journal.
1877     */
1878     rc = pager_delmaster(pPager, zMaster);
1879   }
1880 
1881   /* The Pager.sectorSize variable may have been updated while rolling
1882   ** back a journal created by a process with a different sector size
1883   ** value. Reset it to the correct value for this process.
1884   */
1885   setSectorSize(pPager);
1886   return rc;
1887 }
1888 
1889 /*
1890 ** Playback the statement journal.
1891 **
1892 ** This is similar to playing back the transaction journal but with
1893 ** a few extra twists.
1894 **
1895 **    (1)  The number of pages in the database file at the start of
1896 **         the statement is stored in pPager->stmtSize, not in the
1897 **         journal file itself.
1898 **
1899 **    (2)  In addition to playing back the statement journal, also
1900 **         playback all pages of the transaction journal beginning
1901 **         at offset pPager->stmtJSize.
1902 */
1903 static int pager_stmt_playback(Pager *pPager){
1904   i64 szJ;                 /* Size of the full journal */
1905   i64 hdrOff;
1906   int nRec;                /* Number of Records */
1907   int i;                   /* Loop counter */
1908   int rc;
1909 
1910   szJ = pPager->journalOff;
1911 #ifndef NDEBUG
1912   {
1913     i64 os_szJ;
1914     rc = sqlite3OsFileSize(pPager->jfd, &os_szJ);
1915     if( rc!=SQLITE_OK ) return rc;
1916     assert( szJ==os_szJ );
1917   }
1918 #endif
1919 
1920   /* Set hdrOff to be the offset just after the end of the last journal
1921   ** page written before the first journal-header for this statement
1922   ** transaction was written, or the end of the file if no journal
1923   ** header was written.
1924   */
1925   hdrOff = pPager->stmtHdrOff;
1926   assert( pPager->fullSync || !hdrOff );
1927   if( !hdrOff ){
1928     hdrOff = szJ;
1929   }
1930 
1931   /* Truncate the database back to its original size.
1932   */
1933   rc = pager_truncate(pPager, pPager->stmtSize);
1934   assert( pPager->state>=PAGER_SHARED );
1935 
1936   /* Figure out how many records are in the statement journal.
1937   */
1938   assert( pPager->stmtInUse && pPager->journalOpen );
1939   nRec = pPager->stmtNRec;
1940 
1941   /* Copy original pages out of the statement journal and back into the
1942   ** database file.  Note that the statement journal omits checksums from
1943   ** each record since power-failure recovery is not important to statement
1944   ** journals.
1945   */
1946   for(i=0; i<nRec; i++){
1947     i64 offset = i*(4+pPager->pageSize);
1948     rc = pager_playback_one_page(pPager, pPager->stfd, offset, 0);
1949     assert( rc!=SQLITE_DONE );
1950     if( rc!=SQLITE_OK ) goto end_stmt_playback;
1951   }
1952 
1953   /* Now roll some pages back from the transaction journal. Pager.stmtJSize
1954   ** was the size of the journal file when this statement was started, so
1955   ** everything after that needs to be rolled back, either into the
1956   ** database, the memory cache, or both.
1957   **
1958   ** If it is not zero, then Pager.stmtHdrOff is the offset to the start
1959   ** of the first journal header written during this statement transaction.
1960   */
1961   pPager->journalOff = pPager->stmtJSize;
1962   pPager->cksumInit = pPager->stmtCksum;
1963   while( pPager->journalOff < hdrOff ){
1964     rc = pager_playback_one_page(pPager, pPager->jfd, pPager->journalOff, 1);
1965     assert( rc!=SQLITE_DONE );
1966     if( rc!=SQLITE_OK ) goto end_stmt_playback;
1967   }
1968 
1969   while( pPager->journalOff < szJ ){
1970     u32 nJRec;         /* Number of Journal Records */
1971     u32 dummy;
1972     rc = readJournalHdr(pPager, szJ, &nJRec, &dummy);
1973     if( rc!=SQLITE_OK ){
1974       assert( rc!=SQLITE_DONE );
1975       goto end_stmt_playback;
1976     }
1977     if( nJRec==0 ){
1978       nJRec = (szJ - pPager->journalOff) / (pPager->pageSize+8);
1979     }
1980     for(i=nJRec-1; i>=0 && pPager->journalOff < szJ; i--){
1981       rc = pager_playback_one_page(pPager, pPager->jfd, pPager->journalOff, 1);
1982       assert( rc!=SQLITE_DONE );
1983       if( rc!=SQLITE_OK ) goto end_stmt_playback;
1984     }
1985   }
1986 
1987   pPager->journalOff = szJ;
1988 
1989 end_stmt_playback:
1990   if( rc==SQLITE_OK) {
1991     pPager->journalOff = szJ;
1992     /* pager_reload_cache(pPager); */
1993   }
1994   return rc;
1995 }
1996 
1997 /*
1998 ** Change the maximum number of in-memory pages that are allowed.
1999 */
2000 void sqlite3PagerSetCachesize(Pager *pPager, int mxPage){
2001   if( mxPage>10 ){
2002     pPager->mxPage = mxPage;
2003   }else{
2004     pPager->mxPage = 10;
2005   }
2006 }
2007 
2008 /*
2009 ** Adjust the robustness of the database to damage due to OS crashes
2010 ** or power failures by changing the number of syncs()s when writing
2011 ** the rollback journal.  There are three levels:
2012 **
2013 **    OFF       sqlite3OsSync() is never called.  This is the default
2014 **              for temporary and transient files.
2015 **
2016 **    NORMAL    The journal is synced once before writes begin on the
2017 **              database.  This is normally adequate protection, but
2018 **              it is theoretically possible, though very unlikely,
2019 **              that an inopertune power failure could leave the journal
2020 **              in a state which would cause damage to the database
2021 **              when it is rolled back.
2022 **
2023 **    FULL      The journal is synced twice before writes begin on the
2024 **              database (with some additional information - the nRec field
2025 **              of the journal header - being written in between the two
2026 **              syncs).  If we assume that writing a
2027 **              single disk sector is atomic, then this mode provides
2028 **              assurance that the journal will not be corrupted to the
2029 **              point of causing damage to the database during rollback.
2030 **
2031 ** Numeric values associated with these states are OFF==1, NORMAL=2,
2032 ** and FULL=3.
2033 */
2034 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2035 void sqlite3PagerSetSafetyLevel(Pager *pPager, int level, int full_fsync){
2036   pPager->noSync =  level==1 || pPager->tempFile;
2037   pPager->fullSync = level==3 && !pPager->tempFile;
2038   pPager->sync_flags = (full_fsync?SQLITE_SYNC_FULL:SQLITE_SYNC_NORMAL);
2039   if( pPager->noSync ) pPager->needSync = 0;
2040 }
2041 #endif
2042 
2043 /*
2044 ** The following global variable is incremented whenever the library
2045 ** attempts to open a temporary file.  This information is used for
2046 ** testing and analysis only.
2047 */
2048 #ifdef SQLITE_TEST
2049 int sqlite3_opentemp_count = 0;
2050 #endif
2051 
2052 /*
2053 ** Open a temporary file.
2054 **
2055 ** Write the file descriptor into *fd.  Return SQLITE_OK on success or some
2056 ** other error code if we fail. The OS will automatically delete the temporary
2057 ** file when it is closed.
2058 */
2059 static int sqlite3PagerOpentemp(
2060   sqlite3_vfs *pVfs,    /* The virtual file system layer */
2061   sqlite3_file *pFile,  /* Write the file descriptor here */
2062   char *zFilename,      /* Name of the file.  Might be NULL */
2063   int vfsFlags          /* Flags passed through to the VFS */
2064 ){
2065   int rc;
2066   assert( zFilename!=0 );
2067 
2068 #ifdef SQLITE_TEST
2069   sqlite3_opentemp_count++;  /* Used for testing and analysis only */
2070 #endif
2071 
2072   vfsFlags |=  SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
2073             SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE;
2074   rc = sqlite3OsOpen(pVfs, zFilename, pFile, vfsFlags, 0);
2075   assert( rc!=SQLITE_OK || pFile->pMethods );
2076   return rc;
2077 }
2078 
2079 /*
2080 ** Create a new page cache and put a pointer to the page cache in *ppPager.
2081 ** The file to be cached need not exist.  The file is not locked until
2082 ** the first call to sqlite3PagerGet() and is only held open until the
2083 ** last page is released using sqlite3PagerUnref().
2084 **
2085 ** If zFilename is NULL then a randomly-named temporary file is created
2086 ** and used as the file to be cached.  The file will be deleted
2087 ** automatically when it is closed.
2088 **
2089 ** If zFilename is ":memory:" then all information is held in cache.
2090 ** It is never written to disk.  This can be used to implement an
2091 ** in-memory database.
2092 */
2093 int sqlite3PagerOpen(
2094   sqlite3_vfs *pVfs,       /* The virtual file system to use */
2095   Pager **ppPager,         /* Return the Pager structure here */
2096   const char *zFilename,   /* Name of the database file to open */
2097   int nExtra,              /* Extra bytes append to each in-memory page */
2098   int flags,               /* flags controlling this file */
2099   int vfsFlags             /* flags passed through to sqlite3_vfs.xOpen() */
2100 ){
2101   u8 *pPtr;
2102   Pager *pPager = 0;
2103   int rc = SQLITE_OK;
2104   int i;
2105   int tempFile = 0;
2106   int memDb = 0;
2107   int readOnly = 0;
2108   int useJournal = (flags & PAGER_OMIT_JOURNAL)==0;
2109   int noReadlock = (flags & PAGER_NO_READLOCK)!=0;
2110   int journalFileSize = sqlite3JournalSize(pVfs);
2111   int nDefaultPage = SQLITE_DEFAULT_PAGE_SIZE;
2112   char *zPathname;
2113   int nPathname;
2114   char *zStmtJrnl;
2115   int nStmtJrnl;
2116 
2117   /* The default return is a NULL pointer */
2118   *ppPager = 0;
2119 
2120   /* Compute the full pathname */
2121   nPathname = pVfs->mxPathname+1;
2122   zPathname = sqlite3_malloc(nPathname*2);
2123   if( zPathname==0 ){
2124     return SQLITE_NOMEM;
2125   }
2126   if( zFilename && zFilename[0] ){
2127 #ifndef SQLITE_OMIT_MEMORYDB
2128     if( strcmp(zFilename,":memory:")==0 ){
2129       memDb = 1;
2130       zPathname[0] = 0;
2131     }else
2132 #endif
2133     {
2134       rc = sqlite3OsFullPathname(pVfs, zFilename, nPathname, zPathname);
2135     }
2136   }else{
2137     rc = sqlite3OsGetTempname(pVfs, nPathname, zPathname);
2138   }
2139   if( rc!=SQLITE_OK ){
2140     sqlite3_free(zPathname);
2141     return rc;
2142   }
2143   nPathname = strlen(zPathname);
2144 
2145   /* Put the statement journal in temporary disk space since this is
2146   ** sometimes RAM disk or other optimized storage.  Unlikely the main
2147   ** main journal file, the statement journal does not need to be
2148   ** colocated with the database nor does it need to be persistent.
2149   */
2150   zStmtJrnl = &zPathname[nPathname+1];
2151   rc = sqlite3OsGetTempname(pVfs, pVfs->mxPathname+1, zStmtJrnl);
2152   if( rc!=SQLITE_OK ){
2153     sqlite3_free(zPathname);
2154     return rc;
2155   }
2156   nStmtJrnl = strlen(zStmtJrnl);
2157 
2158   /* Allocate memory for the pager structure */
2159   pPager = sqlite3MallocZero(
2160     sizeof(*pPager) +           /* Pager structure */
2161     journalFileSize +           /* The journal file structure */
2162     pVfs->szOsFile * 3 +        /* The main db and two journal files */
2163     3*nPathname + 40 +          /* zFilename, zDirectory, zJournal */
2164     nStmtJrnl                   /* zStmtJrnl */
2165   );
2166   if( !pPager ){
2167     sqlite3_free(zPathname);
2168     return SQLITE_NOMEM;
2169   }
2170   pPtr = (u8 *)&pPager[1];
2171   pPager->vfsFlags = vfsFlags;
2172   pPager->fd = (sqlite3_file*)&pPtr[pVfs->szOsFile*0];
2173   pPager->stfd = (sqlite3_file*)&pPtr[pVfs->szOsFile*1];
2174   pPager->jfd = (sqlite3_file*)&pPtr[pVfs->szOsFile*2];
2175   pPager->zFilename = (char*)&pPtr[pVfs->szOsFile*2+journalFileSize];
2176   pPager->zDirectory = &pPager->zFilename[nPathname+1];
2177   pPager->zJournal = &pPager->zDirectory[nPathname+1];
2178   pPager->zStmtJrnl = &pPager->zJournal[nPathname+10];
2179   pPager->pVfs = pVfs;
2180   memcpy(pPager->zFilename, zPathname, nPathname+1);
2181   memcpy(pPager->zStmtJrnl, zStmtJrnl, nStmtJrnl+1);
2182   sqlite3_free(zPathname);
2183 
2184   /* Open the pager file.
2185   */
2186   if( zFilename && zFilename[0] && !memDb ){
2187     if( nPathname>(pVfs->mxPathname - sizeof("-journal")) ){
2188       rc = SQLITE_CANTOPEN;
2189     }else{
2190       int fout = 0;
2191       rc = sqlite3OsOpen(pVfs, pPager->zFilename, pPager->fd,
2192                          pPager->vfsFlags, &fout);
2193       readOnly = (fout&SQLITE_OPEN_READONLY);
2194 
2195       /* If the file was successfully opened for read/write access,
2196       ** choose a default page size in case we have to create the
2197       ** database file. The default page size is the maximum of:
2198       **
2199       **    + SQLITE_DEFAULT_PAGE_SIZE,
2200       **    + The value returned by sqlite3OsSectorSize()
2201       **    + The largest page size that can be written atomically.
2202       */
2203       if( rc==SQLITE_OK && !readOnly ){
2204         int iSectorSize = sqlite3OsSectorSize(pPager->fd);
2205         if( nDefaultPage<iSectorSize ){
2206           nDefaultPage = iSectorSize;
2207         }
2208 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
2209         {
2210           int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
2211           int ii;
2212           assert(SQLITE_IOCAP_ATOMIC512==(512>>8));
2213           assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8));
2214           assert(SQLITE_MAX_DEFAULT_PAGE_SIZE<=65536);
2215           for(ii=nDefaultPage; ii<=SQLITE_MAX_DEFAULT_PAGE_SIZE; ii=ii*2){
2216             if( iDc&(SQLITE_IOCAP_ATOMIC|(ii>>8)) ) nDefaultPage = ii;
2217           }
2218         }
2219 #endif
2220         if( nDefaultPage>SQLITE_MAX_DEFAULT_PAGE_SIZE ){
2221           nDefaultPage = SQLITE_MAX_DEFAULT_PAGE_SIZE;
2222         }
2223       }
2224     }
2225   }else if( !memDb ){
2226     /* If a temporary file is requested, it is not opened immediately.
2227     ** In this case we accept the default page size and delay actually
2228     ** opening the file until the first call to OsWrite().
2229     */
2230     tempFile = 1;
2231     pPager->state = PAGER_EXCLUSIVE;
2232   }
2233 
2234   if( pPager && rc==SQLITE_OK ){
2235     pPager->pTmpSpace = (char *)sqlite3_malloc(nDefaultPage);
2236   }
2237 
2238   /* If an error occured in either of the blocks above.
2239   ** Free the Pager structure and close the file.
2240   ** Since the pager is not allocated there is no need to set
2241   ** any Pager.errMask variables.
2242   */
2243   if( !pPager || !pPager->pTmpSpace ){
2244     sqlite3OsClose(pPager->fd);
2245     sqlite3_free(pPager);
2246     return ((rc==SQLITE_OK)?SQLITE_NOMEM:rc);
2247   }
2248 
2249   PAGERTRACE3("OPEN %d %s\n", FILEHANDLEID(pPager->fd), pPager->zFilename);
2250   IOTRACE(("OPEN %p %s\n", pPager, pPager->zFilename))
2251 
2252   /* Fill in Pager.zDirectory[] */
2253   memcpy(pPager->zDirectory, pPager->zFilename, nPathname+1);
2254   for(i=strlen(pPager->zDirectory); i>0 && pPager->zDirectory[i-1]!='/'; i--){}
2255   if( i>0 ) pPager->zDirectory[i-1] = 0;
2256 
2257   /* Fill in Pager.zJournal[] */
2258   memcpy(pPager->zJournal, pPager->zFilename, nPathname);
2259   memcpy(&pPager->zJournal[nPathname], "-journal", 9);
2260 
2261   /* pPager->journalOpen = 0; */
2262   pPager->useJournal = useJournal && !memDb;
2263   pPager->noReadlock = noReadlock && readOnly;
2264   /* pPager->stmtOpen = 0; */
2265   /* pPager->stmtInUse = 0; */
2266   /* pPager->nRef = 0; */
2267   pPager->dbSize = memDb-1;
2268   pPager->pageSize = nDefaultPage;
2269   /* pPager->stmtSize = 0; */
2270   /* pPager->stmtJSize = 0; */
2271   /* pPager->nPage = 0; */
2272   pPager->mxPage = 100;
2273   pPager->mxPgno = SQLITE_MAX_PAGE_COUNT;
2274   /* pPager->state = PAGER_UNLOCK; */
2275   assert( pPager->state == (tempFile ? PAGER_EXCLUSIVE : PAGER_UNLOCK) );
2276   /* pPager->errMask = 0; */
2277   pPager->tempFile = tempFile;
2278   assert( tempFile==PAGER_LOCKINGMODE_NORMAL
2279           || tempFile==PAGER_LOCKINGMODE_EXCLUSIVE );
2280   assert( PAGER_LOCKINGMODE_EXCLUSIVE==1 );
2281   pPager->exclusiveMode = tempFile;
2282   pPager->memDb = memDb;
2283   pPager->readOnly = readOnly;
2284   /* pPager->needSync = 0; */
2285   pPager->noSync = pPager->tempFile || !useJournal;
2286   pPager->fullSync = (pPager->noSync?0:1);
2287   pPager->sync_flags = SQLITE_SYNC_NORMAL;
2288   /* pPager->pFirst = 0; */
2289   /* pPager->pFirstSynced = 0; */
2290   /* pPager->pLast = 0; */
2291   pPager->nExtra = FORCE_ALIGNMENT(nExtra);
2292   assert(pPager->fd->pMethods||memDb||tempFile);
2293   if( !memDb ){
2294     setSectorSize(pPager);
2295   }
2296   /* pPager->pBusyHandler = 0; */
2297   /* memset(pPager->aHash, 0, sizeof(pPager->aHash)); */
2298   *ppPager = pPager;
2299 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
2300   pPager->iInUseMM = 0;
2301   pPager->iInUseDB = 0;
2302   if( !memDb ){
2303     sqlite3_mutex *mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MEM2);
2304     sqlite3_mutex_enter(mutex);
2305     pPager->pNext = sqlite3PagerList;
2306     if( sqlite3PagerList ){
2307       assert( sqlite3PagerList->pPrev==0 );
2308       sqlite3PagerList->pPrev = pPager;
2309     }
2310     pPager->pPrev = 0;
2311     sqlite3PagerList = pPager;
2312     sqlite3_mutex_leave(mutex);
2313   }
2314 #endif
2315   return SQLITE_OK;
2316 }
2317 
2318 /*
2319 ** Set the busy handler function.
2320 */
2321 void sqlite3PagerSetBusyhandler(Pager *pPager, BusyHandler *pBusyHandler){
2322   pPager->pBusyHandler = pBusyHandler;
2323 }
2324 
2325 /*
2326 ** Set the destructor for this pager.  If not NULL, the destructor is called
2327 ** when the reference count on each page reaches zero.  The destructor can
2328 ** be used to clean up information in the extra segment appended to each page.
2329 **
2330 ** The destructor is not called as a result sqlite3PagerClose().
2331 ** Destructors are only called by sqlite3PagerUnref().
2332 */
2333 void sqlite3PagerSetDestructor(Pager *pPager, void (*xDesc)(DbPage*,int)){
2334   pPager->xDestructor = xDesc;
2335 }
2336 
2337 /*
2338 ** Set the reinitializer for this pager.  If not NULL, the reinitializer
2339 ** is called when the content of a page in cache is restored to its original
2340 ** value as a result of a rollback.  The callback gives higher-level code
2341 ** an opportunity to restore the EXTRA section to agree with the restored
2342 ** page data.
2343 */
2344 void sqlite3PagerSetReiniter(Pager *pPager, void (*xReinit)(DbPage*,int)){
2345   pPager->xReiniter = xReinit;
2346 }
2347 
2348 /*
2349 ** Set the page size to *pPageSize. If the suggest new page size is
2350 ** inappropriate, then an alternative page size is set to that
2351 ** value before returning.
2352 */
2353 int sqlite3PagerSetPagesize(Pager *pPager, u16 *pPageSize){
2354   int rc = SQLITE_OK;
2355   u16 pageSize = *pPageSize;
2356   assert( pageSize==0 || (pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE) );
2357   if( pageSize && pageSize!=pPager->pageSize
2358    && !pPager->memDb && pPager->nRef==0
2359   ){
2360     char *pNew = (char *)sqlite3_malloc(pageSize);
2361     if( !pNew ){
2362       rc = SQLITE_NOMEM;
2363     }else{
2364       pagerEnter(pPager);
2365       pager_reset(pPager);
2366       pPager->pageSize = pageSize;
2367       setSectorSize(pPager);
2368       sqlite3_free(pPager->pTmpSpace);
2369       pPager->pTmpSpace = pNew;
2370       pagerLeave(pPager);
2371     }
2372   }
2373   *pPageSize = pPager->pageSize;
2374   return rc;
2375 }
2376 
2377 /*
2378 ** Return a pointer to the "temporary page" buffer held internally
2379 ** by the pager.  This is a buffer that is big enough to hold the
2380 ** entire content of a database page.  This buffer is used internally
2381 ** during rollback and will be overwritten whenever a rollback
2382 ** occurs.  But other modules are free to use it too, as long as
2383 ** no rollbacks are happening.
2384 */
2385 void *sqlite3PagerTempSpace(Pager *pPager){
2386   return pPager->pTmpSpace;
2387 }
2388 
2389 /*
2390 ** Attempt to set the maximum database page count if mxPage is positive.
2391 ** Make no changes if mxPage is zero or negative.  And never reduce the
2392 ** maximum page count below the current size of the database.
2393 **
2394 ** Regardless of mxPage, return the current maximum page count.
2395 */
2396 int sqlite3PagerMaxPageCount(Pager *pPager, int mxPage){
2397   if( mxPage>0 ){
2398     pPager->mxPgno = mxPage;
2399   }
2400   sqlite3PagerPagecount(pPager);
2401   return pPager->mxPgno;
2402 }
2403 
/*
** Helpers that switch the simulated I/O error mechanism off and back
** on again.  They bracket code in which simulated errors are unwanted
** because errors there are deliberately ignored.
**
** disable_simulated_io_errors() remembers the current value of
** sqlite3_io_error_pending and replaces it with -1;
** enable_simulated_io_errors() puts the remembered value back.  Only
** one value is remembered, so calls are not expected to nest.
**
** Without -DSQLITE_TEST=1 both names are empty macros and generate no
** code at all.
*/
#if defined(SQLITE_TEST)
extern int sqlite3_io_error_pending;
extern int sqlite3_io_error_hit;
static int saved_cnt;
void disable_simulated_io_errors(void){
  int nPending = sqlite3_io_error_pending;
  sqlite3_io_error_pending = -1;
  saved_cnt = nPending;
}
void enable_simulated_io_errors(void){
  int nRestore = saved_cnt;
  sqlite3_io_error_pending = nRestore;
}
#else
# define disable_simulated_io_errors()
# define enable_simulated_io_errors()
#endif
2427 
2428 /*
2429 ** Read the first N bytes from the beginning of the file into memory
2430 ** that pDest points to.
2431 **
2432 ** No error checking is done. The rational for this is that this function
2433 ** may be called even if the file does not exist or contain a header. In
2434 ** these cases sqlite3OsRead() will return an error, to which the correct
2435 ** response is to zero the memory at pDest and continue.  A real IO error
2436 ** will presumably recur and be picked up later (Todo: Think about this).
2437 */
2438 int sqlite3PagerReadFileheader(Pager *pPager, int N, unsigned char *pDest){
2439   int rc = SQLITE_OK;
2440   memset(pDest, 0, N);
2441   assert(MEMDB||pPager->fd->pMethods||pPager->tempFile);
2442   if( pPager->fd->pMethods ){
2443     IOTRACE(("DBHDR %p 0 %d\n", pPager, N))
2444     rc = sqlite3OsRead(pPager->fd, pDest, N, 0);
2445     if( rc==SQLITE_IOERR_SHORT_READ ){
2446       rc = SQLITE_OK;
2447     }
2448   }
2449   return rc;
2450 }
2451 
2452 /*
2453 ** Return the total number of pages in the disk file associated with
2454 ** pPager.
2455 **
2456 ** If the PENDING_BYTE lies on the page directly after the end of the
2457 ** file, then consider this page part of the file too. For example, if
2458 ** PENDING_BYTE is byte 4096 (the first byte of page 5) and the size of the
2459 ** file is 4096 bytes, 5 is returned instead of 4.
2460 */
2461 int sqlite3PagerPagecount(Pager *pPager){
2462   i64 n = 0;
2463   int rc;
2464   assert( pPager!=0 );
2465   if( pPager->errCode ){
2466     return -1;
2467   }
2468   if( pPager->dbSize>=0 ){
2469     n = pPager->dbSize;
2470   } else {
2471     assert(pPager->fd->pMethods||pPager->tempFile);
2472     if( (pPager->fd->pMethods)
2473      && (rc = sqlite3OsFileSize(pPager->fd, &n))!=SQLITE_OK ){
2474       pPager->nRef++;
2475       pager_error(pPager, rc);
2476       pPager->nRef--;
2477       return -1;
2478     }
2479     if( n>0 && n<pPager->pageSize ){
2480       n = 1;
2481     }else{
2482       n /= pPager->pageSize;
2483     }
2484     if( pPager->state!=PAGER_UNLOCK ){
2485       pPager->dbSize = n;
2486     }
2487   }
2488   if( n==(PENDING_BYTE/pPager->pageSize) ){
2489     n++;
2490   }
2491   if( n>pPager->mxPgno ){
2492     pPager->mxPgno = n;
2493   }
2494   return n;
2495 }
2496 
2497 
2498 #ifndef SQLITE_OMIT_MEMORYDB
2499 /*
2500 ** Clear a PgHistory block
2501 */
2502 static void clearHistory(PgHistory *pHist){
2503   sqlite3_free(pHist->pOrig);
2504   sqlite3_free(pHist->pStmt);
2505   pHist->pOrig = 0;
2506   pHist->pStmt = 0;
2507 }
2508 #else
2509 #define clearHistory(x)
2510 #endif
2511 
2512 /*
2513 ** Forward declaration
2514 */
2515 static int syncJournal(Pager*);
2516 
2517 /*
2518 ** Unlink pPg from its hash chain. Also set the page number to 0 to indicate
2519 ** that the page is not part of any hash chain. This is required because the
2520 ** sqlite3PagerMovepage() routine can leave a page in the
2521 ** pNextFree/pPrevFree list that is not a part of any hash-chain.
2522 */
2523 static void unlinkHashChain(Pager *pPager, PgHdr *pPg){
2524   if( pPg->pgno==0 ){
2525     assert( pPg->pNextHash==0 && pPg->pPrevHash==0 );
2526     return;
2527   }
2528   if( pPg->pNextHash ){
2529     pPg->pNextHash->pPrevHash = pPg->pPrevHash;
2530   }
2531   if( pPg->pPrevHash ){
2532     assert( pPager->aHash[pPg->pgno & (pPager->nHash-1)]!=pPg );
2533     pPg->pPrevHash->pNextHash = pPg->pNextHash;
2534   }else{
2535     int h = pPg->pgno & (pPager->nHash-1);
2536     pPager->aHash[h] = pPg->pNextHash;
2537   }
2538   if( MEMDB ){
2539     clearHistory(PGHDR_TO_HIST(pPg, pPager));
2540   }
2541   pPg->pgno = 0;
2542   pPg->pNextHash = pPg->pPrevHash = 0;
2543 }
2544 
2545 /*
2546 ** Unlink a page from the free list (the list of all pages where nRef==0)
2547 ** and from its hash collision chain.
2548 */
2549 static void unlinkPage(PgHdr *pPg){
2550   Pager *pPager = pPg->pPager;
2551 
2552   /* Unlink from free page list */
2553   lruListRemove(pPg);
2554 
2555   /* Unlink from the pgno hash table */
2556   unlinkHashChain(pPager, pPg);
2557 }
2558 
2559 /*
2560 ** This routine is used to truncate the cache when a database
2561 ** is truncated.  Drop from the cache all pages whose pgno is
2562 ** larger than pPager->dbSize and is unreferenced.
2563 **
2564 ** Referenced pages larger than pPager->dbSize are zeroed.
2565 **
2566 ** Actually, at the point this routine is called, it would be
2567 ** an error to have a referenced page.  But rather than delete
2568 ** that page and guarantee a subsequent segfault, it seems better
2569 ** to zero it and hope that we error out sanely.
2570 */
2571 static void pager_truncate_cache(Pager *pPager){
2572   PgHdr *pPg;
2573   PgHdr **ppPg;
2574   int dbSize = pPager->dbSize;
2575 
2576   ppPg = &pPager->pAll;
2577   while( (pPg = *ppPg)!=0 ){
2578     if( pPg->pgno<=dbSize ){
2579       ppPg = &pPg->pNextAll;
2580     }else if( pPg->nRef>0 ){
2581       memset(PGHDR_TO_DATA(pPg), 0, pPager->pageSize);
2582       ppPg = &pPg->pNextAll;
2583     }else{
2584       *ppPg = pPg->pNextAll;
2585       IOTRACE(("PGFREE %p %d\n", pPager, pPg->pgno));
2586       PAGER_INCR(sqlite3_pager_pgfree_count);
2587       unlinkPage(pPg);
2588       makeClean(pPg);
2589       sqlite3_free(pPg->pData);
2590       sqlite3_free(pPg);
2591       pPager->nPage--;
2592     }
2593   }
2594 }
2595 
2596 /*
2597 ** Try to obtain a lock on a file.  Invoke the busy callback if the lock
2598 ** is currently not available.  Repeat until the busy callback returns
2599 ** false or until the lock succeeds.
2600 **
2601 ** Return SQLITE_OK on success and an error code if we cannot obtain
2602 ** the lock.
2603 */
2604 static int pager_wait_on_lock(Pager *pPager, int locktype){
2605   int rc;
2606 
2607   /* The OS lock values must be the same as the Pager lock values */
2608   assert( PAGER_SHARED==SHARED_LOCK );
2609   assert( PAGER_RESERVED==RESERVED_LOCK );
2610   assert( PAGER_EXCLUSIVE==EXCLUSIVE_LOCK );
2611 
2612   /* If the file is currently unlocked then the size must be unknown */
2613   assert( pPager->state>=PAGER_SHARED || pPager->dbSize<0 || MEMDB );
2614 
2615   if( pPager->state>=locktype ){
2616     rc = SQLITE_OK;
2617   }else{
2618     if( pPager->pBusyHandler ) pPager->pBusyHandler->nBusy = 0;
2619     do {
2620       rc = sqlite3OsLock(pPager->fd, locktype);
2621     }while( rc==SQLITE_BUSY && sqlite3InvokeBusyHandler(pPager->pBusyHandler) );
2622     if( rc==SQLITE_OK ){
2623       pPager->state = locktype;
2624       IOTRACE(("LOCK %p %d\n", pPager, locktype))
2625     }
2626   }
2627   return rc;
2628 }
2629 
2630 /*
2631 ** Truncate the file to the number of pages specified.
2632 */
2633 int sqlite3PagerTruncate(Pager *pPager, Pgno nPage){
2634   int rc;
2635   assert( pPager->state>=PAGER_SHARED || MEMDB );
2636   sqlite3PagerPagecount(pPager);
2637   if( pPager->errCode ){
2638     rc = pPager->errCode;
2639     return rc;
2640   }
2641   if( nPage>=(unsigned)pPager->dbSize ){
2642     return SQLITE_OK;
2643   }
2644   if( MEMDB ){
2645     pPager->dbSize = nPage;
2646     pager_truncate_cache(pPager);
2647     return SQLITE_OK;
2648   }
2649   pagerEnter(pPager);
2650   rc = syncJournal(pPager);
2651   pagerLeave(pPager);
2652   if( rc!=SQLITE_OK ){
2653     return rc;
2654   }
2655 
2656   /* Get an exclusive lock on the database before truncating. */
2657   pagerEnter(pPager);
2658   rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
2659   pagerLeave(pPager);
2660   if( rc!=SQLITE_OK ){
2661     return rc;
2662   }
2663 
2664   rc = pager_truncate(pPager, nPage);
2665   return rc;
2666 }
2667 
2668 /*
2669 ** Shutdown the page cache.  Free all memory and close all files.
2670 **
2671 ** If a transaction was in progress when this routine is called, that
2672 ** transaction is rolled back.  All outstanding pages are invalidated
2673 ** and their memory is freed.  Any attempt to use a page associated
2674 ** with this page cache after this function returns will likely
2675 ** result in a coredump.
2676 **
2677 ** This function always succeeds. If a transaction is active an attempt
2678 ** is made to roll it back. If an error occurs during the rollback
2679 ** a hot journal may be left in the filesystem but no error is returned
2680 ** to the caller.
2681 */
2682 int sqlite3PagerClose(Pager *pPager){
2683 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
2684   if( !MEMDB ){
2685     sqlite3_mutex *mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MEM2);
2686     sqlite3_mutex_enter(mutex);
2687     if( pPager->pPrev ){
2688       pPager->pPrev->pNext = pPager->pNext;
2689     }else{
2690       sqlite3PagerList = pPager->pNext;
2691     }
2692     if( pPager->pNext ){
2693       pPager->pNext->pPrev = pPager->pPrev;
2694     }
2695     sqlite3_mutex_leave(mutex);
2696   }
2697 #endif
2698 
2699   disable_simulated_io_errors();
2700   pPager->errCode = 0;
2701   pPager->exclusiveMode = 0;
2702   pager_reset(pPager);
2703   pagerUnlockAndRollback(pPager);
2704   enable_simulated_io_errors();
2705   PAGERTRACE2("CLOSE %d\n", PAGERID(pPager));
2706   IOTRACE(("CLOSE %p\n", pPager))
2707   assert( pPager->errCode || (pPager->journalOpen==0 && pPager->stmtOpen==0) );
2708   if( pPager->journalOpen ){
2709     sqlite3OsClose(pPager->jfd);
2710   }
2711   sqlite3BitvecDestroy(pPager->pInJournal);
2712   if( pPager->stmtOpen ){
2713     sqlite3OsClose(pPager->stfd);
2714   }
2715   sqlite3OsClose(pPager->fd);
2716   /* Temp files are automatically deleted by the OS
2717   ** if( pPager->tempFile ){
2718   **   sqlite3OsDelete(pPager->zFilename);
2719   ** }
2720   */
2721 
2722   sqlite3_free(pPager->aHash);
2723   sqlite3_free(pPager->pTmpSpace);
2724   sqlite3_free(pPager);
2725   return SQLITE_OK;
2726 }
2727 
2728 #if !defined(NDEBUG) || defined(SQLITE_TEST)
2729 /*
2730 ** Return the page number for the given page data.
2731 */
2732 Pgno sqlite3PagerPagenumber(DbPage *p){
2733   return p->pgno;
2734 }
2735 #endif
2736 
2737 /*
2738 ** The page_ref() function increments the reference count for a page.
2739 ** If the page is currently on the freelist (the reference count is zero) then
2740 ** remove it from the freelist.
2741 **
2742 ** For non-test systems, page_ref() is a macro that calls _page_ref()
2743 ** online of the reference count is zero.  For test systems, page_ref()
2744 ** is a real function so that we can set breakpoints and trace it.
2745 */
2746 static void _page_ref(PgHdr *pPg){
2747   if( pPg->nRef==0 ){
2748     /* The page is currently on the freelist.  Remove it. */
2749     lruListRemove(pPg);
2750     pPg->pPager->nRef++;
2751   }
2752   pPg->nRef++;
2753 }
2754 #ifdef SQLITE_DEBUG
2755   static void page_ref(PgHdr *pPg){
2756     if( pPg->nRef==0 ){
2757       _page_ref(pPg);
2758     }else{
2759       pPg->nRef++;
2760     }
2761   }
2762 #else
2763 # define page_ref(P)   ((P)->nRef==0?_page_ref(P):(void)(P)->nRef++)
2764 #endif
2765 
2766 /*
2767 ** Increment the reference count for a page.  The input pointer is
2768 ** a reference to the page data.
2769 */
2770 int sqlite3PagerRef(DbPage *pPg){
2771   pagerEnter(pPg->pPager);
2772   page_ref(pPg);
2773   pagerLeave(pPg->pPager);
2774   return SQLITE_OK;
2775 }
2776 
2777 /*
2778 ** Sync the journal.  In other words, make sure all the pages that have
2779 ** been written to the journal have actually reached the surface of the
2780 ** disk.  It is not safe to modify the original database file until after
2781 ** the journal has been synced.  If the original database is modified before
2782 ** the journal is synced and a power failure occurs, the unsynced journal
2783 ** data would be lost and we would be unable to completely rollback the
2784 ** database changes.  Database corruption would occur.
2785 **
2786 ** This routine also updates the nRec field in the header of the journal.
2787 ** (See comments on the pager_playback() routine for additional information.)
2788 ** If the sync mode is FULL, two syncs will occur.  First the whole journal
2789 ** is synced, then the nRec field is updated, then a second sync occurs.
2790 **
2791 ** For temporary databases, we do not care if we are able to rollback
2792 ** after a power failure, so no sync occurs.
2793 **
2794 ** If the IOCAP_SEQUENTIAL flag is set for the persistent media on which
2795 ** the database is stored, then OsSync() is never called on the journal
2796 ** file. In this case all that is required is to update the nRec field in
2797 ** the journal header.
2798 **
2799 ** This routine clears the needSync field of every page current held in
2800 ** memory.
2801 */
2802 static int syncJournal(Pager *pPager){
2803   PgHdr *pPg;
2804   int rc = SQLITE_OK;
2805 
2806 
2807   /* Sync the journal before modifying the main database
2808   ** (assuming there is a journal and it needs to be synced.)
2809   */
2810   if( pPager->needSync ){
2811     if( !pPager->tempFile ){
2812       int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
2813       assert( pPager->journalOpen );
2814 
2815       /* assert( !pPager->noSync ); // noSync might be set if synchronous
2816       ** was turned off after the transaction was started.  Ticket #615 */
2817 #ifndef NDEBUG
2818       {
2819         /* Make sure the pPager->nRec counter we are keeping agrees
2820         ** with the nRec computed from the size of the journal file.
2821         */
2822         i64 jSz;
2823         rc = sqlite3OsFileSize(pPager->jfd, &jSz);
2824         if( rc!=0 ) return rc;
2825         assert( pPager->journalOff==jSz );
2826       }
2827 #endif
2828       if( 0==(iDc&SQLITE_IOCAP_SAFE_APPEND) ){
2829         /* Write the nRec value into the journal file header. If in
2830         ** full-synchronous mode, sync the journal first. This ensures that
2831         ** all data has really hit the disk before nRec is updated to mark
2832         ** it as a candidate for rollback.
2833         **
2834         ** This is not required if the persistent media supports the
2835         ** SAFE_APPEND property. Because in this case it is not possible
2836         ** for garbage data to be appended to the file, the nRec field
2837         ** is populated with 0xFFFFFFFF when the journal header is written
2838         ** and never needs to be updated.
2839         */
2840         i64 jrnlOff;
2841         if( pPager->fullSync && 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){
2842           PAGERTRACE2("SYNC journal of %d\n", PAGERID(pPager));
2843           IOTRACE(("JSYNC %p\n", pPager))
2844           rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags);
2845           if( rc!=0 ) return rc;
2846         }
2847 
2848         jrnlOff = pPager->journalHdr + sizeof(aJournalMagic);
2849         IOTRACE(("JHDR %p %lld %d\n", pPager, jrnlOff, 4));
2850         rc = write32bits(pPager->jfd, jrnlOff, pPager->nRec);
2851         if( rc ) return rc;
2852       }
2853       if( 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){
2854         PAGERTRACE2("SYNC journal of %d\n", PAGERID(pPager));
2855         IOTRACE(("JSYNC %p\n", pPager))
2856         rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags|
2857           (pPager->sync_flags==SQLITE_SYNC_FULL?SQLITE_SYNC_DATAONLY:0)
2858         );
2859         if( rc!=0 ) return rc;
2860       }
2861       pPager->journalStarted = 1;
2862     }
2863     pPager->needSync = 0;
2864 
2865     /* Erase the needSync flag from every page.
2866     */
2867     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
2868       pPg->needSync = 0;
2869     }
2870     lruListSetFirstSynced(pPager);
2871   }
2872 
2873 #ifndef NDEBUG
2874   /* If the Pager.needSync flag is clear then the PgHdr.needSync
2875   ** flag must also be clear for all pages.  Verify that this
2876   ** invariant is true.
2877   */
2878   else{
2879     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
2880       assert( pPg->needSync==0 );
2881     }
2882     assert( pPager->lru.pFirstSynced==pPager->lru.pFirst );
2883   }
2884 #endif
2885 
2886   return rc;
2887 }
2888 
2889 /*
2890 ** Merge two lists of pages connected by pDirty and in pgno order.
2891 ** Do not both fixing the pPrevDirty pointers.
2892 */
2893 static PgHdr *merge_pagelist(PgHdr *pA, PgHdr *pB){
2894   PgHdr result, *pTail;
2895   pTail = &result;
2896   while( pA && pB ){
2897     if( pA->pgno<pB->pgno ){
2898       pTail->pDirty = pA;
2899       pTail = pA;
2900       pA = pA->pDirty;
2901     }else{
2902       pTail->pDirty = pB;
2903       pTail = pB;
2904       pB = pB->pDirty;
2905     }
2906   }
2907   if( pA ){
2908     pTail->pDirty = pA;
2909   }else if( pB ){
2910     pTail->pDirty = pB;
2911   }else{
2912     pTail->pDirty = 0;
2913   }
2914   return result.pDirty;
2915 }
2916 
2917 /*
2918 ** Sort the list of pages in accending order by pgno.  Pages are
2919 ** connected by pDirty pointers.  The pPrevDirty pointers are
2920 ** corrupted by this sort.
2921 */
2922 #define N_SORT_BUCKET_ALLOC 25
2923 #define N_SORT_BUCKET       25
2924 #ifdef SQLITE_TEST
2925   int sqlite3_pager_n_sort_bucket = 0;
2926   #undef N_SORT_BUCKET
2927   #define N_SORT_BUCKET \
2928    (sqlite3_pager_n_sort_bucket?sqlite3_pager_n_sort_bucket:N_SORT_BUCKET_ALLOC)
2929 #endif
2930 static PgHdr *sort_pagelist(PgHdr *pIn){
2931   PgHdr *a[N_SORT_BUCKET_ALLOC], *p;
2932   int i;
2933   memset(a, 0, sizeof(a));
2934   while( pIn ){
2935     p = pIn;
2936     pIn = p->pDirty;
2937     p->pDirty = 0;
2938     for(i=0; i<N_SORT_BUCKET-1; i++){
2939       if( a[i]==0 ){
2940         a[i] = p;
2941         break;
2942       }else{
2943         p = merge_pagelist(a[i], p);
2944         a[i] = 0;
2945       }
2946     }
2947     if( i==N_SORT_BUCKET-1 ){
2948       /* Coverage: To get here, there need to be 2^(N_SORT_BUCKET)
2949       ** elements in the input list. This is possible, but impractical.
2950       ** Testing this line is the point of global variable
2951       ** sqlite3_pager_n_sort_bucket.
2952       */
2953       a[i] = merge_pagelist(a[i], p);
2954     }
2955   }
2956   p = a[0];
2957   for(i=1; i<N_SORT_BUCKET; i++){
2958     p = merge_pagelist(p, a[i]);
2959   }
2960   return p;
2961 }
2962 
2963 /*
2964 ** Given a list of pages (connected by the PgHdr.pDirty pointer) write
2965 ** every one of those pages out to the database file and mark them all
2966 ** as clean.
2967 */
2968 static int pager_write_pagelist(PgHdr *pList){
2969   Pager *pPager;
2970   PgHdr *p;
2971   int rc;
2972 
2973   if( pList==0 ) return SQLITE_OK;
2974   pPager = pList->pPager;
2975 
2976   /* At this point there may be either a RESERVED or EXCLUSIVE lock on the
2977   ** database file. If there is already an EXCLUSIVE lock, the following
2978   ** calls to sqlite3OsLock() are no-ops.
2979   **
2980   ** Moving the lock from RESERVED to EXCLUSIVE actually involves going
2981   ** through an intermediate state PENDING.   A PENDING lock prevents new
2982   ** readers from attaching to the database but is unsufficient for us to
2983   ** write.  The idea of a PENDING lock is to prevent new readers from
2984   ** coming in while we wait for existing readers to clear.
2985   **
2986   ** While the pager is in the RESERVED state, the original database file
2987   ** is unchanged and we can rollback without having to playback the
2988   ** journal into the original database file.  Once we transition to
2989   ** EXCLUSIVE, it means the database file has been changed and any rollback
2990   ** will require a journal playback.
2991   */
2992   rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
2993   if( rc!=SQLITE_OK ){
2994     return rc;
2995   }
2996 
2997   pList = sort_pagelist(pList);
2998   for(p=pList; p; p=p->pDirty){
2999     assert( p->dirty );
3000     p->dirty = 0;
3001   }
3002   while( pList ){
3003 
3004     /* If the file has not yet been opened, open it now. */
3005     if( !pPager->fd->pMethods ){
3006       assert(pPager->tempFile);
3007       rc = sqlite3PagerOpentemp(pPager->pVfs, pPager->fd, pPager->zFilename,
3008                                 pPager->vfsFlags);
3009       if( rc ) return rc;
3010     }
3011 
3012     /* If there are dirty pages in the page cache with page numbers greater
3013     ** than Pager.dbSize, this means sqlite3PagerTruncate() was called to
3014     ** make the file smaller (presumably by auto-vacuum code). Do not write
3015     ** any such pages to the file.
3016     */
3017     if( pList->pgno<=pPager->dbSize ){
3018       i64 offset = (pList->pgno-1)*(i64)pPager->pageSize;
3019       char *pData = CODEC2(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
3020       PAGERTRACE4("STORE %d page %d hash(%08x)\n",
3021                    PAGERID(pPager), pList->pgno, pager_pagehash(pList));
3022       IOTRACE(("PGOUT %p %d\n", pPager, pList->pgno));
3023       rc = sqlite3OsWrite(pPager->fd, pData, pPager->pageSize, offset);
3024       PAGER_INCR(sqlite3_pager_writedb_count);
3025       PAGER_INCR(pPager->nWrite);
3026       if( pList->pgno==1 ){
3027         memcpy(&pPager->dbFileVers, &pData[24], sizeof(pPager->dbFileVers));
3028       }
3029     }
3030 #ifndef NDEBUG
3031     else{
3032       PAGERTRACE3("NOSTORE %d page %d\n", PAGERID(pPager), pList->pgno);
3033     }
3034 #endif
3035     if( rc ) return rc;
3036 #ifdef SQLITE_CHECK_PAGES
3037     pList->pageHash = pager_pagehash(pList);
3038 #endif
3039     pList = pList->pDirty;
3040   }
3041   return SQLITE_OK;
3042 }
3043 
3044 /*
3045 ** Collect every dirty page into a dirty list and
3046 ** return a pointer to the head of that list.  All pages are
3047 ** collected even if they are still in use.
3048 */
3049 static PgHdr *pager_get_all_dirty_pages(Pager *pPager){
3050 
3051 #ifndef NDEBUG
3052   /* Verify the sanity of the dirty list when we are running
3053   ** in debugging mode.  This is expensive, so do not
3054   ** do this on a normal build. */
3055   int n1 = 0;
3056   int n2 = 0;
3057   PgHdr *p;
3058   for(p=pPager->pAll; p; p=p->pNextAll){ if( p->dirty ) n1++; }
3059   for(p=pPager->pDirty; p; p=p->pDirty){ n2++; }
3060   assert( n1==n2 );
3061 #endif
3062 
3063   return pPager->pDirty;
3064 }
3065 
3066 /*
3067 ** Return 1 if there is a hot journal on the given pager.
3068 ** A hot journal is one that needs to be played back.
3069 **
3070 ** If the current size of the database file is 0 but a journal file
3071 ** exists, that is probably an old journal left over from a prior
3072 ** database with the same name.  Just delete the journal.
3073 **
3074 ** Return negative if unable to determine the status of the journal.
3075 */
3076 static int hasHotJournal(Pager *pPager){
3077   sqlite3_vfs *pVfs = pPager->pVfs;
3078   int rc;
3079   if( !pPager->useJournal ) return 0;
3080   if( !pPager->fd->pMethods ) return 0;
3081   rc = sqlite3OsAccess(pVfs, pPager->zJournal, SQLITE_ACCESS_EXISTS);
3082   if( rc<=0 ){
3083     return rc;
3084   }
3085   if( sqlite3OsCheckReservedLock(pPager->fd) ){
3086     return 0;
3087   }
3088   if( sqlite3PagerPagecount(pPager)==0 ){
3089     sqlite3OsDelete(pVfs, pPager->zJournal, 0);
3090     return 0;
3091   }else{
3092     return 1;
3093   }
3094 }
3095 
3096 /*
3097 ** Try to find a page in the cache that can be recycled.
3098 **
3099 ** This routine may return SQLITE_IOERR, SQLITE_FULL or SQLITE_OK. It
3100 ** does not set the pPager->errCode variable.
3101 */
3102 static int pager_recycle(Pager *pPager, PgHdr **ppPg){
3103   PgHdr *pPg;
3104   *ppPg = 0;
3105 
3106   /* It is illegal to call this function unless the pager object
3107   ** pointed to by pPager has at least one free page (page with nRef==0).
3108   */
3109   assert(!MEMDB);
3110   assert(pPager->lru.pFirst);
3111 
3112   /* Find a page to recycle.  Try to locate a page that does not
3113   ** require us to do an fsync() on the journal.
3114   */
3115   pPg = pPager->lru.pFirstSynced;
3116 
3117   /* If we could not find a page that does not require an fsync()
3118   ** on the journal file then fsync the journal file.  This is a
3119   ** very slow operation, so we work hard to avoid it.  But sometimes
3120   ** it can't be helped.
3121   */
3122   if( pPg==0 && pPager->lru.pFirst){
3123     int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
3124     int rc = syncJournal(pPager);
3125     if( rc!=0 ){
3126       return rc;
3127     }
3128     if( pPager->fullSync && 0==(iDc&SQLITE_IOCAP_SAFE_APPEND) ){
3129       /* If in full-sync mode, write a new journal header into the
3130       ** journal file. This is done to avoid ever modifying a journal
3131       ** header that is involved in the rollback of pages that have
3132       ** already been written to the database (in case the header is
3133       ** trashed when the nRec field is updated).
3134       */
3135       pPager->nRec = 0;
3136       assert( pPager->journalOff > 0 );
3137       assert( pPager->doNotSync==0 );
3138       rc = writeJournalHdr(pPager);
3139       if( rc!=0 ){
3140         return rc;
3141       }
3142     }
3143     pPg = pPager->lru.pFirst;
3144   }
3145 
3146   assert( pPg->nRef==0 );
3147 
3148   /* Write the page to the database file if it is dirty.
3149   */
3150   if( pPg->dirty ){
3151     int rc;
3152     assert( pPg->needSync==0 );
3153     makeClean(pPg);
3154     pPg->dirty = 1;
3155     pPg->pDirty = 0;
3156     rc = pager_write_pagelist( pPg );
3157     pPg->dirty = 0;
3158     if( rc!=SQLITE_OK ){
3159       return rc;
3160     }
3161   }
3162   assert( pPg->dirty==0 );
3163 
3164   /* If the page we are recycling is marked as alwaysRollback, then
3165   ** set the global alwaysRollback flag, thus disabling the
3166   ** sqlite3PagerDontRollback() optimization for the rest of this transaction.
3167   ** It is necessary to do this because the page marked alwaysRollback
3168   ** might be reloaded at a later time but at that point we won't remember
3169   ** that is was marked alwaysRollback.  This means that all pages must
3170   ** be marked as alwaysRollback from here on out.
3171   */
3172   if( pPg->alwaysRollback ){
3173     IOTRACE(("ALWAYS_ROLLBACK %p\n", pPager))
3174     pPager->alwaysRollback = 1;
3175   }
3176 
3177   /* Unlink the old page from the free list and the hash table
3178   */
3179   unlinkPage(pPg);
3180   assert( pPg->pgno==0 );
3181 
3182   *ppPg = pPg;
3183   return SQLITE_OK;
3184 }
3185 
3186 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
3187 /*
3188 ** This function is called to free superfluous dynamically allocated memory
3189 ** held by the pager system. Memory in use by any SQLite pager allocated
3190 ** by the current thread may be sqlite3_free()ed.
3191 **
3192 ** nReq is the number of bytes of memory required. Once this much has
3193 ** been released, the function returns. The return value is the total number
3194 ** of bytes of memory released.
3195 */
3196 int sqlite3PagerReleaseMemory(int nReq){
3197   int nReleased = 0;          /* Bytes of memory released so far */
3198   sqlite3_mutex *mutex;       /* The MEM2 mutex */
3199   Pager *pPager;              /* For looping over pagers */
3200   BusyHandler *savedBusy;     /* Saved copy of the busy handler */
3201   int rc = SQLITE_OK;
3202 
3203   /* Acquire the memory-management mutex
3204   */
3205   mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MEM2);
3206   sqlite3_mutex_enter(mutex);
3207 
3208   /* Signal all database connections that memory management wants
3209   ** to have access to the pagers.
3210   */
3211   for(pPager=sqlite3PagerList; pPager; pPager=pPager->pNext){
3212      pPager->iInUseMM = 1;
3213   }
3214 
3215   while( rc==SQLITE_OK && (nReq<0 || nReleased<nReq) ){
3216     PgHdr *pPg;
3217     PgHdr *pRecycled;
3218 
3219     /* Try to find a page to recycle that does not require a sync(). If
3220     ** this is not possible, find one that does require a sync().
3221     */
3222     sqlite3_mutex_enter(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
3223     pPg = sqlite3LruPageList.pFirstSynced;
3224     while( pPg && (pPg->needSync || pPg->pPager->iInUseDB) ){
3225       pPg = pPg->gfree.pNext;
3226     }
3227     if( !pPg ){
3228       pPg = sqlite3LruPageList.pFirst;
3229       while( pPg && pPg->pPager->iInUseDB ){
3230         pPg = pPg->gfree.pNext;
3231       }
3232     }
3233     sqlite3_mutex_leave(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
3234 
3235     /* If pPg==0, then the block above has failed to find a page to
3236     ** recycle. In this case return early - no further memory will
3237     ** be released.
3238     */
3239     if( !pPg ) break;
3240 
3241     pPager = pPg->pPager;
3242     assert(!pPg->needSync || pPg==pPager->lru.pFirst);
3243     assert(pPg->needSync || pPg==pPager->lru.pFirstSynced);
3244 
3245     savedBusy = pPager->pBusyHandler;
3246     pPager->pBusyHandler = 0;
3247     rc = pager_recycle(pPager, &pRecycled);
3248     pPager->pBusyHandler = savedBusy;
3249     assert(pRecycled==pPg || rc!=SQLITE_OK);
3250     if( rc==SQLITE_OK ){
3251       /* We've found a page to free. At this point the page has been
3252       ** removed from the page hash-table, free-list and synced-list
3253       ** (pFirstSynced). It is still in the all pages (pAll) list.
3254       ** Remove it from this list before freeing.
3255       **
3256       ** Todo: Check the Pager.pStmt list to make sure this is Ok. It
3257       ** probably is though.
3258       */
3259       PgHdr *pTmp;
3260       assert( pPg );
3261       if( pPg==pPager->pAll ){
3262          pPager->pAll = pPg->pNextAll;
3263       }else{
3264         for( pTmp=pPager->pAll; pTmp->pNextAll!=pPg; pTmp=pTmp->pNextAll ){}
3265         pTmp->pNextAll = pPg->pNextAll;
3266       }
3267       nReleased += (
3268           sizeof(*pPg) + pPager->pageSize
3269           + sizeof(u32) + pPager->nExtra
3270           + MEMDB*sizeof(PgHistory)
3271       );
3272       IOTRACE(("PGFREE %p %d *\n", pPager, pPg->pgno));
3273       PAGER_INCR(sqlite3_pager_pgfree_count);
3274       sqlite3_free(pPg->pData);
3275       sqlite3_free(pPg);
3276       pPager->nPage--;
3277     }else{
3278       /* An error occured whilst writing to the database file or
3279       ** journal in pager_recycle(). The error is not returned to the
3280       ** caller of this function. Instead, set the Pager.errCode variable.
3281       ** The error will be returned to the user (or users, in the case
3282       ** of a shared pager cache) of the pager for which the error occured.
3283       */
3284       assert(
3285           (rc&0xff)==SQLITE_IOERR ||
3286           rc==SQLITE_FULL ||
3287           rc==SQLITE_BUSY
3288       );
3289       assert( pPager->state>=PAGER_RESERVED );
3290       pager_error(pPager, rc);
3291     }
3292   }
3293 
3294   /* Clear the memory management flags and release the mutex
3295   */
3296   for(pPager=sqlite3PagerList; pPager; pPager=pPager->pNext){
3297      pPager->iInUseMM = 0;
3298   }
3299   sqlite3_mutex_leave(mutex);
3300 
3301   /* Return the number of bytes released
3302   */
3303   return nReleased;
3304 }
3305 #endif /* SQLITE_ENABLE_MEMORY_MANAGEMENT */
3306 
3307 /*
3308 ** Read the content of page pPg out of the database file.
3309 */
3310 static int readDbPage(Pager *pPager, PgHdr *pPg, Pgno pgno){
3311   int rc;
3312   i64 offset;
3313   assert( MEMDB==0 );
3314   assert(pPager->fd->pMethods||pPager->tempFile);
3315   if( !pPager->fd->pMethods ){
3316     return SQLITE_IOERR_SHORT_READ;
3317   }
3318   offset = (pgno-1)*(i64)pPager->pageSize;
3319   rc = sqlite3OsRead(pPager->fd, PGHDR_TO_DATA(pPg), pPager->pageSize, offset);
3320   PAGER_INCR(sqlite3_pager_readdb_count);
3321   PAGER_INCR(pPager->nRead);
3322   IOTRACE(("PGIN %p %d\n", pPager, pgno));
3323   if( pgno==1 ){
3324     memcpy(&pPager->dbFileVers, &((u8*)PGHDR_TO_DATA(pPg))[24],
3325                                               sizeof(pPager->dbFileVers));
3326   }
3327   CODEC1(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
3328   PAGERTRACE4("FETCH %d page %d hash(%08x)\n",
3329                PAGERID(pPager), pPg->pgno, pager_pagehash(pPg));
3330   return rc;
3331 }
3332 
3333 
3334 /*
3335 ** This function is called to obtain the shared lock required before
3336 ** data may be read from the pager cache. If the shared lock has already
3337 ** been obtained, this function is a no-op.
3338 **
3339 ** Immediately after obtaining the shared lock (if required), this function
3340 ** checks for a hot-journal file. If one is found, an emergency rollback
3341 ** is performed immediately.
3342 */
3343 static int pagerSharedLock(Pager *pPager){
3344   int rc = SQLITE_OK;
3345   int isHot = 0;
3346 
3347   /* If this database is opened for exclusive access, has no outstanding
3348   ** page references and is in an error-state, now is the chance to clear
3349   ** the error. Discard the contents of the pager-cache and treat any
3350   ** open journal file as a hot-journal.
3351   */
3352   if( !MEMDB && pPager->exclusiveMode && pPager->nRef==0 && pPager->errCode ){
3353     if( pPager->journalOpen ){
3354       isHot = 1;
3355     }
3356     pager_reset(pPager);
3357     pPager->errCode = SQLITE_OK;
3358   }
3359 
3360   /* If the pager is still in an error state, do not proceed. The error
3361   ** state will be cleared at some point in the future when all page
3362   ** references are dropped and the cache can be discarded.
3363   */
3364   if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
3365     return pPager->errCode;
3366   }
3367 
3368   if( pPager->state==PAGER_UNLOCK || isHot ){
3369     sqlite3_vfs *pVfs = pPager->pVfs;
3370     if( !MEMDB ){
3371       assert( pPager->nRef==0 );
3372       if( !pPager->noReadlock ){
3373         rc = pager_wait_on_lock(pPager, SHARED_LOCK);
3374         if( rc!=SQLITE_OK ){
3375           return pager_error(pPager, rc);
3376         }
3377         assert( pPager->state>=SHARED_LOCK );
3378       }
3379 
3380       /* If a journal file exists, and there is no RESERVED lock on the
3381       ** database file, then it either needs to be played back or deleted.
3382       */
3383       rc = hasHotJournal(pPager);
3384       if( rc<0 ){
3385         return pager_error(pPager, SQLITE_IOERR_NOMEM);
3386       }
3387       if( rc==1 || isHot ){
3388         /* Get an EXCLUSIVE lock on the database file. At this point it is
3389         ** important that a RESERVED lock is not obtained on the way to the
3390         ** EXCLUSIVE lock. If it were, another process might open the
3391         ** database file, detect the RESERVED lock, and conclude that the
3392         ** database is safe to read while this process is still rolling it
3393         ** back.
3394         **
3395         ** Because the intermediate RESERVED lock is not requested, the
3396         ** second process will get to this point in the code and fail to
3397         ** obtain its own EXCLUSIVE lock on the database file.
3398         */
3399         if( pPager->state<EXCLUSIVE_LOCK ){
3400           rc = sqlite3OsLock(pPager->fd, EXCLUSIVE_LOCK);
3401           if( rc!=SQLITE_OK ){
3402             pager_unlock(pPager);
3403             return pager_error(pPager, rc);
3404           }
3405           pPager->state = PAGER_EXCLUSIVE;
3406         }
3407 
3408         /* Open the journal for reading only.  Return SQLITE_BUSY if
3409         ** we are unable to open the journal file.
3410         **
3411         ** The journal file does not need to be locked itself.  The
3412         ** journal file is never open unless the main database file holds
3413         ** a write lock, so there is never any chance of two or more
3414         ** processes opening the journal at the same time.
3415         **
3416         ** Open the journal for read/write access. This is because in
3417         ** exclusive-access mode the file descriptor will be kept open and
3418         ** possibly used for a transaction later on. On some systems, the
3419         ** OsTruncate() call used in exclusive-access mode also requires
3420         ** a read/write file handle.
3421         */
3422         if( !isHot ){
3423           int res = sqlite3OsAccess(pVfs,pPager->zJournal,SQLITE_ACCESS_EXISTS);
3424           if( res==1 ){
3425             int fout = 0;
3426             int f = SQLITE_OPEN_READWRITE|SQLITE_OPEN_MAIN_JOURNAL;
3427             assert( !pPager->tempFile );
3428             rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, f, &fout);
3429             assert( rc!=SQLITE_OK || pPager->jfd->pMethods );
3430             if( fout&SQLITE_OPEN_READONLY ){
3431               rc = SQLITE_BUSY;
3432               sqlite3OsClose(pPager->jfd);
3433             }
3434           }else{
3435             rc = (res==0?SQLITE_BUSY:SQLITE_IOERR_NOMEM);
3436           }
3437         }
3438         if( rc!=SQLITE_OK ){
3439           pager_unlock(pPager);
3440           switch( rc ){
3441             case SQLITE_NOMEM:
3442             case SQLITE_IOERR_UNLOCK:
3443             case SQLITE_IOERR_NOMEM:
3444               return rc;
3445             default:
3446               return SQLITE_BUSY;
3447           }
3448         }
3449         pPager->journalOpen = 1;
3450         pPager->journalStarted = 0;
3451         pPager->journalOff = 0;
3452         pPager->setMaster = 0;
3453         pPager->journalHdr = 0;
3454 
3455         /* Playback and delete the journal.  Drop the database write
3456         ** lock and reacquire the read lock.
3457         */
3458         rc = pager_playback(pPager, 1);
3459         if( rc!=SQLITE_OK ){
3460           return pager_error(pPager, rc);
3461         }
3462         assert(pPager->state==PAGER_SHARED ||
3463             (pPager->exclusiveMode && pPager->state>PAGER_SHARED)
3464         );
3465       }
3466 
3467       if( pPager->pAll ){
3468         /* The shared-lock has just been acquired on the database file
3469         ** and there are already pages in the cache (from a previous
3470         ** read or write transaction).  Check to see if the database
3471         ** has been modified.  If the database has changed, flush the
3472         ** cache.
3473         **
3474         ** Database changes is detected by looking at 15 bytes beginning
3475         ** at offset 24 into the file.  The first 4 of these 16 bytes are
3476         ** a 32-bit counter that is incremented with each change.  The
3477         ** other bytes change randomly with each file change when
3478         ** a codec is in use.
3479         **
3480         ** There is a vanishingly small chance that a change will not be
3481         ** detected.  The chance of an undetected change is so small that
3482         ** it can be neglected.
3483         */
3484         char dbFileVers[sizeof(pPager->dbFileVers)];
3485         sqlite3PagerPagecount(pPager);
3486 
3487         if( pPager->errCode ){
3488           return pPager->errCode;
3489         }
3490 
3491         if( pPager->dbSize>0 ){
3492           IOTRACE(("CKVERS %p %d\n", pPager, sizeof(dbFileVers)));
3493           rc = sqlite3OsRead(pPager->fd, &dbFileVers, sizeof(dbFileVers), 24);
3494           if( rc!=SQLITE_OK ){
3495             return rc;
3496           }
3497         }else{
3498           memset(dbFileVers, 0, sizeof(dbFileVers));
3499         }
3500 
3501         if( memcmp(pPager->dbFileVers, dbFileVers, sizeof(dbFileVers))!=0 ){
3502           pager_reset(pPager);
3503         }
3504       }
3505     }
3506     assert( pPager->exclusiveMode || pPager->state<=PAGER_SHARED );
3507     if( pPager->state==PAGER_UNLOCK ){
3508       pPager->state = PAGER_SHARED;
3509     }
3510   }
3511 
3512   return rc;
3513 }
3514 
3515 /*
3516 ** Allocate a PgHdr object.   Either create a new one or reuse
3517 ** an existing one that is not otherwise in use.
3518 **
3519 ** A new PgHdr structure is created if any of the following are
3520 ** true:
3521 **
3522 **     (1)  We have not exceeded our maximum allocated cache size
3523 **          as set by the "PRAGMA cache_size" command.
3524 **
3525 **     (2)  There are no unused PgHdr objects available at this time.
3526 **
3527 **     (3)  This is an in-memory database.
3528 **
3529 **     (4)  There are no PgHdr objects that do not require a journal
3530 **          file sync and a sync of the journal file is currently
3531 **          prohibited.
3532 **
3533 ** Otherwise, reuse an existing PgHdr.  In other words, reuse an
3534 ** existing PgHdr if all of the following are true:
3535 **
3536 **     (1)  We have reached or exceeded the maximum cache size
3537 **          allowed by "PRAGMA cache_size".
3538 **
3539 **     (2)  There is a PgHdr available with PgHdr->nRef==0
3540 **
3541 **     (3)  We are not in an in-memory database
3542 **
3543 **     (4)  Either there is an available PgHdr that does not need
3544 **          to be synced to disk or else disk syncing is currently
3545 **          allowed.
3546 */
3547 static int pagerAllocatePage(Pager *pPager, PgHdr **ppPg){
3548   int rc = SQLITE_OK;
3549   PgHdr *pPg;
3550   int nByteHdr;
3551 
3552   /* Create a new PgHdr if any of the four conditions defined
3553   ** above are met: */
3554   if( pPager->nPage<pPager->mxPage
3555    || pPager->lru.pFirst==0
3556    || MEMDB
3557    || (pPager->lru.pFirstSynced==0 && pPager->doNotSync)
3558   ){
3559     void *pData;
3560     if( pPager->nPage>=pPager->nHash ){
3561       pager_resize_hash_table(pPager,
3562          pPager->nHash<256 ? 256 : pPager->nHash*2);
3563       if( pPager->nHash==0 ){
3564         rc = SQLITE_NOMEM;
3565         goto pager_allocate_out;
3566       }
3567     }
3568     pagerLeave(pPager);
3569     nByteHdr = sizeof(*pPg) + sizeof(u32) + pPager->nExtra
3570               + MEMDB*sizeof(PgHistory);
3571     pPg = sqlite3_malloc( nByteHdr );
3572     if( pPg ){
3573       pData = sqlite3_malloc( pPager->pageSize );
3574       if( pData==0 ){
3575         sqlite3_free(pPg);
3576         pPg = 0;
3577       }
3578     }
3579     pagerEnter(pPager);
3580     if( pPg==0 ){
3581       rc = SQLITE_NOMEM;
3582       goto pager_allocate_out;
3583     }
3584     memset(pPg, 0, nByteHdr);
3585     pPg->pData = pData;
3586     pPg->pPager = pPager;
3587     pPg->pNextAll = pPager->pAll;
3588     pPager->pAll = pPg;
3589     pPager->nPage++;
3590   }else{
3591     /* Recycle an existing page with a zero ref-count. */
3592     rc = pager_recycle(pPager, &pPg);
3593     if( rc==SQLITE_BUSY ){
3594       rc = SQLITE_IOERR_BLOCKED;
3595     }
3596     if( rc!=SQLITE_OK ){
3597       goto pager_allocate_out;
3598     }
3599     assert( pPager->state>=SHARED_LOCK );
3600     assert(pPg);
3601   }
3602   *ppPg = pPg;
3603 
3604 pager_allocate_out:
3605   return rc;
3606 }
3607 
3608 /*
3609 ** Make sure we have the content for a page.  If the page was
3610 ** previously acquired with noContent==1, then the content was
3611 ** just initialized to zeros instead of being read from disk.
3612 ** But now we need the real data off of disk.  So make sure we
3613 ** have it.  Read it in if we do not have it already.
3614 */
3615 static int pager_get_content(PgHdr *pPg){
3616   if( pPg->needRead ){
3617     int rc = readDbPage(pPg->pPager, pPg, pPg->pgno);
3618     if( rc==SQLITE_OK ){
3619       pPg->needRead = 0;
3620     }else{
3621       return rc;
3622     }
3623   }
3624   return SQLITE_OK;
3625 }
3626 
3627 /*
3628 ** Acquire a page.
3629 **
3630 ** A read lock on the disk file is obtained when the first page is acquired.
3631 ** This read lock is dropped when the last page is released.
3632 **
3633 ** This routine works for any page number greater than 0.  If the database
3634 ** file is smaller than the requested page, then no actual disk
3635 ** read occurs and the memory image of the page is initialized to
3636 ** all zeros.  The extra data appended to a page is always initialized
3637 ** to zeros the first time a page is loaded into memory.
3638 **
3639 ** The acquisition might fail for several reasons.  In all cases,
3640 ** an appropriate error code is returned and *ppPage is set to NULL.
3641 **
3642 ** See also sqlite3PagerLookup().  Both this routine and Lookup() attempt
3643 ** to find a page in the in-memory cache first.  If the page is not already
3644 ** in memory, this routine goes to disk to read it in whereas Lookup()
3645 ** just returns 0.  This routine acquires a read-lock the first time it
3646 ** has to go to disk, and could also playback an old journal if necessary.
3647 ** Since Lookup() never goes to disk, it never has to deal with locks
3648 ** or journal files.
3649 **
3650 ** If noContent is false, the page contents are actually read from disk.
3651 ** If noContent is true, it means that we do not care about the contents
3652 ** of the page at this time, so do not do a disk read.  Just fill in the
3653 ** page content with zeros.  But mark the fact that we have not read the
3654 ** content by setting the PgHdr.needRead flag.  Later on, if
3655 ** sqlite3PagerWrite() is called on this page or if this routine is
3656 ** called again with noContent==0, that means that the content is needed
3657 ** and the disk read should occur at that point.
3658 */
3659 static int pagerAcquire(
3660   Pager *pPager,      /* The pager open on the database file */
3661   Pgno pgno,          /* Page number to fetch */
3662   DbPage **ppPage,    /* Write a pointer to the page here */
3663   int noContent       /* Do not bother reading content from disk if true */
3664 ){
3665   PgHdr *pPg;
3666   int rc;
3667 
3668   assert( pPager->state==PAGER_UNLOCK || pPager->nRef>0 || pgno==1 );
3669 
3670   /* The maximum page number is 2^31. Return SQLITE_CORRUPT if a page
3671   ** number greater than this, or zero, is requested.
3672   */
3673   if( pgno>PAGER_MAX_PGNO || pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){
3674     return SQLITE_CORRUPT_BKPT;
3675   }
3676 
3677   /* Make sure we have not hit any critical errors.
3678   */
3679   assert( pPager!=0 );
3680   *ppPage = 0;
3681 
3682   /* If this is the first page accessed, then get a SHARED lock
3683   ** on the database file. pagerSharedLock() is a no-op if
3684   ** a database lock is already held.
3685   */
3686   rc = pagerSharedLock(pPager);
3687   if( rc!=SQLITE_OK ){
3688     return rc;
3689   }
3690   assert( pPager->state!=PAGER_UNLOCK );
3691 
3692   pPg = pager_lookup(pPager, pgno);
3693   if( pPg==0 ){
3694     /* The requested page is not in the page cache. */
3695     int nMax;
3696     int h;
3697     PAGER_INCR(pPager->nMiss);
3698     rc = pagerAllocatePage(pPager, &pPg);
3699     if( rc!=SQLITE_OK ){
3700       return rc;
3701     }
3702 
3703     pPg->pgno = pgno;
3704     assert( !MEMDB || pgno>pPager->stmtSize );
3705     pPg->inJournal = sqlite3BitvecTest(pPager->pInJournal, pgno);
3706     pPg->needSync = 0;
3707 
3708     makeClean(pPg);
3709     pPg->nRef = 1;
3710 
3711     pPager->nRef++;
3712     if( pPager->nExtra>0 ){
3713       memset(PGHDR_TO_EXTRA(pPg, pPager), 0, pPager->nExtra);
3714     }
3715     nMax = sqlite3PagerPagecount(pPager);
3716     if( pPager->errCode ){
3717       rc = pPager->errCode;
3718       sqlite3PagerUnref(pPg);
3719       return rc;
3720     }
3721 
3722     /* Populate the page with data, either by reading from the database
3723     ** file, or by setting the entire page to zero.
3724     */
3725     if( nMax<(int)pgno || MEMDB || (noContent && !pPager->alwaysRollback) ){
3726       if( pgno>pPager->mxPgno ){
3727         sqlite3PagerUnref(pPg);
3728         return SQLITE_FULL;
3729       }
3730       memset(PGHDR_TO_DATA(pPg), 0, pPager->pageSize);
3731       pPg->needRead = noContent && !pPager->alwaysRollback;
3732       IOTRACE(("ZERO %p %d\n", pPager, pgno));
3733     }else{
3734       rc = readDbPage(pPager, pPg, pgno);
3735       if( rc!=SQLITE_OK && rc!=SQLITE_IOERR_SHORT_READ ){
3736         pPg->pgno = 0;
3737         sqlite3PagerUnref(pPg);
3738         return rc;
3739       }
3740       pPg->needRead = 0;
3741     }
3742 
3743     /* Link the page into the page hash table */
3744     h = pgno & (pPager->nHash-1);
3745     assert( pgno!=0 );
3746     pPg->pNextHash = pPager->aHash[h];
3747     pPager->aHash[h] = pPg;
3748     if( pPg->pNextHash ){
3749       assert( pPg->pNextHash->pPrevHash==0 );
3750       pPg->pNextHash->pPrevHash = pPg;
3751     }
3752 
3753 #ifdef SQLITE_CHECK_PAGES
3754     pPg->pageHash = pager_pagehash(pPg);
3755 #endif
3756   }else{
3757     /* The requested page is in the page cache. */
3758     assert(pPager->nRef>0 || pgno==1);
3759     PAGER_INCR(pPager->nHit);
3760     if( !noContent ){
3761       rc = pager_get_content(pPg);
3762       if( rc ){
3763         return rc;
3764       }
3765     }
3766     page_ref(pPg);
3767   }
3768   *ppPage = pPg;
3769   return SQLITE_OK;
3770 }
3771 int sqlite3PagerAcquire(
3772   Pager *pPager,      /* The pager open on the database file */
3773   Pgno pgno,          /* Page number to fetch */
3774   DbPage **ppPage,    /* Write a pointer to the page here */
3775   int noContent       /* Do not bother reading content from disk if true */
3776 ){
3777   int rc;
3778   pagerEnter(pPager);
3779   rc = pagerAcquire(pPager, pgno, ppPage, noContent);
3780   pagerLeave(pPager);
3781   return rc;
3782 }
3783 
3784 
3785 /*
3786 ** Acquire a page if it is already in the in-memory cache.  Do
3787 ** not read the page from disk.  Return a pointer to the page,
3788 ** or 0 if the page is not in cache.
3789 **
3790 ** See also sqlite3PagerGet().  The difference between this routine
3791 ** and sqlite3PagerGet() is that _get() will go to the disk and read
3792 ** in the page if the page is not already in cache.  This routine
3793 ** returns NULL if the page is not in cache or if a disk I/O error
3794 ** has ever happened.
3795 */
3796 DbPage *sqlite3PagerLookup(Pager *pPager, Pgno pgno){
3797   PgHdr *pPg = 0;
3798 
3799   assert( pPager!=0 );
3800   assert( pgno!=0 );
3801 
3802   pagerEnter(pPager);
3803   if( pPager->state==PAGER_UNLOCK ){
3804     assert( !pPager->pAll || pPager->exclusiveMode );
3805   }else if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
3806     /* Do nothing */
3807   }else if( (pPg = pager_lookup(pPager, pgno))!=0 ){
3808     page_ref(pPg);
3809   }
3810   pagerLeave(pPager);
3811   return pPg;
3812 }
3813 
3814 /*
3815 ** Release a page.
3816 **
3817 ** If the number of references to the page drop to zero, then the
3818 ** page is added to the LRU list.  When all references to all pages
3819 ** are released, a rollback occurs and the lock on the database is
3820 ** removed.
3821 */
3822 int sqlite3PagerUnref(DbPage *pPg){
3823   Pager *pPager = pPg->pPager;
3824 
3825   /* Decrement the reference count for this page
3826   */
3827   assert( pPg->nRef>0 );
3828   pagerEnter(pPg->pPager);
3829   pPg->nRef--;
3830 
3831   CHECK_PAGE(pPg);
3832 
3833   /* When the number of references to a page reach 0, call the
3834   ** destructor and add the page to the freelist.
3835   */
3836   if( pPg->nRef==0 ){
3837 
3838     lruListAdd(pPg);
3839     if( pPager->xDestructor ){
3840       pPager->xDestructor(pPg, pPager->pageSize);
3841     }
3842 
3843     /* When all pages reach the freelist, drop the read lock from
3844     ** the database file.
3845     */
3846     pPager->nRef--;
3847     assert( pPager->nRef>=0 );
3848     if( pPager->nRef==0 && (!pPager->exclusiveMode || pPager->journalOff>0) ){
3849       pagerUnlockAndRollback(pPager);
3850     }
3851   }
3852   pagerLeave(pPager);
3853   return SQLITE_OK;
3854 }
3855 
3856 /*
3857 ** Create a journal file for pPager.  There should already be a RESERVED
3858 ** or EXCLUSIVE lock on the database file when this routine is called.
3859 **
3860 ** Return SQLITE_OK if everything.  Return an error code and release the
3861 ** write lock if anything goes wrong.
3862 */
3863 static int pager_open_journal(Pager *pPager){
3864   sqlite3_vfs *pVfs = pPager->pVfs;
3865   int flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_EXCLUSIVE|SQLITE_OPEN_CREATE);
3866 
3867   int rc;
3868   assert( !MEMDB );
3869   assert( pPager->state>=PAGER_RESERVED );
3870   assert( pPager->journalOpen==0 );
3871   assert( pPager->useJournal );
3872   assert( pPager->pInJournal==0 );
3873   sqlite3PagerPagecount(pPager);
3874   pagerLeave(pPager);
3875   pPager->pInJournal = sqlite3BitvecCreate(pPager->dbSize);
3876   pagerEnter(pPager);
3877   if( pPager->pInJournal==0 ){
3878     rc = SQLITE_NOMEM;
3879     goto failed_to_open_journal;
3880   }
3881 
3882   if( pPager->tempFile ){
3883     flags |= (SQLITE_OPEN_DELETEONCLOSE|SQLITE_OPEN_TEMP_JOURNAL);
3884   }else{
3885     flags |= (SQLITE_OPEN_MAIN_JOURNAL);
3886   }
3887 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
3888   rc = sqlite3JournalOpen(
3889       pVfs, pPager->zJournal, pPager->jfd, flags, jrnlBufferSize(pPager)
3890   );
3891 #else
3892   rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, flags, 0);
3893 #endif
3894   assert( rc!=SQLITE_OK || pPager->jfd->pMethods );
3895   pPager->journalOff = 0;
3896   pPager->setMaster = 0;
3897   pPager->journalHdr = 0;
3898   if( rc!=SQLITE_OK ){
3899     if( rc==SQLITE_NOMEM ){
3900       sqlite3OsDelete(pVfs, pPager->zJournal, 0);
3901     }
3902     goto failed_to_open_journal;
3903   }
3904   pPager->journalOpen = 1;
3905   pPager->journalStarted = 0;
3906   pPager->needSync = 0;
3907   pPager->alwaysRollback = 0;
3908   pPager->nRec = 0;
3909   if( pPager->errCode ){
3910     rc = pPager->errCode;
3911     goto failed_to_open_journal;
3912   }
3913   pPager->origDbSize = pPager->dbSize;
3914 
3915   rc = writeJournalHdr(pPager);
3916 
3917   if( pPager->stmtAutoopen && rc==SQLITE_OK ){
3918     rc = sqlite3PagerStmtBegin(pPager);
3919   }
3920   if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM && rc!=SQLITE_IOERR_NOMEM ){
3921     rc = pager_end_transaction(pPager);
3922     if( rc==SQLITE_OK ){
3923       rc = SQLITE_FULL;
3924     }
3925   }
3926   return rc;
3927 
3928 failed_to_open_journal:
3929   sqlite3BitvecDestroy(pPager->pInJournal);
3930   pPager->pInJournal = 0;
3931   return rc;
3932 }
3933 
3934 /*
3935 ** Acquire a write-lock on the database.  The lock is removed when
3936 ** the any of the following happen:
3937 **
3938 **   *  sqlite3PagerCommitPhaseTwo() is called.
3939 **   *  sqlite3PagerRollback() is called.
3940 **   *  sqlite3PagerClose() is called.
3941 **   *  sqlite3PagerUnref() is called to on every outstanding page.
3942 **
3943 ** The first parameter to this routine is a pointer to any open page of the
3944 ** database file.  Nothing changes about the page - it is used merely to
3945 ** acquire a pointer to the Pager structure and as proof that there is
3946 ** already a read-lock on the database.
3947 **
3948 ** The second parameter indicates how much space in bytes to reserve for a
3949 ** master journal file-name at the start of the journal when it is created.
3950 **
3951 ** A journal file is opened if this is not a temporary file.  For temporary
3952 ** files, the opening of the journal file is deferred until there is an
3953 ** actual need to write to the journal.
3954 **
3955 ** If the database is already reserved for writing, this routine is a no-op.
3956 **
3957 ** If exFlag is true, go ahead and get an EXCLUSIVE lock on the file
3958 ** immediately instead of waiting until we try to flush the cache.  The
3959 ** exFlag is ignored if a transaction is already active.
3960 */
3961 int sqlite3PagerBegin(DbPage *pPg, int exFlag){
3962   Pager *pPager = pPg->pPager;
3963   int rc = SQLITE_OK;
3964   pagerEnter(pPager);
3965   assert( pPg->nRef>0 );
3966   assert( pPager->state!=PAGER_UNLOCK );
3967   if( pPager->state==PAGER_SHARED ){
3968     assert( pPager->pInJournal==0 );
3969     if( MEMDB ){
3970       pPager->state = PAGER_EXCLUSIVE;
3971       pPager->origDbSize = pPager->dbSize;
3972     }else{
3973       rc = sqlite3OsLock(pPager->fd, RESERVED_LOCK);
3974       if( rc==SQLITE_OK ){
3975         pPager->state = PAGER_RESERVED;
3976         if( exFlag ){
3977           rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
3978         }
3979       }
3980       if( rc!=SQLITE_OK ){
3981         pagerLeave(pPager);
3982         return rc;
3983       }
3984       pPager->dirtyCache = 0;
3985       PAGERTRACE2("TRANSACTION %d\n", PAGERID(pPager));
3986       if( pPager->useJournal && !pPager->tempFile ){
3987         rc = pager_open_journal(pPager);
3988       }
3989     }
3990   }else if( pPager->journalOpen && pPager->journalOff==0 ){
3991     /* This happens when the pager was in exclusive-access mode last
3992     ** time a (read or write) transaction was successfully concluded
3993     ** by this connection. Instead of deleting the journal file it was
3994     ** kept open and truncated to 0 bytes.
3995     */
3996     assert( pPager->nRec==0 );
3997     assert( pPager->origDbSize==0 );
3998     assert( pPager->pInJournal==0 );
3999     sqlite3PagerPagecount(pPager);
4000     pagerLeave(pPager);
4001     pPager->pInJournal = sqlite3BitvecCreate( pPager->dbSize );
4002     pagerEnter(pPager);
4003     if( !pPager->pInJournal ){
4004       rc = SQLITE_NOMEM;
4005     }else{
4006       pPager->origDbSize = pPager->dbSize;
4007       rc = writeJournalHdr(pPager);
4008     }
4009   }
4010   assert( !pPager->journalOpen || pPager->journalOff>0 || rc!=SQLITE_OK );
4011   pagerLeave(pPager);
4012   return rc;
4013 }
4014 
4015 /*
4016 ** Make a page dirty.  Set its dirty flag and add it to the dirty
4017 ** page list.
4018 */
4019 static void makeDirty(PgHdr *pPg){
4020   if( pPg->dirty==0 ){
4021     Pager *pPager = pPg->pPager;
4022     pPg->dirty = 1;
4023     pPg->pDirty = pPager->pDirty;
4024     if( pPager->pDirty ){
4025       pPager->pDirty->pPrevDirty = pPg;
4026     }
4027     pPg->pPrevDirty = 0;
4028     pPager->pDirty = pPg;
4029   }
4030 }
4031 
4032 /*
4033 ** Make a page clean.  Clear its dirty bit and remove it from the
4034 ** dirty page list.
4035 */
4036 static void makeClean(PgHdr *pPg){
4037   if( pPg->dirty ){
4038     pPg->dirty = 0;
4039     if( pPg->pDirty ){
4040       assert( pPg->pDirty->pPrevDirty==pPg );
4041       pPg->pDirty->pPrevDirty = pPg->pPrevDirty;
4042     }
4043     if( pPg->pPrevDirty ){
4044       assert( pPg->pPrevDirty->pDirty==pPg );
4045       pPg->pPrevDirty->pDirty = pPg->pDirty;
4046     }else{
4047       assert( pPg->pPager->pDirty==pPg );
4048       pPg->pPager->pDirty = pPg->pDirty;
4049     }
4050   }
4051 }
4052 
4053 
4054 /*
4055 ** Mark a data page as writeable.  The page is written into the journal
4056 ** if it is not there already.  This routine must be called before making
4057 ** changes to a page.
4058 **
4059 ** The first time this routine is called, the pager creates a new
4060 ** journal and acquires a RESERVED lock on the database.  If the RESERVED
4061 ** lock could not be acquired, this routine returns SQLITE_BUSY.  The
4062 ** calling routine must check for that return value and be careful not to
4063 ** change any page data until this routine returns SQLITE_OK.
4064 **
4065 ** If the journal file could not be written because the disk is full,
4066 ** then this routine returns SQLITE_FULL and does an immediate rollback.
4067 ** All subsequent write attempts also return SQLITE_FULL until there
4068 ** is a call to sqlite3PagerCommit() or sqlite3PagerRollback() to
4069 ** reset.
4070 */
static int pager_write(PgHdr *pPg){
  void *pData = PGHDR_TO_DATA(pPg);
  Pager *pPager = pPg->pPager;
  int rc = SQLITE_OK;

  /* Check for errors.  A pager that already carries an error code
  ** returns that code; a read-only pager returns SQLITE_PERM.
  */
  if( pPager->errCode ){
    return pPager->errCode;
  }
  if( pPager->readOnly ){
    return SQLITE_PERM;
  }

  assert( !pPager->setMaster );

  CHECK_PAGE(pPg);

  /* If this page was previously acquired with noContent==1, that means
  ** we didn't really read in the content of the page.  This can happen
  ** (for example) when the page is being moved to the freelist.  But
  ** now we are (perhaps) moving the page off of the freelist for
  ** reuse and we need to know its original content so that content
  ** can be stored in the rollback journal.  So do the read at this
  ** time.
  */
  rc = pager_get_content(pPg);
  if( rc ){
    return rc;
  }

  /* Mark the page as dirty.  If the page has already been written
  ** to the journal (and, when a statement is active, to the statement
  ** journal as well) then only the dirtyCache flag needs to be set.
  */
  makeDirty(pPg);
  if( pPg->inJournal && (pageInStatement(pPg) || pPager->stmtInUse==0) ){
    pPager->dirtyCache = 1;
  }else{

    /* If we get this far, it means that the page needs to be
    ** written to the transaction journal or the checkpoint journal
    ** or both.
    **
    ** First check to see that the transaction journal exists and
    ** create it if it does not.
    */
    assert( pPager->state!=PAGER_UNLOCK );
    rc = sqlite3PagerBegin(pPg, 0);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    assert( pPager->state>=PAGER_RESERVED );
    if( !pPager->journalOpen && pPager->useJournal ){
      rc = pager_open_journal(pPager);
      if( rc!=SQLITE_OK ) return rc;
    }
    assert( pPager->journalOpen || !pPager->useJournal );
    pPager->dirtyCache = 1;

    /* The transaction journal now exists and we have a RESERVED or an
    ** EXCLUSIVE lock on the main database file.  Write the current page to
    ** the transaction journal if it is not there already.
    */
    if( !pPg->inJournal && (pPager->useJournal || MEMDB) ){
      if( (int)pPg->pgno <= pPager->origDbSize ){
        if( MEMDB ){
          /* In-memory database: instead of writing to a journal file,
          ** keep a heap copy of the current page image in pHist->pOrig.
          */
          PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
          PAGERTRACE3("JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
          assert( pHist->pOrig==0 );
          pHist->pOrig = sqlite3_malloc( pPager->pageSize );
          if( !pHist->pOrig ){
            return SQLITE_NOMEM;
          }
          memcpy(pHist->pOrig, PGHDR_TO_DATA(pPg), pPager->pageSize);
        }else{
          u32 cksum;
          char *pData2;

          /* We should never write to the journal file the page that
          ** contains the database locks.  The following assert verifies
          ** that we do not. */
          assert( pPg->pgno!=PAGER_MJ_PGNO(pPager) );
          pData2 = CODEC2(pPager, pData, pPg->pgno, 7);
          cksum = pager_cksum(pPager, (u8*)pData2);

          /* One journal record is written at journalOff as three pieces:
          ** a 4-byte page number, pageSize bytes of page content, and a
          ** 4-byte checksum.  Each later piece is attempted only if the
          ** previous one succeeded.  Note that journalOff is advanced
          ** past the page content whenever the page-number write
          ** succeeds, whether or not the content write itself succeeds.
          */
          rc = write32bits(pPager->jfd, pPager->journalOff, pPg->pgno);
          if( rc==SQLITE_OK ){
            rc = sqlite3OsWrite(pPager->jfd, pData2, pPager->pageSize,
                                pPager->journalOff + 4);
            pPager->journalOff += pPager->pageSize+4;
          }
          if( rc==SQLITE_OK ){
            rc = write32bits(pPager->jfd, pPager->journalOff, cksum);
            pPager->journalOff += 4;
          }
          IOTRACE(("JOUT %p %d %lld %d\n", pPager, pPg->pgno,
                   pPager->journalOff, pPager->pageSize));
          PAGER_INCR(sqlite3_pager_writej_count);
          PAGERTRACE5("JOURNAL %d page %d needSync=%d hash(%08x)\n",
               PAGERID(pPager), pPg->pgno, pPg->needSync, pager_pagehash(pPg));

          /* An error has occurred writing to the journal file. The
          ** transaction will be rolled back by the layer above.
          */
          if( rc!=SQLITE_OK ){
            return rc;
          }

          /* The record is in the journal file: count it, remember the
          ** page number in the in-journal bitvec, and flag the page as
          ** needing a journal sync unless syncing is disabled.
          */
          pPager->nRec++;
          assert( pPager->pInJournal!=0 );
          sqlite3BitvecSet(pPager->pInJournal, pPg->pgno);
          pPg->needSync = !pPager->noSync;
          if( pPager->stmtInUse ){
            sqlite3BitvecSet(pPager->pInStmt, pPg->pgno);
          }
        }
      }else{
        /* The page number is beyond origDbSize, so nothing is written
        ** to the journal for it.  Only the needSync flag is computed.
        */
        pPg->needSync = !pPager->journalStarted && !pPager->noSync;
        PAGERTRACE4("APPEND %d page %d needSync=%d\n",
                PAGERID(pPager), pPg->pgno, pPg->needSync);
      }
      if( pPg->needSync ){
        pPager->needSync = 1;
      }
      pPg->inJournal = 1;
    }

    /* If the statement journal is open and the page is not in it,
    ** then write the current page to the statement journal.  Note that
    ** the statement journal format differs from the standard journal format
    ** in that it omits the checksums and the header.
    */
    if( pPager->stmtInUse
     && !pageInStatement(pPg)
     && (int)pPg->pgno<=pPager->stmtSize
    ){
      assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
      if( MEMDB ){
        PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
        assert( pHist->pStmt==0 );
        /* NOTE(review): if this allocation fails no error is returned;
        ** the page is still added to the statement list with pStmt==0.
        ** This looks like deliberate best-effort behavior, but it means
        ** the statement-level copy of the page is simply missing --
        ** confirm that this is intended.
        */
        pHist->pStmt = sqlite3_malloc( pPager->pageSize );
        if( pHist->pStmt ){
          memcpy(pHist->pStmt, PGHDR_TO_DATA(pPg), pPager->pageSize);
        }
        PAGERTRACE3("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
        page_add_to_stmt_list(pPg);
      }else{
        /* Statement journal records are fixed size (4-byte page number
        ** followed by the page content), so the write offset is derived
        ** from the record count stmtNRec.
        */
        i64 offset = pPager->stmtNRec*(4+pPager->pageSize);
        char *pData2 = CODEC2(pPager, pData, pPg->pgno, 7);
        rc = write32bits(pPager->stfd, offset, pPg->pgno);
        if( rc==SQLITE_OK ){
          rc = sqlite3OsWrite(pPager->stfd, pData2, pPager->pageSize, offset+4);
        }
        PAGERTRACE3("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        pPager->stmtNRec++;
        assert( pPager->pInStmt!=0 );
        sqlite3BitvecSet(pPager->pInStmt, pPg->pgno);
      }
    }
  }

  /* Update the database size and return.  If the page just made
  ** writable lies past the current end of the database, grow dbSize to
  ** include it.  For a non-memory database, a resulting size of exactly
  ** PENDING_BYTE/pageSize is bumped by one more page.
  ** NOTE(review): presumably this accounts for the page holding the
  ** lock bytes -- confirm against the definition of PAGER_MJ_PGNO().
  */
  assert( pPager->state>=PAGER_SHARED );
  if( pPager->dbSize<(int)pPg->pgno ){
    pPager->dbSize = pPg->pgno;
    if( !MEMDB && pPager->dbSize==PENDING_BYTE/pPager->pageSize ){
      pPager->dbSize++;
    }
  }
  return rc;
}
4245 
4246 /*
4247 ** This function is used to mark a data-page as writable. It uses
4248 ** pager_write() to open a journal file (if it is not already open)
4249 ** and write the page *pData to the journal.
4250 **
4251 ** The difference between this function and pager_write() is that this
4252 ** function also deals with the special case where 2 or more pages
4253 ** fit on a single disk sector. In this case all co-resident pages
4254 ** must have been written to the journal file before returning.
4255 */
4256 int sqlite3PagerWrite(DbPage *pDbPage){
4257   int rc = SQLITE_OK;
4258 
4259   PgHdr *pPg = pDbPage;
4260   Pager *pPager = pPg->pPager;
4261   Pgno nPagePerSector = (pPager->sectorSize/pPager->pageSize);
4262 
4263   pagerEnter(pPager);
4264   if( !MEMDB && nPagePerSector>1 ){
4265     Pgno nPageCount;          /* Total number of pages in database file */
4266     Pgno pg1;                 /* First page of the sector pPg is located on. */
4267     int nPage;                /* Number of pages starting at pg1 to journal */
4268     int ii;
4269     int needSync = 0;
4270 
4271     /* Set the doNotSync flag to 1. This is because we cannot allow a journal
4272     ** header to be written between the pages journaled by this function.
4273     */
4274     assert( pPager->doNotSync==0 );
4275     pPager->doNotSync = 1;
4276 
4277     /* This trick assumes that both the page-size and sector-size are
4278     ** an integer power of 2. It sets variable pg1 to the identifier
4279     ** of the first page of the sector pPg is located on.
4280     */
4281     pg1 = ((pPg->pgno-1) & ~(nPagePerSector-1)) + 1;
4282 
4283     nPageCount = sqlite3PagerPagecount(pPager);
4284     if( pPg->pgno>nPageCount ){
4285       nPage = (pPg->pgno - pg1)+1;
4286     }else if( (pg1+nPagePerSector-1)>nPageCount ){
4287       nPage = nPageCount+1-pg1;
4288     }else{
4289       nPage = nPagePerSector;
4290     }
4291     assert(nPage>0);
4292     assert(pg1<=pPg->pgno);
4293     assert((pg1+nPage)>pPg->pgno);
4294 
4295     for(ii=0; ii<nPage && rc==SQLITE_OK; ii++){
4296       Pgno pg = pg1+ii;
4297       PgHdr *pPage;
4298       if( pg==pPg->pgno || !sqlite3BitvecTest(pPager->pInJournal, pg) ){
4299         if( pg!=PAGER_MJ_PGNO(pPager) ){
4300           rc = sqlite3PagerGet(pPager, pg, &pPage);
4301           if( rc==SQLITE_OK ){
4302             rc = pager_write(pPage);
4303             if( pPage->needSync ){
4304               needSync = 1;
4305             }
4306             sqlite3PagerUnref(pPage);
4307           }
4308         }
4309       }else if( (pPage = pager_lookup(pPager, pg))!=0 ){
4310         if( pPage->needSync ){
4311           needSync = 1;
4312         }
4313       }
4314     }
4315 
4316     /* If the PgHdr.needSync flag is set for any of the nPage pages
4317     ** starting at pg1, then it needs to be set for all of them. Because
4318     ** writing to any of these nPage pages may damage the others, the
4319     ** journal file must contain sync()ed copies of all of them
4320     ** before any of them can be written out to the database file.
4321     */
4322     if( needSync ){
4323       for(ii=0; ii<nPage && needSync; ii++){
4324         PgHdr *pPage = pager_lookup(pPager, pg1+ii);
4325         if( pPage ) pPage->needSync = 1;
4326       }
4327       assert(pPager->needSync);
4328     }
4329 
4330     assert( pPager->doNotSync==1 );
4331     pPager->doNotSync = 0;
4332   }else{
4333     rc = pager_write(pDbPage);
4334   }
4335   pagerLeave(pPager);
4336   return rc;
4337 }
4338 
4339 /*
4340 ** Return TRUE if the page given in the argument was previously passed
4341 ** to sqlite3PagerWrite().  In other words, return TRUE if it is ok
4342 ** to change the content of the page.
4343 */
4344 #ifndef NDEBUG
4345 int sqlite3PagerIswriteable(DbPage *pPg){
4346   return pPg->dirty;
4347 }
4348 #endif
4349 
4350 #ifndef SQLITE_OMIT_VACUUM
4351 /*
4352 ** Replace the content of a single page with the information in the third
4353 ** argument.
4354 */
4355 int sqlite3PagerOverwrite(Pager *pPager, Pgno pgno, void *pData){
4356   PgHdr *pPg;
4357   int rc;
4358 
4359   pagerEnter(pPager);
4360   rc = sqlite3PagerGet(pPager, pgno, &pPg);
4361   if( rc==SQLITE_OK ){
4362     rc = sqlite3PagerWrite(pPg);
4363     if( rc==SQLITE_OK ){
4364       memcpy(sqlite3PagerGetData(pPg), pData, pPager->pageSize);
4365     }
4366     sqlite3PagerUnref(pPg);
4367   }
4368   pagerLeave(pPager);
4369   return rc;
4370 }
4371 #endif
4372 
4373 /*
4374 ** A call to this routine tells the pager that it is not necessary to
4375 ** write the information on page pPg back to the disk, even though
4376 ** that page might be marked as dirty.
4377 **
4378 ** The overlying software layer calls this routine when all of the data
4379 ** on the given page is unused.  The pager marks the page as clean so
4380 ** that it does not get written to disk.
4381 **
4382 ** Tests show that this optimization, together with the
4383 ** sqlite3PagerDontRollback() below, more than double the speed
4384 ** of large INSERT operations and quadruple the speed of large DELETEs.
4385 **
4386 ** When this routine is called, set the alwaysRollback flag to true.
4387 ** Subsequent calls to sqlite3PagerDontRollback() for the same page
4388 ** will thereafter be ignored.  This is necessary to avoid a problem
4389 ** where a page with data is added to the freelist during one part of
4390 ** a transaction then removed from the freelist during a later part
4391 ** of the same transaction and reused for some other purpose.  When it
4392 ** is first added to the freelist, this routine is called.  When reused,
4393 ** the sqlite3PagerDontRollback() routine is called.  But because the
4394 ** page contains critical data, we still need to be sure it gets
4395 ** rolled back in spite of the sqlite3PagerDontRollback() call.
4396 */
4397 void sqlite3PagerDontWrite(DbPage *pDbPage){
4398   PgHdr *pPg = pDbPage;
4399   Pager *pPager = pPg->pPager;
4400 
4401   if( MEMDB ) return;
4402   pagerEnter(pPager);
4403   pPg->alwaysRollback = 1;
4404   if( pPg->dirty && !pPager->stmtInUse ){
4405     assert( pPager->state>=PAGER_SHARED );
4406     if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){
4407       /* If this pages is the last page in the file and the file has grown
4408       ** during the current transaction, then do NOT mark the page as clean.
4409       ** When the database file grows, we must make sure that the last page
4410       ** gets written at least once so that the disk file will be the correct
4411       ** size. If you do not write this page and the size of the file
4412       ** on the disk ends up being too small, that can lead to database
4413       ** corruption during the next transaction.
4414       */
4415     }else{
4416       PAGERTRACE3("DONT_WRITE page %d of %d\n", pPg->pgno, PAGERID(pPager));
4417       IOTRACE(("CLEAN %p %d\n", pPager, pPg->pgno))
4418       makeClean(pPg);
4419 #ifdef SQLITE_CHECK_PAGES
4420       pPg->pageHash = pager_pagehash(pPg);
4421 #endif
4422     }
4423   }
4424   pagerLeave(pPager);
4425 }
4426 
4427 /*
4428 ** A call to this routine tells the pager that if a rollback occurs,
4429 ** it is not necessary to restore the data on the given page.  This
4430 ** means that the pager does not have to record the given page in the
4431 ** rollback journal.
4432 **
4433 ** If we have not yet actually read the content of this page (if
4434 ** the PgHdr.needRead flag is set) then this routine acts as a promise
4435 ** that we will never need to read the page content in the future.
4436 ** so the needRead flag can be cleared at this point.
4437 **
4438 ** This routine is only called from a single place in the sqlite btree
4439 ** code (when a leaf is removed from the free-list). This allows the
4440 ** following assumptions to be made about pPg:
4441 **
4442 **   1. PagerDontWrite() has been called on the page, OR
4443 **      PagerWrite() has not yet been called on the page.
4444 **
4445 **   2. The page existed when the transaction was started.
4446 **
4447 ** Details: DontRollback() (this routine) is only called when a leaf is
4448 ** removed from the free list. DontWrite() is called whenever a page
4449 ** becomes a free-list leaf.
4450 */
4451 void sqlite3PagerDontRollback(DbPage *pPg){
4452   Pager *pPager = pPg->pPager;
4453 
4454   pagerEnter(pPager);
4455   assert( pPager->state>=PAGER_RESERVED );
4456 
4457   /* If the journal file is not open, or DontWrite() has been called on
4458   ** this page (DontWrite() sets the alwaysRollback flag), then this
4459   ** function is a no-op.
4460   */
4461   if( pPager->journalOpen==0 || pPg->alwaysRollback || pPager->alwaysRollback ){
4462     pagerLeave(pPager);
4463     return;
4464   }
4465   assert( !MEMDB );    /* For a memdb, pPager->journalOpen is always 0 */
4466 
4467   /* Check that PagerWrite() has not yet been called on this page, and
4468   ** that the page existed when the transaction started.
4469   */
4470   assert( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize );
4471 
4472   assert( pPager->pInJournal!=0 );
4473   sqlite3BitvecSet(pPager->pInJournal, pPg->pgno);
4474   pPg->inJournal = 1;
4475   pPg->needRead = 0;
4476   if( pPager->stmtInUse ){
4477     assert( pPager->stmtSize <= pPager->origDbSize );
4478     sqlite3BitvecSet(pPager->pInStmt, pPg->pgno);
4479   }
4480   PAGERTRACE3("DONT_ROLLBACK page %d of %d\n", pPg->pgno, PAGERID(pPager));
4481   IOTRACE(("GARBAGE %p %d\n", pPager, pPg->pgno))
4482   pagerLeave(pPager);
4483 }
4484 
4485 
4486 /*
4487 ** This routine is called to increment the database file change-counter,
4488 ** stored at byte 24 of the pager file.
4489 */
4490 static int pager_incr_changecounter(Pager *pPager, int isDirect){
4491   PgHdr *pPgHdr;
4492   u32 change_counter;
4493   int rc = SQLITE_OK;
4494 
4495   if( !pPager->changeCountDone ){
4496     /* Open page 1 of the file for writing. */
4497     rc = sqlite3PagerGet(pPager, 1, &pPgHdr);
4498     if( rc!=SQLITE_OK ) return rc;
4499 
4500     if( !isDirect ){
4501       rc = sqlite3PagerWrite(pPgHdr);
4502       if( rc!=SQLITE_OK ){
4503         sqlite3PagerUnref(pPgHdr);
4504         return rc;
4505       }
4506     }
4507 
4508     /* Increment the value just read and write it back to byte 24. */
4509     change_counter = sqlite3Get4byte((u8*)pPager->dbFileVers);
4510     change_counter++;
4511     put32bits(((char*)PGHDR_TO_DATA(pPgHdr))+24, change_counter);
4512 
4513     if( isDirect && pPager->fd->pMethods ){
4514       const void *zBuf = PGHDR_TO_DATA(pPgHdr);
4515       rc = sqlite3OsWrite(pPager->fd, zBuf, pPager->pageSize, 0);
4516     }
4517 
4518     /* Release the page reference. */
4519     sqlite3PagerUnref(pPgHdr);
4520     pPager->changeCountDone = 1;
4521   }
4522   return rc;
4523 }
4524 
4525 /*
4526 ** Sync the pager file to disk.
4527 */
4528 int sqlite3PagerSync(Pager *pPager){
4529   int rc;
4530   pagerEnter(pPager);
4531   rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
4532   pagerLeave(pPager);
4533   return rc;
4534 }
4535 
/*
** Sync the database file for the pager pPager. zMaster points to the name
** of a master journal file that should be written into the individual
** journal file. zMaster may be NULL, which is interpreted as no master
** journal (a single database transaction).
**
** This routine ensures that the journal is synced, all dirty pages written
** to the database file and the database file synced. The only thing that
** remains to commit the transaction is to delete the journal file (or
** master journal file if specified).
**
** Note that if zMaster==NULL, this does not overwrite a previous value
** passed to an sqlite3PagerCommitPhaseOne() call.
**
** If parameter nTrunc is non-zero, then the pager file is truncated to
** nTrunc pages (this is used by auto-vacuum databases).
**
** If the final parameter - noSync - is true, then the database file itself
** is not synced. The caller must call sqlite3PagerSync() directly to
** sync the database file before calling CommitPhaseTwo() to delete the
** journal file in this case.
**
** Returns SQLITE_OK on success or an error code.  An SQLITE_IOERR_BLOCKED
** result from any step is reported to the caller as SQLITE_BUSY.
*/
int sqlite3PagerCommitPhaseOne(
  Pager *pPager,
  const char *zMaster,
  Pgno nTrunc,
  int noSync
){
  int rc = SQLITE_OK;

  PAGERTRACE4("DATABASE SYNC: File=%s zMaster=%s nTrunc=%d\n",
      pPager->zFilename, zMaster, nTrunc);
  pagerEnter(pPager);

  /* If no pages have been written to, or this function has already
  ** been called, it is a no-op.  For an in-memory db the only work done
  ** is the truncation requested by nTrunc (see the else-if branch at
  ** the bottom of this block).
  */
  if( pPager->state!=PAGER_SYNCED && !MEMDB && pPager->dirtyCache ){
    PgHdr *pPg;

#ifdef SQLITE_ENABLE_ATOMIC_WRITE
    /* The atomic-write optimization can be used if all of the
    ** following are true:
    **
    **    + The file-system supports the atomic-write property for
    **      blocks of size page-size, and
    **    + This commit is not part of a multi-file transaction, and
    **    + Exactly one page has been modified and stored in the journal
    **      file, and
    **    + The database is not being truncated (nTrunc==0).
    **
    ** If the optimization can be used, then the journal file will never
    ** be created for this transaction.
    */
    int useAtomicWrite = (
        !zMaster &&
        pPager->journalOff==jrnlBufferSize(pPager) &&
        nTrunc==0 &&
        (0==pPager->pDirty || 0==pPager->pDirty->pDirty)
    );
    if( useAtomicWrite ){
      /* Update the nRec field in the journal file. */
      int offset = pPager->journalHdr + sizeof(aJournalMagic);
      assert(pPager->nRec==1);
      rc = write32bits(pPager->jfd, offset, pPager->nRec);

      /* Update the db file change counter. The following call will modify
      ** the in-memory representation of page 1 to include the updated
      ** change counter and then write page 1 directly to the database
      ** file. Because of the atomic-write property of the host file-system,
      ** this is safe.
      */
      if( rc==SQLITE_OK ){
        rc = pager_incr_changecounter(pPager, 1);
      }
    }else{
      rc = sqlite3JournalCreate(pPager->jfd);
    }

    /* CAUTION: this "if" has no body of its own.  It deliberately
    ** controls the "if( !pPager->setMaster )" statement that follows
    ** the #endif, so that the journal-sync block is skipped when the
    ** atomic-write path was taken or when an error has already occurred.
    */
    if( !useAtomicWrite && rc==SQLITE_OK )
#endif

    /* If a master journal file name has already been written to the
    ** journal file, then no sync is required. This happens when it is
    ** written, then the process fails to upgrade from a RESERVED to an
    ** EXCLUSIVE lock. The next time the process tries to commit the
    ** transaction the m-j name will have already been written.
    */
    if( !pPager->setMaster ){
      assert( pPager->journalOpen );
      rc = pager_incr_changecounter(pPager, 0);
      if( rc!=SQLITE_OK ) goto sync_exit;
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( nTrunc!=0 ){
        /* If this transaction has made the database smaller, then all pages
        ** being discarded by the truncation must be written to the journal
        ** file.  Pages already in the journal, and the page identified by
        ** PAGER_MJ_PGNO(), are skipped.
        */
        Pgno i;
        int iSkip = PAGER_MJ_PGNO(pPager);
        for( i=nTrunc+1; i<=pPager->origDbSize; i++ ){
          if( !sqlite3BitvecTest(pPager->pInJournal, i) && i!=iSkip ){
            rc = sqlite3PagerGet(pPager, i, &pPg);
            if( rc!=SQLITE_OK ) goto sync_exit;
            rc = sqlite3PagerWrite(pPg);
            sqlite3PagerUnref(pPg);
            if( rc!=SQLITE_OK ) goto sync_exit;
          }
        }
      }
#endif
      rc = writeMasterJournal(pPager, zMaster);
      if( rc!=SQLITE_OK ) goto sync_exit;
      rc = syncJournal(pPager);
    }
    /* Catches a failed syncJournal() above as well as, in builds with
    ** SQLITE_ENABLE_ATOMIC_WRITE, an error from the atomic-write setup.
    */
    if( rc!=SQLITE_OK ) goto sync_exit;

#ifndef SQLITE_OMIT_AUTOVACUUM
    if( nTrunc!=0 ){
      rc = sqlite3PagerTruncate(pPager, nTrunc);
      if( rc!=SQLITE_OK ) goto sync_exit;
    }
#endif

    /* Write all dirty pages to the database file */
    pPg = pager_get_all_dirty_pages(pPager);
    rc = pager_write_pagelist(pPg);
    if( rc!=SQLITE_OK ){
      assert( rc!=SQLITE_IOERR_BLOCKED );
      /* The error might have left the dirty list all fouled up here,
      ** but that does not matter because if the dirty list did
      ** get corrupted, then the transaction will roll back and
      ** discard the dirty list.  There is an assert in
      ** pager_get_all_dirty_pages() that verifies that no attempt
      ** is made to use an invalid dirty list.
      */
      goto sync_exit;
    }
    pPager->pDirty = 0;

    /* Sync the database file, unless syncing is disabled for the pager
    ** or the caller asked (via noSync) to do it separately.  Note that
    ** the state moves to PAGER_SYNCED below even if this sync reports
    ** an error; the error is still returned to the caller.
    */
    if( !pPager->noSync && !noSync ){
      rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
    }
    IOTRACE(("DBSYNC %p\n", pPager))

    pPager->state = PAGER_SYNCED;
  }else if( MEMDB && nTrunc!=0 ){
    rc = sqlite3PagerTruncate(pPager, nTrunc);
  }

sync_exit:
  if( rc==SQLITE_IOERR_BLOCKED ){
    /* pager_incr_changecounter() may attempt to obtain an exclusive
    ** lock to spill the cache and return IOERR_BLOCKED. But since
    ** there is no chance the cache is inconsistent, it is
    ** better to return SQLITE_BUSY.
    */
    rc = SQLITE_BUSY;
  }
  pagerLeave(pPager);
  return rc;
}
4697 
4698 
4699 /*
4700 ** Commit all changes to the database and release the write lock.
4701 **
4702 ** If the commit fails for any reason, a rollback attempt is made
4703 ** and an error code is returned.  If the commit worked, SQLITE_OK
4704 ** is returned.
4705 */
4706 int sqlite3PagerCommitPhaseTwo(Pager *pPager){
4707   int rc;
4708   PgHdr *pPg;
4709 
4710   if( pPager->errCode ){
4711     return pPager->errCode;
4712   }
4713   if( pPager->state<PAGER_RESERVED ){
4714     return SQLITE_ERROR;
4715   }
4716   pagerEnter(pPager);
4717   PAGERTRACE2("COMMIT %d\n", PAGERID(pPager));
4718   if( MEMDB ){
4719     pPg = pager_get_all_dirty_pages(pPager);
4720     while( pPg ){
4721       PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
4722       clearHistory(pHist);
4723       pPg->dirty = 0;
4724       pPg->inJournal = 0;
4725       pHist->inStmt = 0;
4726       pPg->needSync = 0;
4727       pHist->pPrevStmt = pHist->pNextStmt = 0;
4728       pPg = pPg->pDirty;
4729     }
4730     pPager->pDirty = 0;
4731 #ifndef NDEBUG
4732     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
4733       PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
4734       assert( !pPg->alwaysRollback );
4735       assert( !pHist->pOrig );
4736       assert( !pHist->pStmt );
4737     }
4738 #endif
4739     pPager->pStmt = 0;
4740     pPager->state = PAGER_SHARED;
4741     pagerLeave(pPager);
4742     return SQLITE_OK;
4743   }
4744   assert( pPager->journalOpen || !pPager->dirtyCache );
4745   assert( pPager->state==PAGER_SYNCED || !pPager->dirtyCache );
4746   rc = pager_end_transaction(pPager);
4747   rc = pager_error(pPager, rc);
4748   pagerLeave(pPager);
4749   return rc;
4750 }
4751 
4752 /*
4753 ** Rollback all changes.  The database falls back to PAGER_SHARED mode.
4754 ** All in-memory cache pages revert to their original data contents.
4755 ** The journal is deleted.
4756 **
4757 ** This routine cannot fail unless some other process is not following
4758 ** the correct locking protocol or unless some other
4759 ** process is writing trash into the journal file (SQLITE_CORRUPT) or
4760 ** unless a prior malloc() failed (SQLITE_NOMEM).  Appropriate error
4761 ** codes are returned for all these occasions.  Otherwise,
4762 ** SQLITE_OK is returned.
4763 */
4764 int sqlite3PagerRollback(Pager *pPager){
4765   int rc;
4766   PAGERTRACE2("ROLLBACK %d\n", PAGERID(pPager));
4767   if( MEMDB ){
4768     PgHdr *p;
4769     for(p=pPager->pAll; p; p=p->pNextAll){
4770       PgHistory *pHist;
4771       assert( !p->alwaysRollback );
4772       if( !p->dirty ){
4773         assert( !((PgHistory *)PGHDR_TO_HIST(p, pPager))->pOrig );
4774         assert( !((PgHistory *)PGHDR_TO_HIST(p, pPager))->pStmt );
4775         continue;
4776       }
4777 
4778       pHist = PGHDR_TO_HIST(p, pPager);
4779       if( pHist->pOrig ){
4780         memcpy(PGHDR_TO_DATA(p), pHist->pOrig, pPager->pageSize);
4781         PAGERTRACE3("ROLLBACK-PAGE %d of %d\n", p->pgno, PAGERID(pPager));
4782       }else{
4783         PAGERTRACE3("PAGE %d is clean on %d\n", p->pgno, PAGERID(pPager));
4784       }
4785       clearHistory(pHist);
4786       p->dirty = 0;
4787       p->inJournal = 0;
4788       pHist->inStmt = 0;
4789       pHist->pPrevStmt = pHist->pNextStmt = 0;
4790       if( pPager->xReiniter ){
4791         pPager->xReiniter(p, pPager->pageSize);
4792       }
4793     }
4794     pPager->pDirty = 0;
4795     pPager->pStmt = 0;
4796     pPager->dbSize = pPager->origDbSize;
4797     pager_truncate_cache(pPager);
4798     pPager->stmtInUse = 0;
4799     pPager->state = PAGER_SHARED;
4800     return SQLITE_OK;
4801   }
4802 
4803   pagerEnter(pPager);
4804   if( !pPager->dirtyCache || !pPager->journalOpen ){
4805     rc = pager_end_transaction(pPager);
4806     pagerLeave(pPager);
4807     return rc;
4808   }
4809 
4810   if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
4811     if( pPager->state>=PAGER_EXCLUSIVE ){
4812       pager_playback(pPager, 0);
4813     }
4814     pagerLeave(pPager);
4815     return pPager->errCode;
4816   }
4817   if( pPager->state==PAGER_RESERVED ){
4818     int rc2;
4819     rc = pager_playback(pPager, 0);
4820     rc2 = pager_end_transaction(pPager);
4821     if( rc==SQLITE_OK ){
4822       rc = rc2;
4823     }
4824   }else{
4825     rc = pager_playback(pPager, 0);
4826   }
4827   /* pager_reset(pPager); */
4828   pPager->dbSize = -1;
4829 
4830   /* If an error occurs during a ROLLBACK, we can no longer trust the pager
4831   ** cache. So call pager_error() on the way out to make any error
4832   ** persistent.
4833   */
4834   rc = pager_error(pPager, rc);
4835   pagerLeave(pPager);
4836   return rc;
4837 }
4838 
4839 /*
4840 ** Return TRUE if the database file is opened read-only.  Return FALSE
4841 ** if the database is (in theory) writable.
4842 */
4843 int sqlite3PagerIsreadonly(Pager *pPager){
4844   return pPager->readOnly;
4845 }
4846 
4847 /*
4848 ** Return the number of references to the pager.
4849 */
4850 int sqlite3PagerRefcount(Pager *pPager){
4851   return pPager->nRef;
4852 }
4853 
#ifdef SQLITE_TEST
/*
** For testing and analysis only.  Fills an 11-element array with a
** snapshot of pager counters and returns a pointer to it.  The array is
** static, so each call overwrites the values returned by the previous
** one.
*/
int *sqlite3PagerStats(Pager *pPager){
  static int aStat[11];

  /* Cache and state information */
  aStat[0] = pPager->nRef;
  aStat[1] = pPager->nPage;
  aStat[2] = pPager->mxPage;
  aStat[3] = pPager->dbSize;
  aStat[4] = pPager->state;
  aStat[5] = pPager->errCode;

  /* Hit/miss and I/O counters */
  aStat[6] = pPager->nHit;
  aStat[7] = pPager->nMiss;
  aStat[8] = 0;  /* Used to be pPager->nOvfl */
  aStat[9] = pPager->nRead;
  aStat[10] = pPager->nWrite;

  return aStat;
}
#endif
4874 
/*
** Set the statement rollback point.
**
** This routine should be called with the transaction journal already
** open.  A new statement journal is created that can be used to rollback
** changes of a single SQL command within a larger transaction.
**
** If the transaction journal is not open yet, no statement journal is
** created; the stmtAutoopen flag is set instead and SQLITE_OK is
** returned.  For an in-memory database only stmtInUse and stmtSize are
** set.
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if the statement bitvec
** cannot be allocated, or the error code from opening the statement
** journal file.
*/
static int pagerStmtBegin(Pager *pPager){
  int rc;
  assert( !pPager->stmtInUse );
  assert( pPager->state>=PAGER_SHARED );
  assert( pPager->dbSize>=0 );
  PAGERTRACE2("STMT-BEGIN %d\n", PAGERID(pPager));
  if( MEMDB ){
    /* In-memory database: just note that a statement is active and
    ** remember the database size at its start.
    */
    pPager->stmtInUse = 1;
    pPager->stmtSize = pPager->dbSize;
    return SQLITE_OK;
  }
  if( !pPager->journalOpen ){
    /* No transaction journal yet: record the request in stmtAutoopen. */
    pPager->stmtAutoopen = 1;
    return SQLITE_OK;
  }
  assert( pPager->journalOpen );

  /* The bitvec that tracks pages in the statement journal is allocated
  ** between pagerLeave() and pagerEnter().
  ** NOTE(review): presumably so that the allocation does not happen
  ** while the pager is marked as in use -- confirm against the
  ** definition of pagerEnter()/pagerLeave().
  */
  pagerLeave(pPager);
  assert( pPager->pInStmt==0 );
  pPager->pInStmt = sqlite3BitvecCreate(pPager->dbSize);
  pagerEnter(pPager);
  if( pPager->pInStmt==0 ){
    /* sqlite3OsLock(pPager->fd, SHARED_LOCK); */
    return SQLITE_NOMEM;
  }
#ifndef NDEBUG
  /* Debug builds only: verify that the journal file is exactly
  ** journalOff bytes long.  Note that a failing file-size query aborts
  ** the statement begin in debug builds only.
  */
  rc = sqlite3OsFileSize(pPager->jfd, &pPager->stmtJSize);
  if( rc ) goto stmt_begin_failed;
  assert( pPager->stmtJSize == pPager->journalOff );
#endif

  /* Snapshot the journal offset, database size and checksum seed as
  ** they are at the start of the statement.
  */
  pPager->stmtJSize = pPager->journalOff;
  pPager->stmtSize = pPager->dbSize;
  pPager->stmtHdrOff = 0;
  pPager->stmtCksum = pPager->cksumInit;
  if( !pPager->stmtOpen ){
    /* Open the statement journal file on first use.  Once open it is
    ** not reopened by later calls (stmtOpen stays set).
    */
    rc = sqlite3PagerOpentemp(pPager->pVfs, pPager->stfd, pPager->zStmtJrnl,
                              SQLITE_OPEN_SUBJOURNAL);
    if( rc ){
      goto stmt_begin_failed;
    }
    pPager->stmtOpen = 1;
    pPager->stmtNRec = 0;
  }
  pPager->stmtInUse = 1;
  return SQLITE_OK;

  /* Error exit: release the bitvec allocated above and return rc. */
stmt_begin_failed:
  if( pPager->pInStmt ){
    sqlite3BitvecDestroy(pPager->pInStmt);
    pPager->pInStmt = 0;
  }
  return rc;
}
4934 int sqlite3PagerStmtBegin(Pager *pPager){
4935   int rc;
4936   pagerEnter(pPager);
4937   rc = pagerStmtBegin(pPager);
4938   pagerLeave(pPager);
4939   return rc;
4940 }
4941 
4942 /*
4943 ** Commit a statement.
4944 */
4945 int sqlite3PagerStmtCommit(Pager *pPager){
4946   pagerEnter(pPager);
4947   if( pPager->stmtInUse ){
4948     PgHdr *pPg, *pNext;
4949     PAGERTRACE2("STMT-COMMIT %d\n", PAGERID(pPager));
4950     if( !MEMDB ){
4951       /* sqlite3OsTruncate(pPager->stfd, 0); */
4952       sqlite3BitvecDestroy(pPager->pInStmt);
4953       pPager->pInStmt = 0;
4954     }else{
4955       for(pPg=pPager->pStmt; pPg; pPg=pNext){
4956         PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
4957         pNext = pHist->pNextStmt;
4958         assert( pHist->inStmt );
4959         pHist->inStmt = 0;
4960         pHist->pPrevStmt = pHist->pNextStmt = 0;
4961         sqlite3_free(pHist->pStmt);
4962         pHist->pStmt = 0;
4963       }
4964     }
4965     pPager->stmtNRec = 0;
4966     pPager->stmtInUse = 0;
4967     pPager->pStmt = 0;
4968   }
4969   pPager->stmtAutoopen = 0;
4970   pagerLeave(pPager);
4971   return SQLITE_OK;
4972 }
4973 
4974 /*
4975 ** Rollback a statement.
4976 */
4977 int sqlite3PagerStmtRollback(Pager *pPager){
4978   int rc;
4979   pagerEnter(pPager);
4980   if( pPager->stmtInUse ){
4981     PAGERTRACE2("STMT-ROLLBACK %d\n", PAGERID(pPager));
4982     if( MEMDB ){
4983       PgHdr *pPg;
4984       PgHistory *pHist;
4985       for(pPg=pPager->pStmt; pPg; pPg=pHist->pNextStmt){
4986         pHist = PGHDR_TO_HIST(pPg, pPager);
4987         if( pHist->pStmt ){
4988           memcpy(PGHDR_TO_DATA(pPg), pHist->pStmt, pPager->pageSize);
4989           sqlite3_free(pHist->pStmt);
4990           pHist->pStmt = 0;
4991         }
4992       }
4993       pPager->dbSize = pPager->stmtSize;
4994       pager_truncate_cache(pPager);
4995       rc = SQLITE_OK;
4996     }else{
4997       rc = pager_stmt_playback(pPager);
4998     }
4999     sqlite3PagerStmtCommit(pPager);
5000   }else{
5001     rc = SQLITE_OK;
5002   }
5003   pPager->stmtAutoopen = 0;
5004   pagerLeave(pPager);
5005   return rc;
5006 }
5007 
5008 /*
5009 ** Return the full pathname of the database file.
5010 */
5011 const char *sqlite3PagerFilename(Pager *pPager){
5012   return pPager->zFilename;
5013 }
5014 
5015 /*
5016 ** Return the VFS structure for the pager.
5017 */
5018 const sqlite3_vfs *sqlite3PagerVfs(Pager *pPager){
5019   return pPager->pVfs;
5020 }
5021 
5022 /*
5023 ** Return the file handle for the database file associated
5024 ** with the pager.  This might return NULL if the file has
5025 ** not yet been opened.
5026 */
5027 sqlite3_file *sqlite3PagerFile(Pager *pPager){
5028   return pPager->fd;
5029 }
5030 
5031 /*
5032 ** Return the directory of the database file.
5033 */
5034 const char *sqlite3PagerDirname(Pager *pPager){
5035   return pPager->zDirectory;
5036 }
5037 
5038 /*
5039 ** Return the full pathname of the journal file.
5040 */
5041 const char *sqlite3PagerJournalname(Pager *pPager){
5042   return pPager->zJournal;
5043 }
5044 
5045 /*
5046 ** Return true if fsync() calls are disabled for this pager.  Return FALSE
5047 ** if fsync()s are executed normally.
5048 */
5049 int sqlite3PagerNosync(Pager *pPager){
5050   return pPager->noSync;
5051 }
5052 
5053 #ifdef SQLITE_HAS_CODEC
5054 /*
5055 ** Set the codec for this pager
5056 */
5057 void sqlite3PagerSetCodec(
5058   Pager *pPager,
5059   void *(*xCodec)(void*,void*,Pgno,int),
5060   void *pCodecArg
5061 ){
5062   pPager->xCodec = xCodec;
5063   pPager->pCodecArg = pCodecArg;
5064 }
5065 #endif
5066 
5067 #ifndef SQLITE_OMIT_AUTOVACUUM
5068 /*
5069 ** Move the page pPg to location pgno in the file.
5070 **
5071 ** There must be no references to the page previously located at
5072 ** pgno (which we call pPgOld) though that page is allowed to be
5073 ** in cache.  If the page previous located at pgno is not already
5074 ** in the rollback journal, it is not put there by by this routine.
5075 **
5076 ** References to the page pPg remain valid. Updating any
5077 ** meta-data associated with pPg (i.e. data stored in the nExtra bytes
5078 ** allocated along with the page) is the responsibility of the caller.
5079 **
5080 ** A transaction must be active when this routine is called. It used to be
5081 ** required that a statement transaction was not active, but this restriction
5082 ** has been removed (CREATE INDEX needs to move a page when a statement
5083 ** transaction is active).
5084 */
5085 int sqlite3PagerMovepage(Pager *pPager, DbPage *pPg, Pgno pgno){
5086   PgHdr *pPgOld;  /* The page being overwritten. */
5087   int h;
5088   Pgno needSyncPgno = 0;
5089 
5090   pagerEnter(pPager);
5091   assert( pPg->nRef>0 );
5092 
5093   PAGERTRACE5("MOVE %d page %d (needSync=%d) moves to %d\n",
5094       PAGERID(pPager), pPg->pgno, pPg->needSync, pgno);
5095   IOTRACE(("MOVE %p %d %d\n", pPager, pPg->pgno, pgno))
5096 
5097   pager_get_content(pPg);
5098   if( pPg->needSync ){
5099     needSyncPgno = pPg->pgno;
5100     assert( pPg->inJournal || (int)pgno>pPager->origDbSize );
5101     assert( pPg->dirty );
5102     assert( pPager->needSync );
5103   }
5104 
5105   /* Unlink pPg from its hash-chain */
5106   unlinkHashChain(pPager, pPg);
5107 
5108   /* If the cache contains a page with page-number pgno, remove it
5109   ** from its hash chain. Also, if the PgHdr.needSync was set for
5110   ** page pgno before the 'move' operation, it needs to be retained
5111   ** for the page moved there.
5112   */
5113   pPg->needSync = 0;
5114   pPgOld = pager_lookup(pPager, pgno);
5115   if( pPgOld ){
5116     assert( pPgOld->nRef==0 );
5117     unlinkHashChain(pPager, pPgOld);
5118     makeClean(pPgOld);
5119     pPg->needSync = pPgOld->needSync;
5120   }else{
5121     pPg->needSync = 0;
5122   }
5123   pPg->inJournal = sqlite3BitvecTest(pPager->pInJournal, pgno);
5124 
5125   /* Change the page number for pPg and insert it into the new hash-chain. */
5126   assert( pgno!=0 );
5127   pPg->pgno = pgno;
5128   h = pgno & (pPager->nHash-1);
5129   if( pPager->aHash[h] ){
5130     assert( pPager->aHash[h]->pPrevHash==0 );
5131     pPager->aHash[h]->pPrevHash = pPg;
5132   }
5133   pPg->pNextHash = pPager->aHash[h];
5134   pPager->aHash[h] = pPg;
5135   pPg->pPrevHash = 0;
5136 
5137   makeDirty(pPg);
5138   pPager->dirtyCache = 1;
5139 
5140   if( needSyncPgno ){
5141     /* If needSyncPgno is non-zero, then the journal file needs to be
5142     ** sync()ed before any data is written to database file page needSyncPgno.
5143     ** Currently, no such page exists in the page-cache and the
5144     ** Pager.pInJournal bit has been set. This needs to be remedied by loading
5145     ** the page into the pager-cache and setting the PgHdr.needSync flag.
5146     **
5147     ** If the attempt to load the page into the page-cache fails, (due
5148     ** to a malloc() or IO failure), clear the bit in the pInJournal[]
5149     ** array. Otherwise, if the page is loaded and written again in
5150     ** this transaction, it may be written to the database file before
5151     ** it is synced into the journal file. This way, it may end up in
5152     ** the journal file twice, but that is not a problem.
5153     **
5154     ** The sqlite3PagerGet() call may cause the journal to sync. So make
5155     ** sure the Pager.needSync flag is set too.
5156     */
5157     int rc;
5158     PgHdr *pPgHdr;
5159     assert( pPager->needSync );
5160     rc = sqlite3PagerGet(pPager, needSyncPgno, &pPgHdr);
5161     if( rc!=SQLITE_OK ){
5162       if( pPager->pInJournal && (int)needSyncPgno<=pPager->origDbSize ){
5163         sqlite3BitvecClear(pPager->pInJournal, needSyncPgno);
5164       }
5165       pagerLeave(pPager);
5166       return rc;
5167     }
5168     pPager->needSync = 1;
5169     pPgHdr->needSync = 1;
5170     pPgHdr->inJournal = 1;
5171     makeDirty(pPgHdr);
5172     sqlite3PagerUnref(pPgHdr);
5173   }
5174 
5175   pagerLeave(pPager);
5176   return SQLITE_OK;
5177 }
5178 #endif
5179 
5180 /*
5181 ** Return a pointer to the data for the specified page.
5182 */
5183 void *sqlite3PagerGetData(DbPage *pPg){
5184   return PGHDR_TO_DATA(pPg);
5185 }
5186 
5187 /*
5188 ** Return a pointer to the Pager.nExtra bytes of "extra" space
5189 ** allocated along with the specified page.
5190 */
5191 void *sqlite3PagerGetExtra(DbPage *pPg){
5192   Pager *pPager = pPg->pPager;
5193   return (pPager?PGHDR_TO_EXTRA(pPg, pPager):0);
5194 }
5195 
5196 /*
5197 ** Get/set the locking-mode for this pager. Parameter eMode must be one
5198 ** of PAGER_LOCKINGMODE_QUERY, PAGER_LOCKINGMODE_NORMAL or
5199 ** PAGER_LOCKINGMODE_EXCLUSIVE. If the parameter is not _QUERY, then
5200 ** the locking-mode is set to the value specified.
5201 **
5202 ** The returned value is either PAGER_LOCKINGMODE_NORMAL or
5203 ** PAGER_LOCKINGMODE_EXCLUSIVE, indicating the current (possibly updated)
5204 ** locking-mode.
5205 */
5206 int sqlite3PagerLockingMode(Pager *pPager, int eMode){
5207   assert( eMode==PAGER_LOCKINGMODE_QUERY
5208             || eMode==PAGER_LOCKINGMODE_NORMAL
5209             || eMode==PAGER_LOCKINGMODE_EXCLUSIVE );
5210   assert( PAGER_LOCKINGMODE_QUERY<0 );
5211   assert( PAGER_LOCKINGMODE_NORMAL>=0 && PAGER_LOCKINGMODE_EXCLUSIVE>=0 );
5212   if( eMode>=0 && !pPager->tempFile ){
5213     pPager->exclusiveMode = eMode;
5214   }
5215   return (int)pPager->exclusiveMode;
5216 }
5217 
5218 #ifdef SQLITE_TEST
5219 /*
5220 ** Print a listing of all referenced pages and their ref count.
5221 */
5222 void sqlite3PagerRefdump(Pager *pPager){
5223   PgHdr *pPg;
5224   for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
5225     if( pPg->nRef<=0 ) continue;
5226     sqlite3DebugPrintf("PAGE %3d addr=%p nRef=%d\n",
5227        pPg->pgno, PGHDR_TO_DATA(pPg), pPg->nRef);
5228   }
5229 }
5230 #endif
5231 
5232 #endif /* SQLITE_OMIT_DISKIO */
5233